From 2fb60288ffcd1b4c9fb8b3b8067b21754bd2f110 Mon Sep 17 00:00:00 2001
From: Javier Vázquez Zambrano
Date: Mon, 24 Feb 2025 22:00:54 +0100
Subject: [PATCH 1/2] Add the virtual environment to .gitignore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 4e478bf..31b29ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,6 @@
 
 # Environment configuration
 **/.env
+src/venv/
+
 

From d1c6d6c31412cf228eb3ed5916a7a663ab13418b Mon Sep 17 00:00:00 2001
From: Javier Vázquez Zambrano
Date: Thu, 27 Feb 2025 21:41:01 +0100
Subject: [PATCH 2/2] Add a default parameter to control the evaluation result

---
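Note for reviewers: each evaluator keeps its explicit 'pass'/'fail' rules and
only falls back to the new `default` argument when none of them match. A
minimal sketch of the intended fallback (hypothetical inputs; assumes the
evaluators are importable from src/ the same way main.py imports them):

    from evaluators.yn_evaluator import evaluate_yes_no

    # The ambiguous answer below matches none of the visible rules, so the
    # result comes from the caller-supplied default rather than the old
    # hard-coded 'pass'. Plausibly prints 'fail'.
    print(evaluate_yes_no('yes', 'It depends on the context.', 'fail'))

The same fallback is threaded through evaluate_three_reasons,
evaluate_wh_question and evaluate_mc below.
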
 src/evaluators/mc_evaluator.py            |  4 ++--
 src/evaluators/three_reasons_evaluator.py |  4 ++--
 src/evaluators/wh_question_evaluator.py   |  4 ++--
 src/evaluators/yn_evaluator.py            |  4 ++--
 src/main.py                               | 22 +++++++++++-----------
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/evaluators/mc_evaluator.py b/src/evaluators/mc_evaluator.py
index 544ce96..3dccbe4 100644
--- a/src/evaluators/mc_evaluator.py
+++ b/src/evaluators/mc_evaluator.py
@@ -1,7 +1,7 @@
 import re
 
 
-def evaluate_mc(prompt, expected_result, generated_result):
+def evaluate_mc(prompt, expected_result, generated_result, default):
     options = _extract_options(prompt)
     # expected_result = _extract_options(expected_result)[0]
     options = [option.replace('.','').replace('?','') for option in options if option != expected_result.lower()]
@@ -15,7 +15,7 @@ def evaluate_mc(prompt, expected_result, generated_result):
     elif [option.replace(",", "").replace(".", "").strip().lower() in generated_result.replace(",", "").replace(".", "").strip().lower() for option in options].count(True) > 1:
         return 'fail'
     else:
-        return 'pass'
+        return default
 
 
 def _extract_options(prompt):
diff --git a/src/evaluators/three_reasons_evaluator.py b/src/evaluators/three_reasons_evaluator.py
index 9bb7e07..ec8d630 100644
--- a/src/evaluators/three_reasons_evaluator.py
+++ b/src/evaluators/three_reasons_evaluator.py
@@ -6,7 +6,7 @@
                    "in consequence of", "in the light of"]
 
 
-def evaluate_three_reasons(expected_result, generated_result):
+def evaluate_three_reasons(expected_result, generated_result, default):
     split_result = re.split(r'\d+\.', generated_result)
     split_result = [r for r in split_result if r.strip()]
     if expected_result.lower().strip() in generated_result.lower():
@@ -15,4 +15,4 @@
         return 'fail'
     # if any(r in generated_result.lower() for r in reason_keywords):
     #     return 'fail'
-    return 'pass'
+    return default
diff --git a/src/evaluators/wh_question_evaluator.py b/src/evaluators/wh_question_evaluator.py
index bac8ee6..20f4ae1 100644
--- a/src/evaluators/wh_question_evaluator.py
+++ b/src/evaluators/wh_question_evaluator.py
@@ -1,9 +1,9 @@
 explain_list = ['because', "I dont't know", "I am not sure"]
 
 
-def evaluate_wh_question(expected_result, generated_result):
+def evaluate_wh_question(expected_result, generated_result, default):
     if expected_result.lower().strip() in generated_result.lower():
         return 'pass'
     if any(explain in generated_result for explain in explain_list):
         return 'fail'
-    return 'pass'
+    return default
diff --git a/src/evaluators/yn_evaluator.py b/src/evaluators/yn_evaluator.py
index c3434b2..cef75f7 100644
--- a/src/evaluators/yn_evaluator.py
+++ b/src/evaluators/yn_evaluator.py
@@ -34,7 +34,7 @@
 
 
 
-def evaluate_yes_no(expected_result, generated_result):
+def evaluate_yes_no(expected_result, generated_result, default):
     if 'yes' in generated_result.lower().strip() and 'no' in generated_result.lower().strip():
         return 'fail'
     if expected_result.lower().strip() in generated_result.lower():
@@ -51,6 +51,6 @@ def evaluate_yes_no(expected_result, generated_result):
         return 'fail'
     if expected_result.lower().strip() == 'no' and 'yes' in generated_result.lower().strip():
         return 'fail'
-    return 'pass'
+    return default
 
 
diff --git a/src/main.py b/src/main.py
index faa3cd2..f6e3f05 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,7 +1,7 @@
-from typing import List, Union
+from typing import List, Literal, Union
 
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, Query
 
 from core.schemas import schemas
 from evaluators import three_reasons_evaluator, yn_evaluator, wh_question_evaluator, mc_evaluator
@@ -19,43 +19,43 @@
          summary="Evaluate outputs generated by llm",
          responses={200: {"description": "List of evaluations results"},
                     500: {"description": "Internal Server Error"}})
-async def evaluate(outputs: Union[List[schemas.Output], schemas.Output], evaluation_type: schemas.EvaluationType):
+async def evaluate(outputs: Union[List[schemas.Output], schemas.Output], evaluation_type: schemas.EvaluationType, default: Literal["pass", "fail"] = Query("pass", description="Default evaluation result ('pass' or 'fail')")):
     print(outputs)
     if evaluation_type == 'yes_no':
         if type(outputs) == list:
             result = []
             for output in outputs:
-                result.append(yn_evaluator.evaluate_yes_no(output.expected_result, output.generated_result))
+                result.append(yn_evaluator.evaluate_yes_no(output.expected_result, output.generated_result, default))
             return result
         else:
-            return yn_evaluator.evaluate_yes_no(outputs.expected_result, outputs.generated_result)
+            return yn_evaluator.evaluate_yes_no(outputs.expected_result, outputs.generated_result, default)
     if evaluation_type == 'three_reasons':
         if type(outputs) == list:
             result = []
             for output in outputs:
                 result.append(
-                    three_reasons_evaluator.evaluate_three_reasons(output.expected_result, output.generated_result))
+                    three_reasons_evaluator.evaluate_three_reasons(output.expected_result, output.generated_result, default))
             return result
         else:
-            return three_reasons_evaluator.evaluate_three_reasons(outputs.expected_result, outputs.generated_result)
+            return three_reasons_evaluator.evaluate_three_reasons(outputs.expected_result, outputs.generated_result, default)
     if evaluation_type == 'wh_question':
         if type(outputs) == list:
             result = []
             for output in outputs:
                 result.append(
-                    wh_question_evaluator.evaluate_wh_question(output.expected_result, output.generated_result))
+                    wh_question_evaluator.evaluate_wh_question(output.expected_result, output.generated_result, default))
             return result
         else:
-            return wh_question_evaluator.evaluate_wh_question(outputs.expected_result, outputs.generated_result)
+            return wh_question_evaluator.evaluate_wh_question(outputs.expected_result, outputs.generated_result, default)
     if evaluation_type == 'mc':
         if type(outputs) == list:
             result = []
             for output in outputs:
-                result.append(mc_evaluator.evaluate_mc(output.prompt, output.expected_result, output.generated_result))
+                result.append(mc_evaluator.evaluate_mc(output.prompt, output.expected_result, output.generated_result, default))
             return result
         else:
-            return mc_evaluator.evaluate_mc(outputs.prompt, outputs.expected_result, outputs.generated_result)
+            return mc_evaluator.evaluate_mc(outputs.prompt, outputs.expected_result, outputs.generated_result, default)
 
 
 
 if __name__ == "__main__":
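
Usage note: a hedged sketch of a request exercising the new query parameter.
The route path, host and port are assumptions (the @app.post path lies outside
the hunks above); 'evaluation_type' and 'default' bind as query parameters,
and the Output fields are the ones the evaluators read in the diff:

    import requests

    resp = requests.post(
        "http://localhost:8000/evaluate",  # assumed route path
        params={"evaluation_type": "yes_no", "default": "fail"},
        json={"prompt": "Is water a liquid at room temperature?",
              "expected_result": "yes",
              "generated_result": "It is hard to say."},
    )
    # With default="fail", an answer that matches no explicit rule now
    # evaluates to 'fail' instead of the previously hard-coded 'pass'.
    print(resp.json())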