2 changes: 2 additions & 0 deletions .gitignore
@@ -11,4 +11,6 @@

# Environment configuration
**/.env
+src/venv/


4 changes: 2 additions & 2 deletions src/evaluators/mc_evaluator.py
@@ -1,7 +1,7 @@
import re


-def evaluate_mc(prompt, expected_result, generated_result):
+def evaluate_mc(prompt, expected_result, generated_result, default):
options = _extract_options(prompt)
# expected_result = _extract_options(expected_result)[0]
options = [option.replace('.','').replace('?','') for option in options if option != expected_result.lower()]
@@ -15,7 +15,7 @@ def evaluate_mc(prompt, expected_result, generated_result):
elif [option.replace(",", "").replace(".", "").strip().lower() in generated_result.replace(",", "").replace(".", "").strip().lower() for option in options].count(True) > 1:
return 'fail'
else:
-return 'pass'
+return default


def _extract_options(prompt):
4 changes: 2 additions & 2 deletions src/evaluators/three_reasons_evaluator.py
@@ -6,7 +6,7 @@
"in consequence of", "in the light of"]


-def evaluate_three_reasons(expected_result, generated_result):
+def evaluate_three_reasons(expected_result, generated_result, default):
split_result = re.split(r'\d+\.', generated_result)
split_result = [r for r in split_result if r.strip()]
if expected_result.lower().strip() in generated_result.lower():
@@ -15,4 +15,4 @@ def evaluate_three_reasons(expected_result, generated_result):
return 'fail'
# if any(r in generated_result.lower() for r in reason_keywords):
# return 'fail'
-return 'pass'
+return default
4 changes: 2 additions & 2 deletions src/evaluators/wh_question_evaluator.py
@@ -1,9 +1,9 @@
explain_list = ['because', "I dont't know", "I am not sure"]


-def evaluate_wh_question(expected_result, generated_result):
+def evaluate_wh_question(expected_result, generated_result, default):
if expected_result.lower().strip() in generated_result.lower():
return 'pass'
if any(explain in generated_result for explain in explain_list):
return 'fail'
-return 'pass'
+return default
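
Since this evaluator is shown in full in the hunk above, a minimal sketch of what the new `default` argument changes: an answer that neither contains the expected result nor any "explain" phrase now returns whatever the caller supplies instead of a hard-coded `'pass'`. The import path assumes the `src/` layout that `main.py` already uses.

```python
# Minimal sketch, run from src/ so the evaluators package is importable.
from evaluators import wh_question_evaluator

# Expected answer absent and no "explain" phrase detected, so the result
# is now whatever the caller passes as `default`.
print(wh_question_evaluator.evaluate_wh_question("Paris", "It might be Berlin.", "fail"))  # fail
print(wh_question_evaluator.evaluate_wh_question("Paris", "It might be Berlin.", "pass"))  # pass
```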
4 changes: 2 additions & 2 deletions src/evaluators/yn_evaluator.py
@@ -34,7 +34,7 @@



-def evaluate_yes_no(expected_result, generated_result):
+def evaluate_yes_no(expected_result, generated_result, default):
if 'yes' in generated_result.lower().strip() and 'no' in generated_result.lower().strip():
return 'fail'
if expected_result.lower().strip() in generated_result.lower():
@@ -51,6 +51,6 @@ def evaluate_yes_no(expected_result, generated_result):
return 'fail'
if expected_result.lower().strip() == 'no' and 'yes' in generated_result.lower().strip():
return 'fail'
-return 'pass'
+return default
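
A hedged sketch of the same fallback for the yes/no evaluator. Part of `evaluate_yes_no` (roughly file lines 40-50) is not visible in this diff, so the example assumes none of those hidden checks fire for an answer with no clear yes or no.

```python
# Minimal sketch, assuming the src/ layout used by main.py and that no
# hidden rule (lines not shown in this diff) matches this answer.
from evaluators import yn_evaluator

# Previously this fell through to a hard-coded 'pass'; now the caller decides.
print(yn_evaluator.evaluate_yes_no("yes", "Hard to say.", "fail"))  # expected: 'fail'
```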


22 changes: 11 additions & 11 deletions src/main.py
@@ -1,7 +1,7 @@
-from typing import List, Union
+from typing import List, Literal, Union

import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, Query

from core.schemas import schemas
from evaluators import three_reasons_evaluator, yn_evaluator, wh_question_evaluator, mc_evaluator
@@ -19,43 +19,43 @@
summary="Evaluate outputs generated by llm",
responses={200: {"description": "List of evaluations results"},
500: {"description": "Internal Server Error"}})
-async def evaluate(outputs: Union[List[schemas.Output], schemas.Output], evaluation_type: schemas.EvaluationType):
+async def evaluate(outputs: Union[List[schemas.Output], schemas.Output], evaluation_type: schemas.EvaluationType, default: Literal["pass", "fail"] = Query("pass", description="Default evaluation result ('pass' or 'fail')")):
print(outputs)
if evaluation_type == 'yes_no':
if type(outputs) == list:
result = []
for output in outputs:
-result.append(yn_evaluator.evaluate_yes_no(output.expected_result, output.generated_result))
+result.append(yn_evaluator.evaluate_yes_no(output.expected_result, output.generated_result, default))
return result
else:
-return yn_evaluator.evaluate_yes_no(outputs.expected_result, outputs.generated_result)
+return yn_evaluator.evaluate_yes_no(outputs.expected_result, outputs.generated_result, default)

if evaluation_type == 'three_reasons':
if type(outputs) == list:
result = []
for output in outputs:
result.append(
-three_reasons_evaluator.evaluate_three_reasons(output.expected_result, output.generated_result))
+three_reasons_evaluator.evaluate_three_reasons(output.expected_result, output.generated_result, default))
return result
else:
-return three_reasons_evaluator.evaluate_three_reasons(outputs.expected_result, outputs.generated_result)
+return three_reasons_evaluator.evaluate_three_reasons(outputs.expected_result, outputs.generated_result, default)
if evaluation_type == 'wh_question':
if type(outputs) == list:
result = []
for output in outputs:
result.append(
-wh_question_evaluator.evaluate_wh_question(output.expected_result, output.generated_result))
+wh_question_evaluator.evaluate_wh_question(output.expected_result, output.generated_result, default))
return result
else:
-return wh_question_evaluator.evaluate_wh_question(outputs.expected_result, outputs.generated_result)
+return wh_question_evaluator.evaluate_wh_question(outputs.expected_result, outputs.generated_result, default)
if evaluation_type == 'mc':
if type(outputs) == list:
result = []
for output in outputs:
-result.append(mc_evaluator.evaluate_mc(output.prompt, output.expected_result, output.generated_result))
+result.append(mc_evaluator.evaluate_mc(output.prompt, output.expected_result, output.generated_result, default))
return result
else:
-return mc_evaluator.evaluate_mc(outputs.prompt, outputs.expected_result, outputs.generated_result)
+return mc_evaluator.evaluate_mc(outputs.prompt, outputs.expected_result, outputs.generated_result, default)


if __name__ == "__main__":
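
For completeness, a hedged sketch of how a client might exercise the new `default` query parameter on this endpoint. The route path (`/evaluate`), the HTTP method, and the exact request body shape are assumptions; they are defined above the hunk shown here and are not part of this diff. Only the `default` parameter and the field names come from the changes above.

```python
# Hypothetical client call: "/evaluate", POST, and the body layout are
# assumptions (not visible in this diff); `default` rides in the query string
# per the new Query(...) parameter.
import requests

payload = {
    "outputs": [
        {"prompt": "", "expected_result": "Paris", "generated_result": "It might be Berlin."}
    ],
    "evaluation_type": "wh_question",
}

# Without the new parameter the fallback is "pass"; here the caller asks for "fail".
resp = requests.post("http://localhost:8000/evaluate", params={"default": "fail"}, json=payload)
print(resp.json())  # e.g. ["fail"]
```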