diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py
index 461357ae3921..e77708057173 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py
@@ -11,7 +11,7 @@
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
-from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,
@@ -68,7 +68,7 @@
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
-    TaskSuccessEvaluator: "task_success",
+    TaskCompletionEvaluator: "task_completion",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/__init__.py
similarity index 66%
rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/__init__.py
rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/__init__.py
index a20c2d4a8f72..dc4d08e31a9b 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/__init__.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/__init__.py
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from ._task_success import TaskSuccessEvaluator
+from ._task_completion import TaskCompletionEvaluator

-__all__ = ["TaskSuccessEvaluator"]
+__all__ = ["TaskCompletionEvaluator"]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
similarity index 83%
rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py
rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
index 9449f4f3c57f..bc6b6fbfd53a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
@@ -18,8 +18,8 @@


 @experimental
-class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
-    """The Task Success evaluator determines whether an AI agent successfully completed the requested task based on:
+class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
+    """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:

     - Final outcome and deliverable of the task
     - Completeness of task requirements
@@ -39,29 +39,29 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
     .. admonition:: Example:

         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START task_success_evaluator]
-            :end-before: [END task_success_evaluator]
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a TaskSuccessEvaluator with a query and response.
+            :caption: Initialize and call a TaskCompletionEvaluator with a query and response.

     .. admonition:: Example using Azure AI Project URL:

         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START task_success_evaluator]
-            :end-before: [END task_success_evaluator]
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     """

-    _PROMPTY_FILE = "task_success.prompty"
-    _RESULT_KEY = "task_success"
+    _PROMPTY_FILE = "task_completion.prompty"
+    _RESULT_KEY = "task_completion"
     _OPTIONAL_PARAMS = ["tool_definitions"]

-    id = "azureai://built-in/evaluators/task_success"
+    id = "azureai://built-in/evaluators/task_completion"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
@@ -84,19 +84,19 @@ def __call__(
         self,
         *,
         query: Union[str, List[dict]],
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
     ) -> Dict[str, Union[str, bool]]:
-        """Evaluate task success for a given query, response, and optionally tool definitions.
+        """Evaluate task completion for a given query, response, and optionally tool definitions.
        The query and response can be either a string or a list of messages.
        Example with string inputs and no tools:
-            evaluator = TaskSuccessEvaluator(model_config)
+            evaluator = TaskCompletionEvaluator(model_config)
            query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
            response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."
            result = evaluator(query=query, response=response)

        Example with list of messages:
-            evaluator = TaskSuccessEvaluator(model_config)
+            evaluator = TaskCompletionEvaluator(model_config)
            query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
            tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
@@ -109,7 +109,7 @@ def __call__(
         :paramtype response: Union[str, List[dict]]
         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
-        :return: A dictionary with the task success evaluation results.
+        :return: A dictionary with the task completion evaluation results.
         :rtype: Dict[str, Union[str, bool]]
         """

@@ -128,7 +128,7 @@ def __call__( # pylint: disable=docstring-missing-param

     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]:  # type: ignore[override]
-        """Do Task Success evaluation.
+        """Do Task Completion evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict
         :return: The evaluation result.
@@ -138,11 +138,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         # which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input and "response" not in eval_input:
             raise EvaluationException(
-                message=f"Both query and response must be provided as input to the Task Success evaluator.",
-                internal_message=f"Both query and response must be provided as input to the Task Success evaluator.",
+                message=f"Both query and response must be provided as input to the Task Completion evaluator.",
+                internal_message=f"Both query and response must be provided as input to the Task Completion evaluator.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TASK_SUCCESS_EVALUATOR,
+                target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
             )
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
@@ -155,7 +155,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty

         if isinstance(success, str):
             success = success.upper() == "TRUE"
-        success_result = "pass" if success == True else "fail"
+        success_result = "pass" if success else "fail"
         reason = llm_output.get("explanation", "")
         return {
             f"{self._result_key}": success,
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty
similarity index 97%
rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty
rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty
index 526f71ebd90b..8af03918e3c3 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty
@@ -1,5 +1,5 @@
 ---
-name: Task Success
+name: Task Completion
 description: Evaluates whether a task was successfully completed
 model:
   api: chat
@@ -27,7 +27,7 @@ You are an expert evaluator who determines if an agent has successfully complete
 user:
 ROLE
 ====
-You are a judge on Task Success who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**
+You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**

 You are NOT evaluating:
 - How well the agent followed instructions
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py
index 102d0885272c..9b28686b9bf6 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py
@@ -85,7 +85,7 @@ class ErrorTarget(Enum):
     FLUENCY_EVALUATOR = "FluencyEvaluator"
     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
     TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
-    TASK_SUCCESS_EVALUATOR = "TaskSuccessEvaluator"
+    TASK_COMPLETION_EVALUATOR = "TaskCompletionEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
diff --git a/sdk/evaluation/azure-ai-evaluation/cspell.json b/sdk/evaluation/azure-ai-evaluation/cspell.json
index 0b63086faf8e..ebd62688b44b 100644
--- a/sdk/evaluation/azure-ai-evaluation/cspell.json
+++ b/sdk/evaluation/azure-ai-evaluation/cspell.json
@@ -33,8 +33,8 @@
         "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty",
         "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_data_sources/grounding.json",
         "sdk/evaluation/azure-ai-evaluation/samples/data/evaluate_test_data.jsonl",
-        "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty",
-        "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py"
+        "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty",
+        "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py"
     ],
     "words": [
         "Aoai",
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py
index 02b38bb50d9f..a6a0f3b6805d 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py
@@ -422,9 +422,9 @@ def evaluation_evaluate_classes_methods(self):
         task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
         # [END task_adherence_evaluator]

-        # [START task_success_evaluator]
+        # [START task_completion_evaluator]
         import os
-        from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+        from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
@@ -432,7 +432,7 @@ def evaluation_evaluate_classes_methods(self):
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
+        task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

         query = [
             {"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -499,8 +499,8 @@ def evaluation_evaluate_classes_methods(self):
             }
         ]

-        task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
-        # [END task_success_evaluator]
+        task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
+        # [END task_completion_evaluator]

         # [START indirect_attack_evaluator]
         import os
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py
index 505076821259..d6b023a581b1 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py
@@ -432,9 +432,9 @@ def evaluation_evaluate_classes_methods(self):
         task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
         # [END task_adherence_evaluator]

-        # [START task_success_evaluator]
+        # [START task_completion_evaluator]
         import os
-        from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+        from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),  # https://.services.ai.azure.com
@@ -442,7 +442,7 @@ def evaluation_evaluate_classes_methods(self):
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
+        task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

         query = [
             {"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -509,8 +509,8 @@ def evaluation_evaluate_classes_methods(self):
             }
         ]

-        task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
-        # [END task_success_evaluator]
+        task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
+        # [END task_completion_evaluator]

         # [START indirect_attack_evaluator]
         import os