Merged

Changes from all commits
27 commits
08960f1
Add Task Success Evaluator V0
Aug 11, 2025
1f16681
Add samples for task success evaluator
Aug 12, 2025
d3c068e
Run black
Aug 14, 2025
c2519bb
Modify output format
Aug 17, 2025
25a7ef1
Modify output format in the examples
Aug 17, 2025
05d1520
Make Task Success a private preview evaluator
Sep 16, 2025
11de796
Merge remote-tracking branch 'upstream/main' into selshafey/task_succ…
Sep 16, 2025
bfd7cc4
Minor TaskSuccessEvaluator prompt update
Sep 16, 2025
3cb4adb
Fix path for importing Task Success Evaluator in samples
Sep 16, 2025
8599d28
Modify path for TaskSuccessEvaluator in eval mapping
Sep 16, 2025
281c93b
Remove sample notebook
Sep 17, 2025
6beafde
To retrigger build pipelines
Sep 17, 2025
91a15fb
Merge branch 'main' into selshafey/task_success_evaluator
Sep 17, 2025
c86d86c
Add credential to TaskSuccessEvaluator
Sep 17, 2025
06264be
Run Black
Sep 17, 2025
58ed291
Merge branch 'main' into selshafey/task_success_evaluator
Sep 18, 2025
6b2ccaa
To retrigger build pipeline
Sep 18, 2025
b5c65c6
Minor prompt modification
Sep 18, 2025
111a617
Change tool_definitions type in TaskSuccess prompt
Sep 18, 2025
0a61b6d
Mark model grader tests as skip
Sep 18, 2025
d030d4c
Remove task success evaluator from the samples notebook
Sep 18, 2025
dd73e37
Rename Task Success to Task Completion
Sep 30, 2025
1e23d5a
Minor definition modification
Sep 30, 2025
38fc2da
Minor rename
Sep 30, 2025
cc2c973
Merge branch 'main' into selshafey/task_success_evaluator
Sep 30, 2025
e0047b6
remove task_success
Sep 30, 2025
55ad29d
Fix merge issue
Oct 2, 2025
@@ -11,7 +11,7 @@

# Import all evals
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
from azure.ai.evaluation import (
BleuScoreEvaluator,
CodeVulnerabilityEvaluator,
@@ -68,7 +68,7 @@
SexualEvaluator: "sexual",
SimilarityEvaluator: "similarity",
TaskAdherenceEvaluator: "task_adherence",
TaskSuccessEvaluator: "task_success",
TaskCompletionEvaluator: "task_completion",
ToolCallAccuracyEvaluator: "tool_call_accuracy",
UngroundedAttributesEvaluator: "ungrounded_attributes",
ViolenceEvaluator: "violence",
@@ -2,6 +2,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._task_success import TaskSuccessEvaluator
from ._task_completion import TaskCompletionEvaluator

__all__ = ["TaskSuccessEvaluator"]
__all__ = ["TaskCompletionEvaluator"]
@@ -18,8 +18,8 @@


@experimental
class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
"""The Task Success evaluator determines whether an AI agent successfully completed the requested task based on:
class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
"""The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:

- Final outcome and deliverable of the task
- Completeness of task requirements
@@ -39,29 +39,29 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):

.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START task_success_evaluator]
:end-before: [END task_success_evaluator]
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a TaskSuccessEvaluator with a query and response.
:caption: Initialize and call a TaskCompletionEvaluator with a query and response.

.. admonition:: Example using Azure AI Project URL:

.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
:start-after: [START task_success_evaluator]
:end-before: [END task_success_evaluator]
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format
:caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

"""

_PROMPTY_FILE = "task_success.prompty"
_RESULT_KEY = "task_success"
_PROMPTY_FILE = "task_completion.prompty"
_RESULT_KEY = "task_completion"
_OPTIONAL_PARAMS = ["tool_definitions"]

id = "azureai://built-in/evaluators/task_success"
id = "azureai://built-in/evaluators/task_completion"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
@@ -84,19 +84,19 @@ def __call__(
response: Union[str, List[dict]],
tool_definitions: Optional[Union[dict, List[dict]]] = None,
) -> Dict[str, Union[str, bool]]:
"""Evaluate task success for a given query, response, and optionally tool definitions.
"""Evaluate task completion for a given query, response, and optionally tool definitions.
The query and response can be either a string or a list of messages.


Example with string inputs and no tools:
evaluator = TaskSuccessEvaluator(model_config)
evaluator = TaskCompletionEvaluator(model_config)
query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

result = evaluator(query=query, response=response)

Example with list of messages:
evaluator = TaskSuccessEvaluator(model_config)
evaluator = TaskCompletionEvaluator(model_config)
query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
@@ -109,7 +109,7 @@ def __call__(
:paramtype response: Union[str, List[dict]]
:keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
:paramtype tool_definitions: Optional[Union[dict, List[dict]]]
:return: A dictionary with the task success evaluation results.
:return: A dictionary with the task completion evaluation results.
:rtype: Dict[str, Union[str, bool]]
"""

@@ -128,7 +128,7 @@ def __call__( # pylint: disable=docstring-missing-param

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # type: ignore[override]
"""Do Task Success evaluation.
"""Do Task Completion evaluation.
:param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
:type eval_input: Dict
:return: The evaluation result.
@@ -138,11 +138,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
# which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
message=f"Both query and response must be provided as input to the Task Success evaluator.",
internal_message=f"Both query and response must be provided as input to the Task Success evaluator.",
message=f"Both query and response must be provided as input to the Task Completion evaluator.",
internal_message=f"Both query and response must be provided as input to the Task Completion evaluator.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TASK_SUCCESS_EVALUATOR,
target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
)
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
@@ -155,7 +155,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
if isinstance(success, str):
success = success.upper() == "TRUE"

success_result = "pass" if success == True else "fail"
success_result = "pass" if success else "fail"
reason = llm_output.get("explanation", "")
return {
f"{self._result_key}": success,
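Note: a minimal usage sketch of the renamed evaluator, assuming the string-input path shown in the docstring above. The endpoint, key, and deployment come from environment variables as in the samples later in this diff, and any result keys beyond "task_completion" are inferred from the partially shown return statement rather than confirmed by it.

import os
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

evaluator = TaskCompletionEvaluator(model_config=model_config)

# String inputs, mirroring the docstring example; message lists and
# tool_definitions are also accepted per the __call__ signature above.
result = evaluator(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

print(result["task_completion"])  # bool, keyed by _RESULT_KEY = "task_completion"
# A "pass"/"fail" entry and an explanation string follow the pattern visible in
# _do_eval, but their exact key names are cut off in this hunk.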
@@ -1,5 +1,5 @@
---
name: Task Success
name: Task Completion
description: Evaluates whether a task was successfully completed
model:
api: chat
@@ -27,7 +27,7 @@ You are an expert evaluator who determines if an agent has successfully complete
user:
ROLE
====
You are a judge on Task Success who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**
You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**

You are NOT evaluating:
- How well the agent followed instructions
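Note: the _do_eval hunks earlier in this diff parse the judge's JSON output and tolerate string booleans. A small sketch of that normalization; the "success" key name is an assumption, since only "explanation" is visible in the diff.

import json

# Hypothetical judge output; "explanation" is the key read by _do_eval above,
# "success" is an assumed name for the boolean field.
llm_output = json.loads('{"success": "TRUE", "explanation": "All requirements of the task were met."}')

success = llm_output.get("success")
if isinstance(success, str):
    # String booleans such as "TRUE"/"FALSE" are normalized, matching the evaluator.
    success = success.upper() == "TRUE"
success_result = "pass" if success else "fail"
print(success, success_result)  # True pass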
@@ -85,7 +85,7 @@ class ErrorTarget(Enum):
FLUENCY_EVALUATOR = "FluencyEvaluator"
RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
TASK_SUCCESS_EVALUATOR = "TaskSuccessEvaluator"
TASK_COMPLETION_EVALUATOR = "TaskCompletionEvaluator"
INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
4 changes: 2 additions & 2 deletions sdk/evaluation/azure-ai-evaluation/cspell.json
@@ -33,8 +33,8 @@
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty",
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_data_sources/grounding.json",
"sdk/evaluation/azure-ai-evaluation/samples/data/evaluate_test_data.jsonl",
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty",
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py"
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty",
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py"
],
"words": [
"Aoai",
@@ -422,17 +422,17 @@ def evaluation_evaluate_classes_methods(self):
task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_adherence_evaluator]

# [START task_success_evaluator]
# [START task_completion_evaluator]
import os
from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

model_config = {
"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
"api_key": os.environ.get("AZURE_OPENAI_KEY"),
"azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

query = [
{"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -499,8 +499,8 @@ def evaluation_evaluate_classes_methods(self):
}
]

task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_success_evaluator]
task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_completion_evaluator]

# [START indirect_attack_evaluator]
import os
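Note: beyond the one-off call in the sample above, the renamed evaluator could be wired into a batch run with azure.ai.evaluation.evaluate. A hedged sketch, assuming a JSONL file whose rows carry "query" and "response" columns; the file name is a placeholder and task_completion_evaluator is the instance constructed in the sample.

from azure.ai.evaluation import evaluate

results = evaluate(
    data="agent_runs.jsonl",  # placeholder path; each line needs "query" and "response"
    evaluators={"task_completion": task_completion_evaluator},
)
print(results["metrics"])  # aggregate metrics for the run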
@@ -432,17 +432,17 @@ def evaluation_evaluate_classes_methods(self):
task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_adherence_evaluator]

# [START task_success_evaluator]
# [START task_completion_evaluator]
import os
from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

model_config = {
"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"), # https://<account_name>.services.ai.azure.com
"api_key": os.environ.get("AZURE_OPENAI_KEY"),
"azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

query = [
{"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -509,8 +509,8 @@ def evaluation_evaluate_classes_methods(self):
}
]

task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_success_evaluator]
task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
# [END task_completion_evaluator]

# [START indirect_attack_evaluator]
import os
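Note: commit c86d86c adds a credential to the evaluator. A hedged sketch of how the project-endpoint sample above might pass it; the keyword name "credential" and the placeholder endpoint and deployment values are assumptions, not confirmed by the visible hunks.

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

task_completion_evaluator = TaskCompletionEvaluator(
    model_config={
        "azure_endpoint": "https://<account_name>.services.ai.azure.com",  # placeholder
        "azure_deployment": "<deployment_name>",  # placeholder
    },
    credential=DefaultAzureCredential(),  # keyword name assumed from commit c86d86c
)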