Merged

Commits (27)
08960f1  Add Task Success Evaluator V0 (Aug 11, 2025)
1f16681  Add samples for task success evaluator (Aug 12, 2025)
d3c068e  Run black (Aug 14, 2025)
c2519bb  Modify output format (Aug 17, 2025)
25a7ef1  Modify output format in the examples (Aug 17, 2025)
05d1520  Make Task Success a private preview evaluator (Sep 16, 2025)
11de796  Merge remote-tracking branch 'upstream/main' into selshafey/task_succ… (Sep 16, 2025)
bfd7cc4  Minor TaskSuccessEvaluator prompt update (Sep 16, 2025)
3cb4adb  Fix path for importing Task Success Evaluator in samples (Sep 16, 2025)
8599d28  Modify path for TaskSuccessEvaluator in eval mapping (Sep 16, 2025)
281c93b  Remove sample notebook (Sep 17, 2025)
6beafde  To retrigger build pipelines (Sep 17, 2025)
91a15fb  Merge branch 'main' into selshafey/task_success_evaluator (Sep 17, 2025)
c86d86c  Add credential to TaskSuccessEvaluator (Sep 17, 2025)
06264be  Run Black (Sep 17, 2025)
58ed291  Merge branch 'main' into selshafey/task_success_evaluator (Sep 18, 2025)
6b2ccaa  To retrigger build pipeline (Sep 18, 2025)
b5c65c6  Minor prompt modification (Sep 18, 2025)
111a617  Change tool_definitions type in TaskSuccess prompt (Sep 18, 2025)
0a61b6d  Mark model grader tests as skip (Sep 18, 2025)
d030d4c  Remove task success evaluator from the samples notebook (Sep 18, 2025)
dd73e37  Rename Task Success to Task Completion (Sep 30, 2025)
1e23d5a  Minor definition modification (Sep 30, 2025)
38fc2da  Minor rename (Sep 30, 2025)
cc2c973  Merge branch 'main' into selshafey/task_success_evaluator (Sep 30, 2025)
e0047b6  remove task_success (Sep 30, 2025)
55ad29d  Fix merge issue (Oct 2, 2025)
@@ -11,6 +11,7 @@

# Import all evals
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
from azure.ai.evaluation import (
BleuScoreEvaluator,
CodeVulnerabilityEvaluator,
@@ -67,6 +68,7 @@
SexualEvaluator: "sexual",
SimilarityEvaluator: "similarity",
TaskAdherenceEvaluator: "task_adherence",
TaskCompletionEvaluator: "task_completion",
ToolCallAccuracyEvaluator: "tool_call_accuracy",
UngroundedAttributesEvaluator: "ungrounded_attributes",
ViolenceEvaluator: "violence",
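For context, the mapping entry above registers the new evaluator class under the metric name "task_completion". A minimal sketch of how such a class-to-name mapping is typically consumed is shown below; the dictionary name EVAL_CLASS_MAP and the resolve_metric_name helper are illustrative assumptions, since the actual mapping variable is not visible in this diff.

# Illustrative sketch only; EVAL_CLASS_MAP and resolve_metric_name are assumed
# names, not identifiers taken from this PR.
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

EVAL_CLASS_MAP = {
    TaskCompletionEvaluator: "task_completion",
    # ... other evaluator classes map to their metric names ...
}

def resolve_metric_name(evaluator_cls) -> str:
    """Look up the metric name registered for an evaluator class."""
    return EVAL_CLASS_MAP[evaluator_cls]

print(resolve_metric_name(TaskCompletionEvaluator))  # -> "task_completion"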
@@ -0,0 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._task_completion import TaskCompletionEvaluator

__all__ = ["TaskCompletionEvaluator"]
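The new __init__.py simply re-exports the evaluator, so the package-level import used in the eval mapping change above resolves to the class defined in the private module that follows. A small, purely illustrative check (assuming the package path implied by that import):

# Both paths refer to the same class; the re-export in __init__.py makes the
# shorter import (used in the eval mapping change above) work.
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator as via_package
from azure.ai.evaluation._evaluators._task_completion._task_completion import (
    TaskCompletionEvaluator as via_module,
)

assert via_package is via_module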
@@ -0,0 +1,168 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import math
import logging
from typing import Dict, Union, List, Optional

from typing_extensions import overload, override

from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
from azure.ai.evaluation._model_configurations import Message
from azure.ai.evaluation._common._experimental import experimental

logger = logging.getLogger(__name__)


@experimental
class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
"""The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:

- Final outcome and deliverable of the task
- Completeness of task requirements

This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.

Scoring is binary:
    - TRUE: Task fully completed with a usable deliverable that meets all user requirements
    - FALSE: Task incomplete, partially completed, or the deliverable does not meet requirements

    The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]

.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a TaskCompletionEvaluator with a query and response.

.. admonition:: Example using Azure AI Project URL:

.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

"""

_PROMPTY_FILE = "task_completion.prompty"
_RESULT_KEY = "task_completion"
_OPTIONAL_PARAMS = ["tool_definitions"]

id = "azureai://built-in/evaluators/task_completion"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(self, model_config, *, credential=None, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
super().__init__(
model_config=model_config,
prompty_file=prompty_path,
result_key=self._RESULT_KEY,
credential=credential,
**kwargs,
)

@overload
def __call__(
self,
*,
query: Union[str, List[dict]],
response: Union[str, List[dict]],
tool_definitions: Optional[Union[dict, List[dict]]] = None,
    ) -> Dict[str, Union[str, bool]]:
        """Evaluate task completion for a given query, response, and optionally tool definitions.
The query and response can be either a string or a list of messages.


Example with string inputs and no tools:
evaluator = TaskCompletionEvaluator(model_config)
query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

result = evaluator(query=query, response=response)

Example with list of messages:
evaluator = TaskCompletionEvaluator(model_config)
query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]

result = evaluator(query=query, response=response, tool_definitions=tool_definitions)

:keyword query: The query being evaluated, either a string or a list of messages.
:paramtype query: Union[str, List[dict]]
        :keyword response: The response being evaluated, either a string or a list of messages (the full agent response, potentially including tool calls).
        :paramtype response: Union[str, List[dict]]
        :keyword tool_definitions: An optional tool definition or list of tool definitions the agent is aware of.
        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
        :return: A dictionary with the task completion evaluation results.
        :rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Invokes the instance using the overloaded __call__ signature.

For detailed parameter types and return value documentation, see the overloaded __call__ definition.
"""
return super().__call__(*args, **kwargs)

@override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]:  # type: ignore[override]
        """Do Task Completion evaluation.

        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict
        """
# we override the _do_eval method as we want the output to be a dictionary,
        # which is a different schema than _base_prompty_eval.py
        if "query" not in eval_input or "response" not in eval_input:
            raise EvaluationException(
                message="Both query and response must be provided as input to the Task Completion evaluator.",
                internal_message="Both query and response must be provided as input to the Task Completion evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.MISSING_FIELD,
                target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
            )
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
if isinstance(llm_output, dict):
success = llm_output.get("success", False)
if isinstance(success, str):
success = success.upper() == "TRUE"

            success_result = "pass" if success else "fail"
reason = llm_output.get("explanation", "")
return {
f"{self._result_key}": success,
f"{self._result_key}_result": success_result,
f"{self._result_key}_reason": reason,
f"{self._result_key}_details": llm_output.get("details", ""),
}
if logger:
            logger.warning("LLM output is not a dictionary; returning False for success.")
return {self._result_key: False}
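As a quick sanity check of the new evaluator, a minimal usage sketch follows. The model configuration values are placeholders (any valid AzureOpenAIModelConfiguration or OpenAIModelConfiguration should work), the query/response strings are taken from the docstring example above, and the expected output keys follow the _RESULT_KEY naming used in _do_eval.

# Minimal usage sketch; endpoint, deployment, and key values are placeholders.
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = TaskCompletionEvaluator(model_config)
result = evaluator(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

# Expected keys, per _do_eval:
#   task_completion         -> bool success flag
#   task_completion_result  -> "pass" or "fail"
#   task_completion_reason  -> explanation from the LLM output
#   task_completion_details -> optional details from the LLM output
print(result["task_completion_result"], result["task_completion_reason"])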