82 commits
4318329
Prepare evals SDK Release
May 28, 2025
192b980
Fix bug
May 28, 2025
758adb4
Fix for ADV_CONV for FDP projects
May 29, 2025
de09fd1
Update release date
May 29, 2025
ef60fe6
Merge branch 'main' into main
nagkumar91 May 29, 2025
8ca51d0
Merge branch 'Azure:main' into main
nagkumar91 May 30, 2025
98bfc3a
Merge branch 'Azure:main' into main
nagkumar91 Jun 2, 2025
a5f32e8
Merge branch 'Azure:main' into main
nagkumar91 Jun 9, 2025
5fd88b6
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
51f2b44
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
a5be8b5
Merge branch 'Azure:main' into main
nagkumar91 Jun 16, 2025
75965b7
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
d0c5e53
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
b790276
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
d5ca243
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
8d62e36
re-add pyrit to matrix
Jun 26, 2025
59a70f2
Change grader ids
Jun 26, 2025
4d146d7
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
f7a4c83
Update unit test
Jun 27, 2025
79e3a40
replace all old grader IDs in tests
Jun 27, 2025
588cbec
Merge branch 'main' into main
nagkumar91 Jun 30, 2025
7514472
Update platform-matrix.json
nagkumar91 Jun 30, 2025
28b2513
Update test to ensure everything is mocked
Jul 1, 2025
8603e0e
tox/black fixes
Jul 1, 2025
895f226
Skip that test with issues
Jul 1, 2025
b4b2daf
Merge branch 'Azure:main' into main
nagkumar91 Jul 1, 2025
023f07f
update grader ID according to API View feedback
Jul 1, 2025
45b5f5d
Update test
Jul 2, 2025
1ccb4db
remove string check for grader ID
Jul 2, 2025
6fd9aa5
Merge branch 'Azure:main' into main
nagkumar91 Jul 2, 2025
f871855
Update changelog and officially start freeze
Jul 2, 2025
59ac230
update the enum according to suggestions
Jul 2, 2025
794a2c4
update the changelog
Jul 2, 2025
b33363c
Finalize logic
Jul 2, 2025
464e2dd
Merge branch 'Azure:main' into main
nagkumar91 Jul 3, 2025
4585b14
Merge branch 'Azure:main' into main
nagkumar91 Jul 7, 2025
89c2988
Initial plan
Copilot Jul 7, 2025
6805018
Fix client request ID headers in azure-ai-evaluation
Copilot Jul 7, 2025
aad48df
Fix client request ID header format in rai_service.py
Copilot Jul 7, 2025
db75552
Merge pull request #5 from nagkumar91/copilot/fix-4
nagkumar91 Jul 10, 2025
b8eebf3
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
2899ad4
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
c431563
Merge branch 'Azure:main' into main
nagkumar91 Jul 17, 2025
79ed63c
Merge branch 'Azure:main' into main
nagkumar91 Jul 18, 2025
a3be3fc
Merge branch 'Azure:main' into main
nagkumar91 Jul 21, 2025
056ac4d
Passing threshold in AzureOpenAIScoreModelGrader
Jul 21, 2025
1779059
Add changelog
Jul 21, 2025
43fecff
Adding the self.pass_threshold instead of pass_threshold
Jul 21, 2025
b0c102b
Merge branch 'Azure:main' into main
nagkumar91 Jul 22, 2025
7bf5f1f
Add the python grader
Jul 22, 2025
3248ad0
Remove redundant test
Jul 22, 2025
d76f59b
Add class to exception list and format code
Jul 23, 2025
4d60e43
Merge branch 'main' into feature/python_grader
nagkumar91 Jul 24, 2025
98d1626
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
9248c38
Add properties to evaluation upload run for FDP
Jul 24, 2025
74b760f
Remove debug
Jul 24, 2025
23dbc85
Merge branch 'feature/python_grader'
Jul 24, 2025
467ccb6
Remove the redundant property
Jul 24, 2025
c2beee8
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
be9a19a
Fix changelog
Jul 24, 2025
de3a1e1
Fix the multiple features added section
Jul 24, 2025
f9faa61
removed the properties in update
Jul 24, 2025
69e783a
Merge branch 'Azure:main' into main
nagkumar91 Jul 28, 2025
8ebea2a
Merge branch 'Azure:main' into main
nagkumar91 Jul 31, 2025
3f9c818
Merge branch 'Azure:main' into main
nagkumar91 Aug 1, 2025
3b3159c
Merge branch 'Azure:main' into main
nagkumar91 Aug 5, 2025
d78b834
Merge branch 'Azure:main' into main
nagkumar91 Aug 6, 2025
ae3fc52
Merge branch 'Azure:main' into main
nagkumar91 Aug 8, 2025
19cce75
evaluation: support is_reasoning_model across all prompty-based evalu…
Aug 8, 2025
e59ca7f
evaluation: docs(Preview) + groundedness feature-detection + is_reaso…
Aug 8, 2025
98b4618
evaluation: revert _proxy_completion_model.py to origin/main version
Aug 8, 2025
706c042
Merge branch 'Azure:main' into main
nagkumar91 Aug 11, 2025
c418513
Merge remote-tracking branch 'origin/main' into diff-20250811-171736
Aug 12, 2025
86f24ba
Restore files that shouldn't have been modified
Aug 12, 2025
a1e55b4
Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evalua…
nagkumar91 Aug 12, 2025
bd6809f
Update the groundedness based on comments
Aug 12, 2025
3ae37cb
Add changelog to bug fix and link issue
Aug 12, 2025
6b8d4ce
Fix docstring
Aug 12, 2025
733ee1a
lint fixes
Aug 12, 2025
8f39719
Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evalua…
nagkumar91 Aug 19, 2025
3cebd64
Support for reasoning models using the right client
Aug 20, 2025
d23fd3c
Formatting
Aug 21, 2025
8 changes: 8 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -5,11 +5,19 @@
### Breaking Changes

### Features Added

- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
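For example, the new `tags` parameter can be passed directly to `evaluate` (a minimal sketch; the endpoint, data file, and tag values are illustrative):

```python
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    CoherenceEvaluator,
    evaluate,
)

# Placeholder model configuration -- values are illustrative.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<deployment>",
    api_key="<key>",
)

result = evaluate(
    data="eval_data.jsonl",  # rows with "query" and "response" columns
    evaluators={"coherence": CoherenceEvaluator(model_config)},
    tags={"experiment": "prompt_v2", "team": "search"},  # free-form key-value pairs
)
```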

### Bugs Fixed

- [Bug](https://github.com/Azure/azure-sdk-for-python/issues/39909): Added `is_reasoning_model` keyword parameter to all evaluators
(`SimilarityEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`,
`RetrievalEvaluator`, `GroundednessEvaluator`, `IntentResolutionEvaluator`,
`ResponseCompletenessEvaluator`, `TaskAdherenceEvaluator`, `ToolCallAccuracyEvaluator`).
When set, evaluator configuration is adjusted appropriately for reasoning models.
`QAEvaluator` now propagates this parameter to its child evaluators.
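For instance, a prompty-based evaluator can be pointed at a reasoning-model deployment like this (a sketch; the configuration values are placeholders):

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

# Placeholder configuration assumed to target a reasoning-model deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<reasoning-deployment>",
    api_key="<key>",
)

relevance = RelevanceEvaluator(model_config, is_reasoning_model=True)
result = relevance(
    query="What does HTTP status 404 mean?",
    response="The requested resource was not found on the server.",
)
```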

### Other Changes

## 1.10.0 (2025-07-31)
@@ -178,8 +178,6 @@ def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
run = self._get_result(client_run)
try:
aggregated_metrics = run.get_aggregated_metrics()
print("Aggregated metrics")
print(aggregated_metrics)
except Exception as ex: # pylint: disable=broad-exception-caught
LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
return {}
@@ -1028,36 +1028,36 @@ def _preprocess_data(
batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
"""Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
_use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
_use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
"""Pick batch client from kwargs: _use_run_submitter_client and _use_pf_client."""
_use_run = cast(Optional[bool], evaluate_kwargs.pop("_use_run_submitter_client", None))
_use_pf = cast(Optional[bool], evaluate_kwargs.pop("_use_pf_client", None))

if _use_run_submitter_client is None and _use_pf_client is None:
# If both are unset, return default
if _use_run is None and _use_pf is None:
return "run_submitter"

if _use_run_submitter_client and _use_pf_client:
if _use_run and _use_pf:
raise EvaluationException(
message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
message=("Only one of _use_pf_client and _use_run_submitter_client " "should be set to True."),
target=ErrorTarget.EVALUATE,
category=ErrorCategory.INVALID_VALUE,
blame=ErrorBlame.USER_ERROR,
)

if _use_run_submitter_client == False and _use_pf_client == False:
if _use_run is False and _use_pf is False:
return "code_client"

if _use_run_submitter_client:
if _use_run:
return "run_submitter"
if _use_pf_client:
if _use_pf:
return "pf_client"

if _use_run_submitter_client is None and _use_pf_client == False:
if _use_run is None and _use_pf is False:
return "run_submitter"
if _use_run_submitter_client == False and _use_pf_client is None:
if _use_run is False and _use_pf is None:
return "pf_client"

assert False, "This should be impossible"
# Defensive default
return "run_submitter"
Contributor: Should we let it fail like before?


client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
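Distilled, the selection above behaves like the following sketch (`pick_client` is a hypothetical helper used only to summarize the decision table):

```python
from typing import Optional


def pick_client(use_run_submitter: Optional[bool], use_pf: Optional[bool]) -> str:
    """Summarize get_client_type's decision table."""
    if use_run_submitter and use_pf:
        raise ValueError("Only one of the two flags may be True.")
    if use_run_submitter:
        return "run_submitter"
    if use_pf:
        return "pf_client"
    if use_run_submitter is False and use_pf is False:
        return "code_client"
    if use_run_submitter is False and use_pf is None:
        return "pf_client"
    # (None, None) and (None, False) fall back to the default client.
    return "run_submitter"


assert pick_client(None, None) == "run_submitter"
assert pick_client(False, False) == "code_client"
assert pick_client(False, None) == "pf_client"
assert pick_client(None, False) == "run_submitter"
```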

@@ -12,17 +12,22 @@

class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""
Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
Evaluates coherence for a given query and response or a multi-turn
conversation, including reasoning.

The coherence measure assesses the ability of the language model to generate text that reads naturally,
flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
and user-friendliness of a model's generated responses in real-world applications.
The coherence measure assesses the model's ability to generate text that
reads naturally, flows smoothly, and resembles human-like language. Use it
when assessing the readability and user-friendliness of responses.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
:type model_config:
Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]
:param threshold: The threshold for the coherence evaluator. Default is 3.
:type threshold: int
:keyword is_reasoning_model: (Preview) If True, the chat completions
configuration is adjusted for reasoning models.
:type is_reasoning_model: bool

.. admonition:: Example:

@@ -31,7 +36,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
:caption: Initialize and call CoherenceEvaluator using
azure.ai.evaluation.AzureAIProject

.. admonition:: Example using Azure AI Project URL:

@@ -40,7 +46,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
:caption: Initialize and call CoherenceEvaluator using an Azure AI
Project URL in the following format:
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

.. admonition:: Example with Threshold:
@@ -50,23 +57,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END threshold_coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
:caption: Initialize with threshold and call a CoherenceEvaluator
with a query and response.

.. note::

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
To align with support of diverse models, an output key without the
`gpt_` prefix has been added. The old key with the `gpt_` prefix is
still present for compatibility; however, it will be deprecated.
"""

_PROMPTY_FILE = "coherence.prompty"
_RESULT_KEY = "coherence"

id = "azureai://built-in/evaluators/coherence"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
"""Evaluator identifier, experimental to be used only with cloud evaluation"""

@override
def __init__(self, model_config, *, threshold=3):
def __init__(self, model_config, *, threshold=3, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
self._threshold = threshold
@@ -77,6 +85,7 @@ def __init__(self, model_config, *, threshold=3):
result_key=self._RESULT_KEY,
threshold=threshold,
_higher_is_better=self._higher_is_better,
**kwargs,
)

@overload
@@ -104,9 +113,11 @@ def __call__(
) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
"""Evaluate coherence for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:keyword conversation: The conversation to evaluate. Expected to
contain a list of conversation turns under the key "messages",
and optionally a global context under the key "context". Turns are
dictionaries with keys "content", "role", and possibly
"context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The coherence score.
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
@@ -118,19 +129,22 @@ def __call__( # pylint: disable=docstring-missing-param
*args,
**kwargs,
):
"""Evaluate coherence. Accepts either a query and response for a single evaluation,
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.
"""Evaluate coherence.

Accepts a query/response for a single evaluation, or a conversation
for a multi-turn evaluation. If the conversation has more than one
pair of turns, results are aggregated.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:keyword conversation: The conversation to evaluate. Expected to
contain conversation turns under the key "messages" as
dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str,
List[float]]]]]
"""
return super().__call__(*args, **kwargs)
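A minimal usage sketch for the updated evaluator; the endpoint, deployment, and inputs are placeholders:

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<deployment>",
    api_key="<key>",
)

# is_reasoning_model is forwarded through **kwargs to PromptyEvaluatorBase.
coherence = CoherenceEvaluator(model_config, threshold=3, is_reasoning_model=True)

# Single query/response evaluation.
single = coherence(
    query="Briefly explain the CAP theorem.",
    response="It describes the trade-off between consistency, availability, and partition tolerance.",
)
print(single["coherence"])

# Multi-turn conversation; per-turn results are aggregated.
conversation = {
    "messages": [
        {"role": "user", "content": "What is azure-ai-evaluation?"},
        {"role": "assistant", "content": "A library for scoring model and agent outputs."},
    ]
}
print(coherence(conversation=conversation))
```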
@@ -4,7 +4,9 @@
from concurrent.futures import as_completed
from typing import TypeVar, Dict, List

from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from azure.ai.evaluation._legacy._adapters.tracing import (
ThreadPoolExecutorWithContext as ThreadPoolExecutor,
)
from typing_extensions import override

from azure.ai.evaluation._evaluators._common import EvaluatorBase
@@ -3,22 +3,35 @@
# ---------------------------------------------------------

import math
import re
import os
import re
from typing import Dict, TypeVar, Union

if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
from promptflow.core._flow import AsyncPrompty
else:
from azure.ai.evaluation._legacy.prompty import AsyncPrompty
from azure.ai.evaluation._legacy.prompty import (
AsyncPrompty as _LegacyAsyncPrompty,
)
from typing_extensions import override

from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
from azure.ai.evaluation._common.constants import (
PROMPT_BASED_REASON_EVALUATORS,
)
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
from azure.ai.evaluation._exceptions import (
EvaluationException,
ErrorBlame,
ErrorCategory,
ErrorTarget,
)
from ..._common.utils import (
construct_prompty_model_config,
validate_model_config,
parse_quality_evaluator_reason_score,
)
from . import EvaluatorBase

_PFAsyncPrompty = None # type: ignore[assignment]


try:
from ..._user_agent import UserAgentSingleton
except ImportError:
@@ -71,7 +84,11 @@ def __init__(
self._prompty_file = prompty_file
self._threshold = threshold
self._higher_is_better = _higher_is_better
super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
super().__init__(
eval_last_turn=eval_last_turn,
threshold=threshold,
_higher_is_better=_higher_is_better,
)

subclass_name = self.__class__.__name__
user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
@@ -80,9 +97,28 @@ def __init__(
self._DEFAULT_OPEN_API_VERSION,
user_agent,
)

self._flow = AsyncPrompty.load(
source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
# Choose backend: force legacy prompty for reasoning models so
# parameter adaptation applies
use_pf = os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true"
if self._is_reasoning_model:
AsyncPromptyClass = _LegacyAsyncPrompty
else:
if use_pf and _PFAsyncPrompty is None:
try:
from promptflow.core._flow import ( # type: ignore
AsyncPrompty as _PFAsyncPrompty_import,
)

# assign to module-level for reuse
globals()["_PFAsyncPrompty"] = _PFAsyncPrompty_import
except Exception: # pragma: no cover - PF not available
globals()["_PFAsyncPrompty"] = None
AsyncPromptyClass = _PFAsyncPrompty if (use_pf and _PFAsyncPrompty is not None) else _LegacyAsyncPrompty

self._flow = AsyncPromptyClass.load( # type: ignore[call-arg]
source=self._prompty_file,
model=prompty_model_config,
is_reasoning_model=self._is_reasoning_model,
)

# __call__ not overridden here because child classes have such varied signatures that there's no point
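The prompty backend can also be toggled globally through the environment variable read above (a sketch; `AI_EVALS_USE_PF_PROMPTY` is an internal switch, and reasoning models still force the legacy backend per the logic above):

```python
import os

# Must be set before the evaluator modules are imported, because the flag
# is read at import time.
os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

from azure.ai.evaluation import FluencyEvaluator  # noqa: E402
```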