From 534e4d3e1be53075f4ec983ab8bf5db6cbab9937 Mon Sep 17 00:00:00 2001
From: Elon Demirok <elon.demirok@gmail.com>
Date: Sun, 11 May 2025 06:21:36 +0000
Subject: [PATCH 1/3] Fixes #1582: Handle structured CompletionUsage values
 when summing token usage

---
 evals/cli/oaieval.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index e48a09ac19..da6b6ce01f 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -265,6 +265,12 @@ def build_recorder(
         run_spec=run_spec,
     )
 
+def _extract_token_count(token_field):
+    if isinstance(token_field, int):
+        return token_field
+    if hasattr(token_field, 'total'):
+        return token_field.total
+    return 0  # safe default clearly stated
 
 def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None:
     """
@@ -274,16 +280,20 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) ->
     sampling_events = recorder.get_events("sampling")
     for event in sampling_events:
         if "usage" in event.data:
-            usage_events.append(dict(event.data["usage"]))
+            usage_events.append(event.data["usage"])
+
     logger.info(f"Found {len(usage_events)}/{len(sampling_events)} sampling events with usage data")
+
     if usage_events:
         # Sum up the usage of all samples (assumes the usage is the same for all samples)
         total_usage = {
-            key: sum(u[key] if u[key] is not None else 0 for u in usage_events)
-            for key in usage_events[0]
+            key: sum(_extract_token_count(getattr(u, key, 0)) for u in usage_events)
+            for key in ['completion_tokens', 'prompt_tokens', 'total_tokens']  # <--- Minimal explicit fix here
         }
+
         total_usage_str = "\n".join(f"{key}: {value:,}" for key, value in total_usage.items())
         logger.info(f"Token usage from {len(usage_events)} sampling events:\n{total_usage_str}")
+
         for key, value in total_usage.items():
             keyname = f"usage_{key}"
             if keyname not in result:
@@ -293,7 +303,6 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) ->
                     f"Usage key {keyname} already exists in result, not adding {keyname}"
                 )
 
-
 def main() -> None:
     parser = get_parser()
     args = cast(OaiEvalArguments, parser.parse_args(sys.argv[1:]))

From f51ea4631ac767ae50e700b4c41d31082b470020 Mon Sep 17 00:00:00 2001
From: Elon Demirok <elon.demirok@gmail.com>
Date: Sun, 11 May 2025 06:58:00 +0000
Subject: [PATCH 2/3] Adds pre-commit formatting changes that apply to the PR
 scope

---
 evals/cli/oaieval.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index da6b6ce01f..b5323843ae 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -227,7 +227,9 @@ def to_number(x: str) -> Union[int, float, str]:
     try:
         add_token_usage_to_result(result, recorder)
     except Exception as e:
-        logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.")
+        logger.error(
+            f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected."
+        )
     recorder.record_final_report(result)
 
     if not (args.dry_run or args.local_run):
@@ -265,13 +267,15 @@ def build_recorder(
         run_spec=run_spec,
     )
 
-def _extract_token_count(token_field):
+
+def _extract_token_count(token_field: Any) -> int:
     if isinstance(token_field, int):
         return token_field
-    if hasattr(token_field, 'total'):
+    if hasattr(token_field, "total"):
         return token_field.total
     return 0  # safe default clearly stated
 
+
 def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None:
     """
     Add token usage from logged sampling events to the result dictionary from the recorder.
@@ -288,7 +292,7 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) ->
         # Sum up the usage of all samples (assumes the usage is the same for all samples)
         total_usage = {
             key: sum(_extract_token_count(getattr(u, key, 0)) for u in usage_events)
-            for key in ['completion_tokens', 'prompt_tokens', 'total_tokens']  # <--- Minimal explicit fix here
+            for key in ["completion_tokens", "prompt_tokens", "total_tokens"]
         }
 
         total_usage_str = "\n".join(f"{key}: {value:,}" for key, value in total_usage.items())
@@ -303,6 +307,7 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) ->
                     f"Usage key {keyname} already exists in result, not adding {keyname}"
                 )
 
+
 def main() -> None:
     parser = get_parser()
     args = cast(OaiEvalArguments, parser.parse_args(sys.argv[1:]))

From 809156114ee32173786382e9892e0ab460dfbac4 Mon Sep 17 00:00:00 2001
From: Elon Demirok <elon.demirok@gmail.com>
Date: Sun, 11 May 2025 06:59:09 +0000
Subject: [PATCH 3/3] Remove extra empty line

---
 evals/cli/oaieval.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index b5323843ae..5eed95b08d 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -275,7 +275,6 @@ def _extract_token_count(token_field: Any) -> int:
         return token_field.total
     return 0  # safe default clearly stated
 
-
 def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None:
     """
     Add token usage from logged sampling events to the result dictionary from the recorder.