From 534e4d3e1be53075f4ec983ab8bf5db6cbab9937 Mon Sep 17 00:00:00 2001 From: Elon Demirok Date: Sun, 11 May 2025 06:21:36 +0000 Subject: [PATCH 1/3] Fixes #1582: Handling of structured CompletionUsage response values for token usage --- evals/cli/oaieval.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py index e48a09ac19..da6b6ce01f 100644 --- a/evals/cli/oaieval.py +++ b/evals/cli/oaieval.py @@ -265,6 +265,12 @@ def build_recorder( run_spec=run_spec, ) +def _extract_token_count(token_field): + if isinstance(token_field, int): + return token_field + if hasattr(token_field, 'total'): + return token_field.total + return 0 # safe default clearly stated def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None: """ @@ -274,16 +280,20 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> sampling_events = recorder.get_events("sampling") for event in sampling_events: if "usage" in event.data: - usage_events.append(dict(event.data["usage"])) + usage_events.append(event.data["usage"]) + logger.info(f"Found {len(usage_events)}/{len(sampling_events)} sampling events with usage data") + if usage_events: # Sum up the usage of all samples (assumes the usage is the same for all samples) total_usage = { - key: sum(u[key] if u[key] is not None else 0 for u in usage_events) - for key in usage_events[0] + key: sum(_extract_token_count(getattr(u, key, 0)) for u in usage_events) + for key in ['completion_tokens', 'prompt_tokens', 'total_tokens'] # <--- Minimal explicit fix here } + total_usage_str = "\n".join(f"{key}: {value:,}" for key, value in total_usage.items()) logger.info(f"Token usage from {len(usage_events)} sampling events:\n{total_usage_str}") + for key, value in total_usage.items(): keyname = f"usage_{key}" if keyname not in result: @@ -293,7 +303,6 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> f"Usage key {keyname} already exists in result, not adding {keyname}" ) - def main() -> None: parser = get_parser() args = cast(OaiEvalArguments, parser.parse_args(sys.argv[1:])) From f51ea4631ac767ae50e700b4c41d31082b470020 Mon Sep 17 00:00:00 2001 From: Elon Demirok Date: Sun, 11 May 2025 06:58:00 +0000 Subject: [PATCH 2/3] Adds pre-commit formatting changes that apply to the PR scope --- evals/cli/oaieval.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py index da6b6ce01f..b5323843ae 100644 --- a/evals/cli/oaieval.py +++ b/evals/cli/oaieval.py @@ -227,7 +227,9 @@ def to_number(x: str) -> Union[int, float, str]: try: add_token_usage_to_result(result, recorder) except Exception as e: - logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.") + logger.error( + f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected." + ) recorder.record_final_report(result) if not (args.dry_run or args.local_run): @@ -265,13 +267,15 @@ def build_recorder( run_spec=run_spec, ) -def _extract_token_count(token_field): + +def _extract_token_count(token_field: Any) -> int: if isinstance(token_field, int): return token_field - if hasattr(token_field, 'total'): + if hasattr(token_field, "total"): return token_field.total return 0 # safe default clearly stated + def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None: """ Add token usage from logged sampling events to the result dictionary from the recorder. @@ -288,7 +292,7 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> # Sum up the usage of all samples (assumes the usage is the same for all samples) total_usage = { key: sum(_extract_token_count(getattr(u, key, 0)) for u in usage_events) - for key in ['completion_tokens', 'prompt_tokens', 'total_tokens'] # <--- Minimal explicit fix here + for key in ["completion_tokens", "prompt_tokens", "total_tokens"] } total_usage_str = "\n".join(f"{key}: {value:,}" for key, value in total_usage.items()) @@ -303,6 +307,7 @@ def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> f"Usage key {keyname} already exists in result, not adding {keyname}" ) + def main() -> None: parser = get_parser() args = cast(OaiEvalArguments, parser.parse_args(sys.argv[1:])) From 809156114ee32173786382e9892e0ab460dfbac4 Mon Sep 17 00:00:00 2001 From: Elon Demirok Date: Sun, 11 May 2025 06:59:09 +0000 Subject: [PATCH 3/3] Remove extra empty line --- evals/cli/oaieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py index b5323843ae..5eed95b08d 100644 --- a/evals/cli/oaieval.py +++ b/evals/cli/oaieval.py @@ -275,7 +275,6 @@ def _extract_token_count(token_field: Any) -> int: return token_field.total return 0 # safe default clearly stated - def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None: """ Add token usage from logged sampling events to the result dictionary from the recorder.