@@ -1,11 +1,10 @@
 # Standard
-from enum import StrEnum
+from copy import deepcopy
 from pathlib import Path
 import gc
 import json
 import os
 import typing as t
-from copy import deepcopy
 
 # Third Party
 from accelerate import Accelerator
@@ -18,24 +17,40 @@
 # Local
 from .evaluator import Evaluator
 
+# Since StrEnum wasn't part of the STL until Python3.11, we must do this
+try:
+    # Standard
+    from enum import StrEnum
+except ImportError:
+    # Third Party
+    from strenum import StrEnum  # type: ignore[no-redef]
+
+# And do the same thing to bring in NotRequired from typing
+try:
+    # Standard
+    from typing import NotRequired
+except ImportError:
+    # Third Party
+    from typing_extensions import NotRequired
+
 
 class ParsedScores(t.TypedDict):
     """
     Just an ordinary dict that contains both the overall score as well as per-subtask scores.
     """
 
     score: float
-    subtasks: t.NotRequired[t.Dict[str, float]]
+    subtasks: NotRequired[t.Dict[str, float]]
 
 
 class LeaderboardV2EvalResult(t.TypedDict):
     overall_score: float
-    leaderboard_gpqa: t.NotRequired[ParsedScores]
-    leaderboard_ifeval: t.NotRequired[ParsedScores]
-    leaderboard_bbh: t.NotRequired[ParsedScores]
-    leaderboard_mmlu_pro: t.NotRequired[ParsedScores]
-    leaderboard_musr: t.NotRequired[ParsedScores]
-    leaderboard_math_hard: t.NotRequired[ParsedScores]
+    leaderboard_gpqa: NotRequired[ParsedScores]
+    leaderboard_ifeval: NotRequired[ParsedScores]
+    leaderboard_bbh: NotRequired[ParsedScores]
+    leaderboard_mmlu_pro: NotRequired[ParsedScores]
+    leaderboard_musr: NotRequired[ParsedScores]
+    leaderboard_math_hard: NotRequired[ParsedScores]
 
 
 class LeaderboardV2Tasks(StrEnum):
@@ -94,7 +109,7 @@ class TaskGrouping(t.TypedDict):
 }
 
 # 1. Add OpenAI configuration defaults
-DEFAULT_OPENAI_CONFIG = {
+DEFAULT_OPENAI_CONFIG: t.Dict[str, t.Any] = {
     "max_tokens": 768,
     "temperature": 0.0,
     "seed": 1337,
@@ -194,9 +209,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
-    # Third Party
-    import torch.multiprocessing as mp
-
     num_processes = args["num_gpus"]
 
     # Create the context and queue within the same context
@@ -222,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -290,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores
 
 
@@ -343,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
@@ -369,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -382,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -451,9 +463,9 @@ def get_scores_from_result_dicts(
         # this is just a sanity check step
         benchmarks_already_covered = set(parsed_scores.keys())
         overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-        assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-            f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-        )
+        assert (
+            len(benchmarks_already_covered & benchmarks_to_parse) == 0
+        ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
 
         # now actually add them
         for benchmark in benchmarks_to_parse:
@@ -486,12 +498,15 @@ def validate_output_path(output_file: str) -> None:
 
         # Test if we can write to the file by opening it in append mode
         # We don't actually write anything
-        output_path.open("a").close()
+        with output_path.open("a", encoding="utf-8") as _:
+            pass
 
-    except PermissionError:
-        raise ValueError(f"Permission denied: Cannot write to {output_file}")
-    except OSError as e:
-        raise ValueError(f"Invalid output path: {output_file}. Error: {str(e)}")
+    except PermissionError as pe:
+        raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
+    except OSError as ose:
+        raise ValueError(
+            f"Invalid output path: {output_file}. Error: {str(ose)}"
+        ) from ose
 
 
 def validate_leaderboard_v2_tasks(tasks: t.List[str]):
@@ -658,7 +673,7 @@ def save_to_file(self, output_file: t.Optional[str] = None) -> None:
         output_dir = os.path.dirname(output_file)
         if output_dir:
             os.makedirs(output_dir, exist_ok=True)
-        with open(output_file, "w") as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             json.dump(self._results, f, indent=2)
 
     def run(
@@ -739,15 +754,6 @@ def run(
         # validation logic
         validate_leaderboard_v2_tasks(tasks)
 
-        # Only validate GPU requirements when not using an API endpoint
-        if not api_endpoint:
-            if not num_gpus:
-                num_gpus = cuda.device_count()
-            if num_gpus <= 0 or num_gpus > cuda.device_count():
-                raise ValueError(
-                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-                )
-
         if output_file:
             validate_output_path(output_file)
 
@@ -767,6 +773,14 @@ def run(
             openai_results = evaluate_with_openai(args_openai)
             self._lm_eval_results.append(openai_results)
         else:
+            # Only validate GPU requirements when not using an API endpoint
+            if not num_gpus:
+                num_gpus = cuda.device_count()
+            if num_gpus <= 0 or num_gpus > cuda.device_count():
+                raise ValueError(
+                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                )
+
             # Only run local evaluation if not using OpenAI API
             if vllm_tasks := grouped_tasks["vllm"]:
                 args_vllm: LeaderboardArgs = {
@@ -823,11 +837,11 @@ def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
 
     # Add base_url if provided
     if base_url:
-        model_args["base_url"] = base_url
+        model_args.update({"base_url": base_url})
 
     # Add API key if provided
     if api_key:
-        model_args["api_key"] = api_key
+        model_args.update({"api_key": api_key})
 
     # Add any remaining backend config options
     model_args.update(backend_config)