13
13
import json
14
14
15
15
# Third Party
16
+ from rich import print
17
+ from typing_extensions import Annotated
16
18
import typer
17
19
18
20
app = typer .Typer ()
19
21
20
22
23
def print_metrics(result: dict, checkpoint_name: Optional[str] = None, prefix: str = ""):
    """
    Print formatted metrics for a checkpoint result.

    Output uses rich markup (e.g. ``[bold]``), which renders as styling when
    ``rich.print`` is in scope and prints literally under the builtin print.

    Args:
        result: The evaluation result dictionary. Must contain
            "overall_score"; per-task entries are printed only when present.
        checkpoint_name: Optional checkpoint name to display as a header line.
        prefix: Optional prefix prepended to each printed line.
    """
    if checkpoint_name:
        print(f"{prefix}[bold]Leaderboard results[/bold]: {checkpoint_name}")
    print(f"{prefix}Overall: {result['overall_score'] * 100:.2f}%")
    # Data-driven loop instead of one near-identical if-block per task:
    # each known leaderboard task is printed only when it appears in result.
    task_labels = [
        ("leaderboard_bbh", "BBH"),
        ("leaderboard_gpqa", "GPQA"),
        ("leaderboard_ifeval", "IFEval"),
        ("leaderboard_math_hard", "MATH-Hard"),
        ("leaderboard_mmlu_pro", "MMLU-Pro"),
        ("leaderboard_musr", "MUSR"),
    ]
    for task_key, label in task_labels:
        if task_key in result:
            print(f"{prefix}{label}: {result[task_key]['score'] * 100:.2f}%")
21
51
@app .command ()
22
- def main (
52
+ def best_checkpoint (
23
53
input_dir : Path = typer .Argument (..., help = "Input directory to process" ),
24
54
output_file : Optional [Path ] = typer .Option (None , help = "Optional output file path" ),
55
+ tasks : Annotated [
56
+ Optional [list [str ]],
57
+ typer .Option (
58
+ help = "Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
59
+ ),
60
+ ] = None ,
61
+ num_gpus : int = typer .Option (8 , help = "Number of GPUs to use for evaluation" ),
25
62
):
26
63
"""
27
- Process files in the input directory and optionally save results to an output file.
64
+ Find the best checkpoint by evaluating all checkpoints in the input directory.
65
+ Processes all checkpoint subdirectories and ranks them by overall score.
28
66
"""
29
67
if not input_dir .exists ():
30
68
typer .echo (f"Error: Input directory '{ input_dir } ' does not exist" )
@@ -52,8 +90,10 @@ def main(
52
90
typer .echo (f"Processing checkpoint: { checkpoint } " )
53
91
ckpt_output_file = checkpoint / "leaderboard_results.json"
54
92
evaluator = LeaderboardV2Evaluator (
55
- model_path = str (checkpoint ), output_file = ckpt_output_file , num_gpus = 8
93
+ model_path = str (checkpoint ), output_file = ckpt_output_file , num_gpus = num_gpus
56
94
)
95
+ if tasks :
96
+ evaluator .tasks = tasks
57
97
result = evaluator .run ()
58
98
checkpoint_results [checkpoint .name ] = result
59
99
typer .echo (f"Checkpoint { checkpoint .name } results: { result ['overall_score' ]} " )
@@ -63,12 +103,21 @@ def main(
63
103
checkpoint_results .items (), key = lambda x : x [1 ]["overall_score" ], reverse = True
64
104
)
65
105
typer .echo ("Sorted checkpoints by score:" )
66
- for checkpoint_name , result in sorted_checkpoints :
106
+ for i , ( checkpoint_name , result ) in enumerate ( sorted_checkpoints ) :
67
107
typer .echo (f"{ '=' * 100 } " )
68
- typer .echo (json .dumps (result , indent = 2 ))
108
+ # Add [BEST CHECKPOINT] label for the first checkpoint
109
+ if i == 0 :
110
+ checkpoint_display = (
111
+ f"{ checkpoint_name } [bold green][BEST CHECKPOINT][/bold green]"
112
+ )
113
+ else :
114
+ checkpoint_display = checkpoint_name
115
+ print_metrics (result , checkpoint_display )
69
116
70
117
typer .echo (f"{ '=' * 100 } " )
71
- typer .echo (f"Best checkpoint: { sorted_checkpoints [0 ][0 ]} " )
118
+ typer .echo (
119
+ f"Best checkpoint: { sorted_checkpoints [0 ][0 ]} [bold green][BEST CHECKPOINT][/bold green]"
120
+ )
72
121
73
122
if output_file :
74
123
typer .echo (f"Output will be saved to: { output_file } " )
@@ -80,5 +129,127 @@ def main(
80
129
typer .echo ("Processing complete!" )
81
130
82
131
132
@app.command()
def evaluate(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    tasks: Annotated[
        Optional[list[str]],
        typer.Option(
            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
        ),
    ] = None,
    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Custom output file path (default: input_dir/leaderboard_results.json)",
    ),
):
    """
    Evaluate a single checkpoint directory and save results to JSON file.

    Runs the LeaderboardV2 evaluation against the model found at ``input_dir``,
    prints the per-task scores, and writes the raw result dict as JSON.
    """
    # Validate the target path up front so we fail before the slow import below.
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)
    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
    # First Party
    from instructlab.eval.leaderboard import LeaderboardV2Evaluator

    typer.echo("done")

    lb_eval = LeaderboardV2Evaluator(
        model_path=str(input_dir), num_gpus=num_gpus, eval_config={"batch_size": "auto"}
    )
    # Restrict the run to an explicit task subset when one was requested.
    if tasks:
        lb_eval.tasks = tasks
    result = lb_eval.run()

    # Show the per-task scores for this checkpoint on the console.
    print_metrics(result, str(input_dir))

    # Default destination lives next to the checkpoint itself.
    destination = (
        output_file if output_file is not None else input_dir / "leaderboard_results.json"
    )

    # Warn (but proceed) when we are about to clobber an earlier run's output.
    if destination.exists():
        typer.echo(
            f"Warning: Output file '{destination}' already exists and will be overwritten"
        )

    destination.write_text(json.dumps(result, indent=2))
    typer.echo(f"Results saved to: {destination}")
186
+
187
+
188
@app.command()
def find_best(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    show_all: bool = typer.Option(
        False, "--show-all", help="Show scores for all checkpoints"
    ),
):
    """
    Find the best checkpoint by looking through leaderboard_results.json files.

    Scans every subdirectory of ``input_dir`` for previously saved results and
    reports the highest-scoring checkpoint (or all of them with --show-all).
    """
    # Guard clauses: bail out early on a bad target path.
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)
    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    # Collect every saved result file anywhere beneath the input directory.
    candidate_files = list(input_dir.glob("**/leaderboard_results.json"))
    if not candidate_files:
        typer.echo("No leaderboard results found in any subdirectories")
        raise typer.Exit(1)

    # Walk the files, keeping every (dir, score, payload) triple and tracking
    # the highest-scoring checkpoint as we go.
    top_score = -1
    top_dir = None
    top_payload = None
    all_results = []
    for candidate in candidate_files:
        try:
            payload = json.loads(candidate.read_text())
            overall = payload.get("overall_score", -1)
        except Exception as e:  # best-effort: skip unreadable/invalid files
            typer.echo(f"Error reading {candidate}: {e}")
            continue
        all_results.append((candidate.parent, overall, payload))
        if overall > top_score:
            top_score, top_dir, top_payload = overall, candidate.parent, payload

    if top_dir is None:
        typer.echo("No valid results found")
        raise typer.Exit(1)

    # Highest score first.
    all_results.sort(key=lambda entry: entry[1], reverse=True)

    if show_all:
        print("\n[bold]All checkpoint results:[/bold]")
        for checkpoint, score, payload in all_results:
            # Arrow marker singles out the winning checkpoint in the listing.
            marker = "→ " if checkpoint == top_dir else "  "
            print(f"\n{marker}Checkpoint: {checkpoint}")
            print_metrics(payload, prefix="    ")
    else:
        # Only report the winner.
        print(f"\n[bold]Best checkpoint found[/bold]: {top_dir}")
        print_metrics(top_payload)
252
+
253
+
83
254
# Entry point: dispatch to the Typer CLI app when run as a script.
if __name__ == "__main__":
    app()
0 commit comments