Commit 34e878c

Merge pull request #253 from RobotSail/fix-leaderboard
push-fix
2 parents: 3725b37 + ffe9c94

File tree: 2 files changed (+179, −8)

  scripts/evaluate_best_checkpoint.py
  src/instructlab/eval/leaderboard.py

scripts/evaluate_best_checkpoint.py (177 additions, 6 deletions)
@@ -13,18 +13,56 @@
 import json

 # Third Party
+from rich import print
+from typing_extensions import Annotated
 import typer

 app = typer.Typer()


+def print_metrics(result: dict, checkpoint_name: str = None, prefix: str = ""):
+    """
+    Print formatted metrics for a checkpoint result.
+
+    Args:
+        result: The evaluation result dictionary
+        checkpoint_name: Optional checkpoint name to display
+        prefix: Optional prefix for each line
+    """
+    if checkpoint_name:
+        print(f"{prefix}[bold]Leaderboard results[/bold]: {checkpoint_name}")
+    print(f"{prefix}Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"{prefix}BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"{prefix}GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"{prefix}IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(
+            f"{prefix}MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+        )
+    if "leaderboard_mmlu_pro" in result:
+        print(f"{prefix}MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"{prefix}MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
+
 @app.command()
-def main(
+def best_checkpoint(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
     output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
 ):
     """
-    Process files in the input directory and optionally save results to an output file.
+    Find the best checkpoint by evaluating all checkpoints in the input directory.
+    Processes all checkpoint subdirectories and ranks them by overall score.
     """
     if not input_dir.exists():
         typer.echo(f"Error: Input directory '{input_dir}' does not exist")
@@ -52,8 +90,10 @@ def main(
         typer.echo(f"Processing checkpoint: {checkpoint}")
         ckpt_output_file = checkpoint / "leaderboard_results.json"
         evaluator = LeaderboardV2Evaluator(
-            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
+            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=num_gpus
         )
+        if tasks:
+            evaluator.tasks = tasks
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
         typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +103,21 @@ def main(
         checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
-    for checkpoint_name, result in sorted_checkpoints:
+    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
         typer.echo(f"{'=' * 100}")
-        typer.echo(json.dumps(result, indent=2))
+        # Add [BEST CHECKPOINT] label for the first checkpoint
+        if i == 0:
+            checkpoint_display = (
+                f"{checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            )
+        else:
+            checkpoint_display = checkpoint_name
+        print_metrics(result, checkpoint_display)

     typer.echo(f"{'=' * 100}")
-    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
+    typer.echo(
+        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
+    )

     if output_file:
         typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +129,127 @@ def main(
     typer.echo("Processing complete!")


+@app.command()
+def evaluate(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
+    output_file: Optional[Path] = typer.Option(
+        None,
+        help="Custom output file path (default: input_dir/leaderboard_results.json)",
+    ),
+):
+    """
+    Evaluate a single checkpoint directory and save results to JSON file.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
+    from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+
+    typer.echo("done")
+
+    evaluator = LeaderboardV2Evaluator(
+        model_path=str(input_dir), num_gpus=num_gpus, eval_config={"batch_size": "auto"}
+    )
+    if tasks:
+        evaluator.tasks = tasks
+    result = evaluator.run()
+
+    # now just print out the checkpoint results
+    print_metrics(result, str(input_dir))
+
+    # Determine output file path
+    if output_file is None:
+        output_file = input_dir / "leaderboard_results.json"
+
+    # Check if file exists and warn user
+    if output_file.exists():
+        typer.echo(
+            f"Warning: Output file '{output_file}' already exists and will be overwritten"
+        )
+
+    output_file.write_text(json.dumps(result, indent=2))
+    typer.echo(f"Results saved to: {output_file}")
+
+
+@app.command()
+def find_best(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    show_all: bool = typer.Option(
+        False, "--show-all", help="Show scores for all checkpoints"
+    ),
+):
+    """
+    Find the best checkpoint by looking through leaderboard_results.json files.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    # Find all leaderboard_results.json files
+    result_files = list(input_dir.glob("**/leaderboard_results.json"))
+
+    if not result_files:
+        typer.echo("No leaderboard results found in any subdirectories")
+        raise typer.Exit(1)
+
+    # Load and compare results
+    best_score = -1
+    best_checkpoint = None
+    best_results = None
+    all_results = []
+
+    for result_file in result_files:
+        try:
+            results = json.loads(result_file.read_text())
+            score = results.get("overall_score", -1)
+            all_results.append((result_file.parent, score, results))
+
+            if score > best_score:
+                best_score = score
+                best_checkpoint = result_file.parent
+                best_results = results
+        except Exception as e:
+            typer.echo(f"Error reading {result_file}: {e}")
+            continue
+
+    if best_checkpoint is None:
+        typer.echo("No valid results found")
+        raise typer.Exit(1)
+
+    # Sort all results by score
+    all_results.sort(key=lambda x: x[1], reverse=True)
+
+    # Print all results if requested
+    if show_all:
+        print("\n[bold]All checkpoint results:[/bold]")
+        for checkpoint, score, results in all_results:
+            is_best = checkpoint == best_checkpoint
+            prefix = "→ " if is_best else "  "
+            print(f"\n{prefix}Checkpoint: {checkpoint}")
+            print_metrics(results, prefix="  ")
+    else:
+        # Print only best results
+        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
+        print_metrics(best_results)
+
+
 if __name__ == "__main__":
     app()
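
Note (not part of the patch): a minimal sketch of the result shape the new print_metrics helper consumes, inferred from the key checks in the function above. The scores, the task subset, and the checkpoint name "samples_1000" are invented for illustration.

    # Illustrative only: keys mirror what print_metrics looks for; numbers are made up.
    sample_result = {
        "overall_score": 0.4312,
        "leaderboard_bbh": {"score": 0.55},
        "leaderboard_ifeval": {"score": 0.61},
        "leaderboard_musr": {"score": 0.38},
    }

    # Prints "Overall: 43.12%" plus one line per task key present in the dict;
    # "samples_1000" is a hypothetical checkpoint name.
    print_metrics(sample_result, checkpoint_name="samples_1000")

Task sections absent from the dict are simply skipped, so the same helper works for full leaderboard runs and for runs restricted with --tasks.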

src/instructlab/eval/leaderboard.py (2 additions, 2 deletions)
@@ -251,8 +251,8 @@ def get_score_by_metric(score_dict: t.Dict[str, t.Any], metric: str) -> t.Any:
             extracted_value = value
             break

-    if not extracted_value:
-        if alias := score_dict.get("alias", None):
+    if extracted_value is None:
+        if alias := score_dict.get("alias", "[no-alias]"):
             error_msg = (
                 f"Failed to find a metric matching '{metric}' for task '{alias}'."
             )
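
The point of this small change, sketched outside the patch: with the old truthiness check, a metric that legitimately evaluates to 0.0 is falsy and gets routed into the "metric not found" path, and a missing alias meant the error message was never built. The new code treats only None as missing and falls back to a "[no-alias]" placeholder so the message can always be constructed.

    # Toy illustration (not from the patch): a real score of 0.0 vs. a missing metric.
    extracted_value = 0.0
    if not extracted_value:        # old guard: True, so 0.0 is misreported as missing
        pass
    if extracted_value is None:    # new guard: False, the 0.0 score is kept
        pass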
