Commit 34e878c

Merge pull request #253 from RobotSail/fix-leaderboard
push-fix
2 parents: 3725b37 + ffe9c94

File tree: 2 files changed (+179, −8)

  scripts/evaluate_best_checkpoint.py
  src/instructlab/eval/leaderboard.py

scripts/evaluate_best_checkpoint.py (177 additions, 6 deletions)
@@ -13,18 +13,56 @@
 import json

 # Third Party
+from rich import print
+from typing_extensions import Annotated
 import typer

 app = typer.Typer()


+def print_metrics(result: dict, checkpoint_name: str = None, prefix: str = ""):
+    """
+    Print formatted metrics for a checkpoint result.
+
+    Args:
+        result: The evaluation result dictionary
+        checkpoint_name: Optional checkpoint name to display
+        prefix: Optional prefix for each line
+    """
+    if checkpoint_name:
+        print(f"{prefix}[bold]Leaderboard results[/bold]: {checkpoint_name}")
+    print(f"{prefix}Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"{prefix}BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"{prefix}GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"{prefix}IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(
+            f"{prefix}MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+        )
+    if "leaderboard_mmlu_pro" in result:
+        print(f"{prefix}MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"{prefix}MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
+
 @app.command()
-def main(
+def best_checkpoint(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
     output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
 ):
     """
-    Process files in the input directory and optionally save results to an output file.
+    Find the best checkpoint by evaluating all checkpoints in the input directory.
+    Processes all checkpoint subdirectories and ranks them by overall score.
     """
     if not input_dir.exists():
         typer.echo(f"Error: Input directory '{input_dir}' does not exist")
@@ -52,8 +90,10 @@ def main(
         typer.echo(f"Processing checkpoint: {checkpoint}")
         ckpt_output_file = checkpoint / "leaderboard_results.json"
         evaluator = LeaderboardV2Evaluator(
-            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
+            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=num_gpus
         )
+        if tasks:
+            evaluator.tasks = tasks
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
         typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +103,21 @@ def main(
         checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
-    for checkpoint_name, result in sorted_checkpoints:
+    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
         typer.echo(f"{'=' * 100}")
-        typer.echo(json.dumps(result, indent=2))
+        # Add [BEST CHECKPOINT] label for the first checkpoint
+        if i == 0:
+            checkpoint_display = (
+                f"{checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            )
+        else:
+            checkpoint_display = checkpoint_name
+        print_metrics(result, checkpoint_display)

     typer.echo(f"{'=' * 100}")
-    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
+    typer.echo(
+        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
+    )

     if output_file:
         typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +129,127 @@ def main(
     typer.echo("Processing complete!")


+@app.command()
+def evaluate(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    tasks: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Specific tasks to evaluate (e.g., 'leaderboard_bbh', 'leaderboard_gpqa')"
+        ),
+    ] = None,
+    num_gpus: int = typer.Option(8, help="Number of GPUs to use for evaluation"),
+    output_file: Optional[Path] = typer.Option(
+        None,
+        help="Custom output file path (default: input_dir/leaderboard_results.json)",
+    ),
+):
+    """
+    Evaluate a single checkpoint directory and save results to JSON file.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
+    from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+
+    typer.echo("done")
+
+    evaluator = LeaderboardV2Evaluator(
+        model_path=str(input_dir), num_gpus=num_gpus, eval_config={"batch_size": "auto"}
+    )
+    if tasks:
+        evaluator.tasks = tasks
+    result = evaluator.run()
+
+    # now just print out the checkpoint results
+    print_metrics(result, str(input_dir))
+
+    # Determine output file path
+    if output_file is None:
+        output_file = input_dir / "leaderboard_results.json"
+
+    # Check if file exists and warn user
+    if output_file.exists():
+        typer.echo(
+            f"Warning: Output file '{output_file}' already exists and will be overwritten"
+        )
+
+    output_file.write_text(json.dumps(result, indent=2))
+    typer.echo(f"Results saved to: {output_file}")
+
+
+@app.command()
+def find_best(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    show_all: bool = typer.Option(
+        False, "--show-all", help="Show scores for all checkpoints"
+    ),
+):
+    """
+    Find the best checkpoint by looking through leaderboard_results.json files.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    # Find all leaderboard_results.json files
+    result_files = list(input_dir.glob("**/leaderboard_results.json"))
+
+    if not result_files:
+        typer.echo("No leaderboard results found in any subdirectories")
+        raise typer.Exit(1)
+
+    # Load and compare results
+    best_score = -1
+    best_checkpoint = None
+    best_results = None
+    all_results = []
+
+    for result_file in result_files:
+        try:
+            results = json.loads(result_file.read_text())
+            score = results.get("overall_score", -1)
+            all_results.append((result_file.parent, score, results))
+
+            if score > best_score:
+                best_score = score
+                best_checkpoint = result_file.parent
+                best_results = results
+        except Exception as e:
+            typer.echo(f"Error reading {result_file}: {e}")
+            continue
+
+    if best_checkpoint is None:
+        typer.echo("No valid results found")
+        raise typer.Exit(1)
+
+    # Sort all results by score
+    all_results.sort(key=lambda x: x[1], reverse=True)
+
+    # Print all results if requested
+    if show_all:
+        print("\n[bold]All checkpoint results:[/bold]")
+        for checkpoint, score, results in all_results:
+            is_best = checkpoint == best_checkpoint
+            prefix = "→ " if is_best else "  "
+            print(f"\n{prefix}Checkpoint: {checkpoint}")
+            print_metrics(results, prefix="  ")
+    else:
+        # Print only best results
+        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
+        print_metrics(best_results)
+
+
 if __name__ == "__main__":
     app()
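
Note (not part of the patch): a minimal sketch of the result shape the new print_metrics helper consumes, inferred from the key checks in the function above. The scores, the task subset, and the checkpoint name "samples_1000" are invented for illustration.

    # Illustrative only: keys mirror what print_metrics looks for; numbers are made up.
    sample_result = {
        "overall_score": 0.4312,
        "leaderboard_bbh": {"score": 0.55},
        "leaderboard_ifeval": {"score": 0.61},
        "leaderboard_musr": {"score": 0.38},
    }

    # Prints "Overall: 43.12%" plus one line per task key present in the dict;
    # "samples_1000" is a hypothetical checkpoint name.
    print_metrics(sample_result, checkpoint_name="samples_1000")

Task sections absent from the dict are simply skipped, so the same helper works for full leaderboard runs and for runs restricted with --tasks.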

src/instructlab/eval/leaderboard.py (2 additions, 2 deletions)
@@ -251,8 +251,8 @@ def get_score_by_metric(score_dict: t.Dict[str, t.Any], metric: str) -> t.Any:
             extracted_value = value
             break

-    if not extracted_value:
-        if alias := score_dict.get("alias", None):
+    if extracted_value is None:
+        if alias := score_dict.get("alias", "[no-alias]"):
             error_msg = (
                 f"Failed to find a metric matching '{metric}' for task '{alias}'."
             )
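
The point of this small change, sketched outside the patch: with the old truthiness check, a metric that legitimately evaluates to 0.0 is falsy and gets routed into the "metric not found" path, and a missing alias meant the error message was never built. The new code treats only None as missing and falls back to a "[no-alias]" placeholder so the message can always be constructed.

    # Toy illustration (not from the patch): a real score of 0.0 vs. a missing metric.
    extracted_value = 0.0
    if not extracted_value:        # old guard: True, so 0.0 is misreported as missing
        pass
    if extracted_value is None:    # new guard: False, the 0.0 score is kept
        pass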
