1 change: 1 addition & 0 deletions src/lighteval/main_inspect.py
@@ -473,5 +473,6 @@ def eval(
task = "lighteval|ifeval|0"
task = "lighteval|gpqa|0"
task = "lighteval|ifbench_test|0"
task = "lighteval|mmlu_pro|0"
model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
eval(models=[model], tasks=task)
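
Note that the successive task = ... assignments in this debug block overwrite one another, so only the last one (the new mmlu_pro entry) is actually evaluated. For reference, a minimal sketch of calling the same entrypoint directly for the new task, assuming lighteval.main_inspect exposes the eval function shown in this hunk:

from lighteval.main_inspect import eval

# Select the newly added task by its lighteval identifier and run it against
# the provider-hosted model used in the hunk above.
eval(
    models=["hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"],
    tasks="lighteval|mmlu_pro|0",
)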
81 changes: 81 additions & 0 deletions src/lighteval/tasks/tasks/mmlu_pro.py
@@ -0,0 +1,81 @@
"""
name:
MMLU Pro
dataset:
TIGER-Lab/MMLU-Pro
abstract:
MMLU-Pro is a more robust and challenging massive multi-task understanding
dataset tailored to more rigorously benchmark the capabilities of large
language models. It contains 12K complex questions across various disciplines.
languages:
english
tags:
general-knowledge, knowledge, multiple-choice
paper:
https://arxiv.org/abs/2406.01574
"""

from string import ascii_uppercase

from inspect_ai.dataset import Sample
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of the option letters shown below. Think step by step before answering.
{question}
{choices}
Answer:""".strip()


def mmlu_pro_prompt_function(line, task_name: str | None = None):
    # Label each option with a letter (A, B, C, ...) and join them for the prompt.
    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(ascii_uppercase, line["options"])])

    query = TEMPLATE.format(
        question=line["question"],
        choices=choices,
    )

    return Doc(
        task_name=task_name,
        query=query,
        # One letter label per option (not per character of the joined string).
        choices=ascii_uppercase[: len(line["options"])],
        gold_index=line["answer_index"],
        instruction=query,
    )


def record_to_sample(record):
    return Sample(input=record["question"], target=record["answer"], choices=record["options"])


mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    prompt_function=mmlu_pro_prompt_function,
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    suite=("lighteval",),
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
    evaluation_splits=("test",),
    few_shots_split="validation",
    metrics=[Metrics.gpqa_instruct_metric],
)

TASKS_TABLE = [mmlu_pro]
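
For reference, a minimal sketch of what the prompt function above produces for a toy record. The field names (question, options, answer_index) mirror the TIGER-Lab/MMLU-Pro schema used by the task, and the import path assumes the module lands at src/lighteval/tasks/tasks/mmlu_pro.py as in this diff; real MMLU-Pro rows carry up to ten options.

from lighteval.tasks.tasks.mmlu_pro import mmlu_pro_prompt_function

# Toy record with only the fields the prompt function reads.
record = {
    "question": "Which planet is known as the Red Planet?",
    "options": ["Venus", "Mars", "Jupiter", "Saturn"],
    "answer_index": 1,
}

doc = mmlu_pro_prompt_function(record, task_name="lighteval|mmlu_pro|0")
print(doc.query)       # the TEMPLATE filled with the question and lettered options
print(doc.choices)     # "ABCD" -- one letter label per option
print(doc.gold_index)  # 1, i.e. "Mars"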