1 change: 1 addition & 0 deletions src/lighteval/main_inspect.py
@@ -473,5 +473,6 @@ def eval(
task = "lighteval|ifeval|0"
task = "lighteval|gpqa|0"
task = "lighteval|ifbench_test|0"
task = "lighteval|mmlu_pro|0"
model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
eval(models=[model], tasks=task)
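
Note that the successive task = ... assignments in this debug block overwrite one another, so only the last one (the new mmlu_pro entry) is actually evaluated. For reference, a minimal sketch of calling the same entrypoint directly for the new task, assuming lighteval.main_inspect exposes the eval function shown in this hunk:

from lighteval.main_inspect import eval

# Select the newly added task by its lighteval identifier and run it against
# the provider-hosted model used in the hunk above.
eval(
    models=["hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"],
    tasks="lighteval|mmlu_pro|0",
)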
81 changes: 81 additions & 0 deletions src/lighteval/tasks/tasks/mmlu_pro.py
@@ -0,0 +1,81 @@
"""
name:
MMLU Pro
dataset:
TIGER-Lab/MMLU-Pro
abstract:
MMLU-Pro is a more robust and challenging massive multi-task understanding
dataset tailored to more rigorously benchmark the capabilities of large
language models. It contains 12K complex questions across various disciplines.
languages:
english
tags:
general-knowledge, knowledge, multiple-choice
paper:
https://arxiv.org/abs/2406.01574
"""

from string import ascii_uppercase

from inspect_ai.dataset import Sample
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of the option letters shown below. Think step by step before answering.
{question}
{choices}
Answer:""".strip()


def mmlu_pro_prompt_function(line, task_name: str | None = None):
    # Label each option with a letter (A, B, C, ...) and join them for the prompt.
    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(ascii_uppercase, line["options"])])

    query = TEMPLATE.format(
        question=line["question"],
        choices=choices,
    )

    return Doc(
        task_name=task_name,
        query=query,
        # One letter label per option (not per character of the joined string).
        choices=ascii_uppercase[: len(line["options"])],
        gold_index=line["answer_index"],
        instruction=query,
    )


def record_to_sample(record):
    return Sample(input=record["question"], target=record["answer"], choices=record["options"])


mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    prompt_function=mmlu_pro_prompt_function,
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    suite=("lighteval",),
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
    evaluation_splits=("test",),
    few_shots_split="validation",
    metrics=[Metrics.gpqa_instruct_metric],
)

TASKS_TABLE = [mmlu_pro]
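
For reference, a minimal sketch of what the prompt function above produces for a toy record. The field names (question, options, answer_index) mirror the TIGER-Lab/MMLU-Pro schema used by the task, and the import path assumes the module lands at src/lighteval/tasks/tasks/mmlu_pro.py as in this diff; real MMLU-Pro rows carry up to ten options.

from lighteval.tasks.tasks.mmlu_pro import mmlu_pro_prompt_function

# Toy record with only the fields the prompt function reads.
record = {
    "question": "Which planet is known as the Red Planet?",
    "options": ["Venus", "Mars", "Jupiter", "Saturn"],
    "answer_index": 1,
}

doc = mmlu_pro_prompt_function(record, task_name="lighteval|mmlu_pro|0")
print(doc.query)       # the TEMPLATE filled with the question and lettered options
print(doc.choices)     # "ABCD" -- one letter label per option
print(doc.gold_index)  # 1, i.e. "Mars"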