"""
name:
MMLU Pro

dataset:
TIGER-Lab/MMLU-Pro

abstract:
MMLU-Pro is a more robust and challenging multi-task understanding benchmark.
It extends MMLU with up to ten answer options per question and a larger share
of reasoning-focused questions, covering roughly 12,000 questions across
various disciplines.

languages:
english

tags:
general-knowledge

paper:
https://arxiv.org/abs/2406.01574
"""
from string import ascii_uppercase

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCDEFGHIJ. Think step by step before answering.

{question}

{choices}

Answer:""".strip()

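# For illustration, a rendered prompt for a hypothetical three-option row
# looks like this (instruction line wrapped here for readability; it is a
# single line in the actual prompt):
#
#   Answer the following multiple choice question. The last line of your
#   response should be of the following format: 'Answer: $LETTER' (without
#   quotes) where LETTER is one of ABCDEFGHIJ. Think step by step before
#   answering.
#
#   2 + 2 = ?
#
#   A: 3
#   B: 4
#   C: 5
#
#   Answer:
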
def mmlu_pro_prompt_function(line, task_name: str = None):
    # MMLU-Pro rows carry up to ten options; label each with a letter A, B, C, ...
    letters = ascii_uppercase[: len(line["options"])]
    choices = "\n".join(f"{letter}: {choice}" for letter, choice in zip(letters, line["options"]))

    query = TEMPLATE.format(
        question=line["question"],
        choices=choices,
    )

    return Doc(
        task_name=task_name,
        query=query,
        # Score against the option letters, not the option texts. Note this must
        # be the number of options, not the length of the joined choices string.
        choices=list(letters),
        gold_index=line["answer_index"],
        instruction=query,
    )

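# Minimal sketch of how the prompt function maps a dataset row to a Doc. The
# row below is a made-up example following the TIGER-Lab/MMLU-Pro schema
# ("question", "options", "answer_index"):
#
#   doc = mmlu_pro_prompt_function(
#       {"question": "2 + 2 = ?", "options": ["3", "4", "5"], "answer_index": 1},
#       task_name="mmlu_pro",
#   )
#   # doc.choices    == ["A", "B", "C"]
#   # doc.gold_index == 1, i.e. option "B"
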
mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    prompt_function=mmlu_pro_prompt_function,
    suite=("lighteval",),
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
    evaluation_splits=("test",),
    few_shots_split="validation",
    metrics=[Metrics.gpqa_instruct_metric],
)

TASKS_TABLE = [mmlu_pro]
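
# Assuming this module is registered with lighteval as a custom task file, the
# task is selected via the usual "suite|task|few_shot|truncate" spec. Exact CLI
# syntax and model-argument names vary across lighteval versions; the model
# and path below are placeholders:
#
#   lighteval accelerate "model_name=<your-model>" "lighteval|mmlu_pro|0|0" \
#       --custom-tasks path/to/this_file.py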