Commit bae9544

belebele split

1 parent bc5a564

File tree

1 file changed (+171, -4 lines)


src/lighteval/tasks/extended/misc/instruct.py

Lines changed: 171 additions & 4 deletions
@@ -33,6 +33,7 @@
 )
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.tasks import LangCodeLanguage, iso_639_3_ind_to_iso_639_3_macro
 from lighteval.tasks.requests import Doc
 from lighteval.utils.language import Language
 
@@ -48,6 +49,11 @@
 }
 
 
+def belebele_prompt_en_instruct(line, task_name: str = None):
+    line["dialect"] = "eng_Latn"
+    return belebele_prompt(line, task_name)
+
+
 def belebele_prompt(line, task_name: str = None):
     lang_to_template = {
         "eng_Latn": "Given the following passage, query, and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nPassage:\n{Passage}\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
@@ -60,7 +66,7 @@ def belebele_prompt(line, task_name: str = None):
 
     gold_index = int(line["correct_answer_num"]) - 1
     choices = [line["mc_answer1"], line["mc_answer2"], line["mc_answer3"], line["mc_answer4"]]
-    query_template = lang_to_template.get(line["dialect"], "eng_Latn")
+    query_template = lang_to_template[line["dialect"]]
     query = query_template.format(
         A=choices[0],
         B=choices[1],
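
The one-line change above is more than cosmetic: the old dict.get fallback returned the literal string "eng_Latn", not the English template, for any dialect missing from lang_to_template, so str.format on it produced a query that was just "eng_Latn" with no passage or choices at all. Direct indexing makes a missing template fail loudly instead. A toy illustration (one-entry dict, not the real template table):

lang_to_template = {"eng_Latn": "Passage:\n{Passage}\nQuery:\n{Question}"}

# Old behavior: the *string* "eng_Latn" came back as the "template",
# so .format() rendered a prompt containing no passage or question.
assert lang_to_template.get("fra_Latn", "eng_Latn") == "eng_Latn"

# New behavior: a missing template raises KeyError at prompt-construction time.
try:
    lang_to_template["fra_Latn"]
except KeyError:
    print("missing template caught early")
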
@@ -80,9 +86,9 @@ def belebele_prompt(line, task_name: str = None):
     )
 
 
-BELEBELE_TASKS = [
+BELEBELE_TASKS_NATIVE_INSTRUCT = [
     LightevalTaskConfig(
-        name=f"belebele_instruct_{lang}_Latn",
+        name=f"belebele_native_instruct_{lang}_Latn",
         prompt_function=belebele_prompt,
         suite=["extended"],
         hf_repo="facebook/belebele",
@@ -123,7 +129,168 @@ def belebele_prompt(line, task_name: str = None):
         "spa",
     ]
 ]
-TASKS_TABLE.extend(BELEBELE_TASKS)
+
+
+BELEBELE_TASKS_EN_INSTRUCT = [
+    LightevalTaskConfig(
+        name=f"belebele_en_instruct_{lang}",
+        prompt_function=belebele_prompt_en_instruct,
+        suite=["extended"],
+        hf_repo="facebook/belebele",
+        hf_subset=lang,
+        evaluation_splits=["test"],
+        hf_avail_splits=["test"],
+        few_shots_split=None,
+        few_shots_select=None,
+        generation_size=32768,  # needed for reasoning models like R1
+        metric=[
+            SampleLevelMetric(
+                metric_name="pass@1:1_samples",
+                sample_level_fn=PassAtK(
+                    k=1,
+                    n=1,
+                    sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric(
+                        language=iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(lang).to_alpha3()],
+                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        precision=6,
+                    ).sample_level_fn([ref], [pred], doc),
+                ).compute,
+                category=MetricCategory.GENERATIVE_SAMPLING,
+                use_case=MetricUseCase.REASONING,
+                corpus_level_fn=np.mean,
+                higher_is_better=True,
+            )
+        ],
+        stop_sequence=[],  # no stop sequence, will use eos token
+        trust_dataset=True,
+        version=1,
+    )
+    for lang in [
+        "acm_Arab",
+        "arz_Arab",
+        "ceb_Latn",
+        "fin_Latn",
+        "hin_Deva",
+        "ita_Latn",
+        "khm_Khmr",
+        "lvs_Latn",
+        "npi_Deva",
+        "pol_Latn",
+        "slv_Latn",
+        "swe_Latn",
+        # "tso_Latn",
+        # "xho_Latn",
+        "afr_Latn",
+        "asm_Beng",
+        "ces_Latn",
+        "fra_Latn",
+        "hin_Latn",
+        "jav_Latn",
+        # "kin_Latn",
+        "mal_Mlym",
+        "npi_Latn",
+        "por_Latn",
+        # "sna_Latn",
+        "swh_Latn",
+        "tur_Latn",
+        "yor_Latn",
+        "als_Latn",
+        "azj_Latn",
+        "ckb_Arab",
+        # "fuv_Latn",
+        "hrv_Latn",
+        "jpn_Jpan",
+        "kir_Cyrl",
+        "mar_Deva",
+        # "nso_Latn",
+        "snd_Arab",
+        "tam_Taml",
+        "ukr_Cyrl",
+        "zho_Hans",
+        "amh_Ethi",
+        # "bam_Latn",
+        "dan_Latn",
+        # "gaz_Latn",
+        "hun_Latn",
+        # "kac_Latn",
+        "kor_Hang",
+        "mkd_Cyrl",
+        # "nya_Latn",
+        "ron_Latn",
+        "som_Latn",
+        "tel_Telu",
+        "urd_Arab",
+        "zho_Hant",
+        "apc_Arab",
+        "ben_Beng",
+        "deu_Latn",
+        # "grn_Latn",
+        "hye_Armn",
+        "kan_Knda",
+        "lao_Laoo",
+        "mlt_Latn",
+        "ory_Orya",
+        "rus_Cyrl",
+        # "sot_Latn",
+        "tgk_Cyrl",
+        "urd_Latn",
+        "zsm_Latn",
+        "arb_Arab",
+        "ben_Latn",
+        "ell_Grek",
+        "guj_Gujr",
+        # "ibo_Latn",
+        "kat_Geor",
+        # "lin_Latn",
+        # "mri_Latn",
+        "pan_Guru",
+        # "shn_Mymr",
+        "spa_Latn",
+        "tgl_Latn",
+        "uzn_Latn",
+        # "zul_Latn",
+        "arb_Latn",
+        # "bod_Tibt",
+        "eng_Latn",
+        # "hat_Latn",
+        # "ilo_Latn",
+        "kaz_Cyrl",
+        "lit_Latn",
+        "mya_Mymr",
+        "pbt_Arab",
+        "sin_Latn",
+        "srp_Cyrl",
+        "tha_Thai",
+        "vie_Latn",
+        "ars_Arab",
+        "bul_Cyrl",
+        "est_Latn",
+        # "hau_Latn",
+        "ind_Latn",
+        # "kea_Latn",
+        # "lug_Latn",
+        "nld_Latn",
+        "pes_Arab",
+        "sin_Sinh",
+        # "ssw_Latn",
+        # "tir_Ethi",
+        "war_Latn",
+        "ary_Arab",
+        "cat_Latn",
+        "eus_Latn",
+        "heb_Hebr",
+        "isl_Latn",
+        # "khk_Cyrl",
+        # "luo_Latn",
+        "nob_Latn",
+        "plt_Latn",
+        "slk_Latn",
+        # "sun_Latn",
+        # "tsn_Latn",
+        # "wol_Latn",
+    ]
+]
+TASKS_TABLE.extend(BELEBELE_TASKS_NATIVE_INSTRUCT + BELEBELE_TASKS_EN_INSTRUCT)
 
 
 class GlobalMMLUPrompt:
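
Two details of the new task block are easy to miss. First, the lang=lang default argument on the scoring lambda pins each task to its own language at definition time; a bare lang reference inside the lambda body would late-bind to the comprehension variable and score every task with the last list entry. A minimal demonstration of the pitfall, independent of lighteval:

# Late binding: every lambda sees the final loop value...
broken = [lambda: lang for lang in ["fra_Latn", "jpn_Jpan"]]
assert [f() for f in broken] == ["jpn_Jpan", "jpn_Jpan"]

# ...while a default argument captures the value per iteration.
fixed = [lambda lang=lang: lang for lang in ["fra_Latn", "jpn_Jpan"]]
assert [f() for f in fixed] == ["fra_Latn", "jpn_Jpan"]

Second, with k=1 and n=1 the pass@1 metric reduces to the 0/1 score of the single generated sample, so the corpus-level np.mean is plain accuracy over extracted answers. A quick check against the standard unbiased pass@k estimator (a standalone sketch, not lighteval's PassAtK implementation):

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased estimator: 1 - C(n - c, k) / C(n, k), with c correct out of n samples.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

assert pass_at_k(n=1, c=1, k=1) == 1.0  # the single sample was correct
assert pass_at_k(n=1, c=0, k=1) == 0.0  # the single sample was wrong
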
