Skip to content

Commit 7eaff4d

Browse files
AdnanElAssadi56 authored and Samoed committed
Fixed Multilingual Classification Subset
1 parent d1e1a1b commit 7eaff4d

File tree

1 file changed

+16
-13
lines changed

1 file changed

+16
-13
lines changed

mteb/tasks/Classification/multilingual/human/MultilingualSentimentClassificationHumanSubset.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -67,31 +67,34 @@ class MultilingualSentimentClassificationHumanSubset(AbsTaskClassification, Mult
6767

6868
def load_data(self, **kwargs):
6969
"""Load human test subset + full original training data for each language"""
70-
# Load human evaluation subset (config-based)
70+
# Load human evaluation subset (unified with lang column)
7171
human_dataset = load_dataset(
7272
self.metadata_dict["dataset"]["path"],
7373
revision=self.metadata_dict["dataset"]["revision"],
7474
)
7575

76-
# Load full original training data
76+
# Load full original training data (unified with lang column)
7777
original_dataset = load_dataset("mteb/multilingual-sentiment-classification")
7878

79-
# Combine for each language config
79+
# Both datasets have unified structure with lang column
80+
# Split by language to create individual configs
8081
combined_dataset = {}
8182

82-
# Handle default config (all languages combined)
83-
if "default" in human_dataset:
84-
combined_dataset["default"] = DatasetDict({
85-
"train": original_dataset["default"]["train"],
86-
"test": human_dataset["default"]["test"]
87-
})
83+
# Filter training data by language
84+
train_data = original_dataset["train"]
85+
test_data = human_dataset["test"]
8886

89-
# Handle individual language configs
87+
# Create individual language configs
9088
for lang in ["eng", "ara", "nor", "rus"]:
91-
if lang in human_dataset:
89+
# Filter train data for this language
90+
train_lang_data = train_data.filter(lambda x: x["lang"] == lang)
91+
# Filter test data for this language
92+
test_lang_data = test_data.filter(lambda x: x["lang"] == lang)
93+
94+
if len(test_lang_data) > 0: # Only create config if we have test data
9295
combined_dataset[lang] = DatasetDict({
93-
"train": original_dataset[lang]["train"],
94-
"test": human_dataset[lang]["test"]
96+
"train": train_lang_data,
97+
"test": test_lang_data
9598
})
9699

97100
self.dataset = combined_dataset

0 commit comments

Comments
 (0)