@@ -67,31 +67,34 @@ class MultilingualSentimentClassificationHumanSubset(AbsTaskClassification, Mult
6767
def load_data(self, **kwargs):
    """Load the human-annotated test subset plus the full original training data.

    Both datasets use a unified schema with a ``lang`` column, so each
    per-language config is produced by filtering on that column.  A config
    is created only for languages that actually have human test rows.

    Side effect: sets ``self.dataset`` to a dict mapping language code
    (``"eng"``, ``"ara"``, ``"nor"``, ``"rus"``) to a ``DatasetDict`` with
    ``train`` and ``test`` splits.
    """
    # Human evaluation subset (unified, with a `lang` column).
    human_dataset = load_dataset(
        self.metadata_dict["dataset"]["path"],
        revision=self.metadata_dict["dataset"]["revision"],
    )

    # Full original training data (unified, with a `lang` column).
    original_dataset = load_dataset("mteb/multilingual-sentiment-classification")

    train_data = original_dataset["train"]
    test_data = human_dataset["test"]

    combined_dataset = {}
    for lang in ["eng", "ara", "nor", "rus"]:
        # Filter the (small) human test split first so we can skip the more
        # expensive train-split filter for languages with no human rows.
        # `lang` is bound as a default argument so the predicate does not
        # depend on late-binding closure semantics of the loop variable.
        test_lang_data = test_data.filter(lambda x, lang=lang: x["lang"] == lang)
        if len(test_lang_data) == 0:
            # No human test data for this language -> no config.
            continue

        train_lang_data = train_data.filter(lambda x, lang=lang: x["lang"] == lang)
        combined_dataset[lang] = DatasetDict(
            {
                "train": train_lang_data,
                "test": test_lang_data,
            }
        )

    self.dataset = combined_dataset
0 commit comments