From 28696812332e8f26c6db9c3a4c55add73cfc8c0b Mon Sep 17 00:00:00 2001
From: Julian Mosig von Aehrenfeld
Date: Thu, 3 Jul 2025 19:04:44 +0200
Subject: [PATCH 1/5] Create calculate_num_samples method in evaluation_tracker to count number of samples per task

---
 src/lighteval/logging/evaluation_tracker.py | 27 +++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 5a8f8553f..511f744c9 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import collections
 import json
 import logging
 import os
@@ -724,3 +725,29 @@ def push_to_tensorboard(  # noqa: C901
             f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
             f" at global_step {global_step}"
         )
+
+    def calculate_num_samples(self) -> dict[str, int]:
+        """
+        Counts the number of samples per task, including grouped tasks.
+        This implementation mirrors MetricsLogger.aggregate() so that the task subgroups match up.
+        """
+
+        # Count samples of individual tasks
+        num_samples = {task: len(samples) for task, samples in self.details_logger.details.items()}
+
+        # Count samples for task subgroups
+        grouped_tasks = collections.defaultdict(list)
+
+        for task_name in num_samples:
+            if "|" in task_name:
+                suite, task, fewshot = task_name.split("|")
+                grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(task_name)
+
+        for average_task, list_of_subtasks in grouped_tasks.items():
+            if len(list_of_subtasks) > 1:
+                num_samples[average_task] = sum(num_samples[k] for k in list_of_subtasks)
+
+        # Add the sample count over all tasks
+        num_samples["all"] = sum(count for task, count in num_samples.items() if task != "all")
+
+        return num_samples

From 39adf619b9b9419a1e7a7c1cb736d2e71bda9dd9 Mon Sep 17 00:00:00 2001
From: Julian Mosig von Aehrenfeld
Date: Thu, 3 Jul 2025 19:06:26 +0200
Subject: [PATCH 2/5] Add num_samples to final_dict of evaluation_tracker

---
 src/lighteval/logging/evaluation_tracker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 511f744c9..1295a666d 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -352,6 +352,7 @@ def generate_final_dict(self) -> dict:
             "config_tasks": self.task_config_logger.tasks_configs,
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
+            "num_samples": self.calculate_num_samples(),
         }
 
         final_dict = {

From dbaad99392787f43be5b3caf7f1635e0c0be5339 Mon Sep 17 00:00:00 2001
From: Julian Mosig von Aehrenfeld
Date: Thu, 3 Jul 2025 19:07:18 +0200
Subject: [PATCH 3/5] Add num_samples to results_dict in EvaluationTracker.save()

---
 src/lighteval/logging/evaluation_tracker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 1295a666d..67ffb43e3 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -229,6 +229,7 @@ def save(self) -> None:
             "config_tasks": self.task_config_logger.tasks_configs,
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
"num_samples": self.calculate_num_samples(), } # Create the details datasets for later upload From 80f9df81900a676a225881d657f5de313e70535b Mon Sep 17 00:00:00 2001 From: Julian Mosig von Aehrenfeld Date: Thu, 3 Jul 2025 19:12:35 +0200 Subject: [PATCH 4/5] Add num_samples to the markdown table printed by make_results_table() --- src/lighteval/utils/utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py index 28e0ac4a4..76ac18f5c 100644 --- a/src/lighteval/utils/utils.py +++ b/src/lighteval/utils/utils.py @@ -158,24 +158,29 @@ def flatten(item: list[Union[list, str]]) -> list[str]: def make_results_table(result_dict): """Generate table of results.""" md_writer = MarkdownTableWriter() - md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] + md_writer.headers = ["Task", "Version", "Number of Samples", "Metric", "Value", "", "Stderr"] values = [] + # For backwards compatibility, create empty dict if result_dict doesn't contain num_samples + num_samples_dict = result_dict["num_samples"] if "num_samples" in result_dict else {} + for k in sorted(result_dict["results"].keys()): dic = result_dict["results"][k] version = result_dict["versions"][k] if k in result_dict["versions"] else "" + num_samples = num_samples_dict[k] if k in num_samples_dict else "" for m, v in dic.items(): if m.endswith("_stderr"): continue if m + "_stderr" in dic: se = dic[m + "_stderr"] - values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) + values.append([k, version, num_samples, m, "%.4f" % v, "±", "%.4f" % se]) else: - values.append([k, version, m, "%.4f" % v, "", ""]) + values.append([k, version, num_samples, m, "%.4f" % v, "", ""]) k = "" version = "" + num_samples = "" md_writer.value_matrix = values return md_writer.dumps() From 04b3af361a9deb6ccf5681836887e1862dc1d540 Mon Sep 17 00:00:00 2001 From: Julian Mosig von Aehrenfeld Date: Thu, 3 Jul 2025 20:15:48 +0200 Subject: [PATCH 5/5] Add num_samples entry in example results.json in docs --- docs/source/saving-and-reading-results.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index 2a54aeaf4..bc2652ea1 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -215,6 +215,10 @@ The detail file contains the following columns: "padded": 0, "non_padded": 2, "num_truncated_few_shots": 0 + }, + "num_samples": { + "lighteval|gsm8k|0": 1, + "all": 1 } } ```