diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
index 2a54aeaf4..bc2652ea1 100644
--- a/docs/source/saving-and-reading-results.mdx
+++ b/docs/source/saving-and-reading-results.mdx
@@ -215,6 +215,10 @@ The detail file contains the following columns:
     "padded": 0,
     "non_padded": 2,
     "num_truncated_few_shots": 0
+  },
+  "num_samples": {
+    "lighteval|gsm8k|0": 1,
+    "all": 1
   }
 }
 ```
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 5a8f8553f..67ffb43e3 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import collections
 import json
 import logging
 import os
@@ -228,6 +229,7 @@ def save(self) -> None:
             "config_tasks": self.task_config_logger.tasks_configs,
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
+            "num_samples": self.calculate_num_samples(),
         }
 
         # Create the details datasets for later upload
@@ -351,6 +353,7 @@ def generate_final_dict(self) -> dict:
             "config_tasks": self.task_config_logger.tasks_configs,
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
+            "num_samples": self.calculate_num_samples(),
         }
 
         final_dict = {
@@ -724,3 +727,41 @@ def push_to_tensorboard( # noqa: C901
             f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
             f" at global_step {global_step}"
         )
+
+    def calculate_num_samples(self) -> dict[str, int]:
+        """
+        Counts the number of samples per task, including grouped tasks.
+        This implementation mirrors MetricsLogger.aggregate(), to make sure the task subgroups match up.
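+
+        Example (illustrative; task names and sample lists are hypothetical):
+            self.details_logger.details = {
+                "lighteval|mmlu:anatomy|5": [sample_1, sample_2],
+                "lighteval|mmlu:astronomy|5": [sample_3],
+            }
+            -> {
+                "lighteval|mmlu:anatomy|5": 2,
+                "lighteval|mmlu:astronomy|5": 1,
+                "lighteval|mmlu:_average|5": 3,
+                "all": 3,
+            }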
+        """
+
+        # Count samples of individual tasks
+        num_samples = {task: len(samples) for task, samples in self.details_logger.details.items()}
+
+        # Count samples for task subgroups
+        grouped_tasks = collections.defaultdict(list)
+
+        for task in num_samples:
+            if "|" in task:
+                suite, task_name, fewshot = task.split("|")
+                grouped_tasks[f"{suite}|{task_name.split(':')[0]}:_average|{fewshot}"].append(task)
+
+        for average_task, list_of_subtasks in grouped_tasks.items():
+            if len(list_of_subtasks) > 1:
+                num_samples[average_task] = sum(num_samples[k] for k in list_of_subtasks)
+
+        # Add the sample count over all tasks (":_average" groups are excluded to avoid double counting)
+        num_samples["all"] = sum(count for task, count in num_samples.items() if task != "all" and ":_average|" not in task)
+
+        return num_samples
diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py
index 28e0ac4a4..76ac18f5c 100644
--- a/src/lighteval/utils/utils.py
+++ b/src/lighteval/utils/utils.py
@@ -158,24 +158,29 @@ def flatten(item: list[Union[list, str]]) -> list[str]:
 
 def make_results_table(result_dict):
     """Generate table of results."""
     md_writer = MarkdownTableWriter()
-    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+    md_writer.headers = ["Task", "Version", "Number of Samples", "Metric", "Value", "", "Stderr"]
 
     values = []
 
+    # For backwards compatibility, fall back to an empty dict if result_dict has no "num_samples"
+    num_samples_dict = result_dict.get("num_samples", {})
+
     for k in sorted(result_dict["results"].keys()):
         dic = result_dict["results"][k]
         version = result_dict["versions"][k] if k in result_dict["versions"] else ""
+        num_samples = num_samples_dict.get(k, "")
         for m, v in dic.items():
             if m.endswith("_stderr"):
                 continue
             if m + "_stderr" in dic:
                 se = dic[m + "_stderr"]
-                values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
+                values.append([k, version, num_samples, m, "%.4f" % v, "±", "%.4f" % se])
             else:
-                values.append([k, version, m, "%.4f" % v, "", ""])
+                values.append([k, version, num_samples, m, "%.4f" % v, "", ""])
             k = ""
             version = ""
+            num_samples = ""
 
     md_writer.value_matrix = values
     return md_writer.dumps()
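
As a quick sanity check of the new column, here is a minimal sketch (the `result_dict` below is hand-built and purely illustrative; `qem` stands in for whatever metric the task actually reports):

```python
# Hand-built result_dict -- values are illustrative, not real evaluation output.
from lighteval.utils.utils import make_results_table

result_dict = {
    "results": {"lighteval|gsm8k|0": {"qem": 0.42, "qem_stderr": 0.05}},
    "versions": {"lighteval|gsm8k|0": 0},
    "num_samples": {"lighteval|gsm8k|0": 1, "all": 1},
}

# Prints a markdown table with the new "Number of Samples" column; a
# result_dict without "num_samples" renders an empty cell instead.
print(make_results_table(result_dict))
```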