From 5b8917890ed74f4e06b8f0a09cee9fe0f3953bad Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Tue, 9 Sep 2025 17:01:20 -0700
Subject: [PATCH 1/3] .

---
 community_tasks/chart_qa_evals.py | 62 +++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 community_tasks/chart_qa_evals.py

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
new file mode 100644
index 000000000..b94b00f2d
--- /dev/null
+++ b/community_tasks/chart_qa_evals.py
@@ -0,0 +1,62 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# ruff: noqa: F405, F403, F401
+"""
+Task to evaluate VLMs on HuggingFaceM4/ChartQA.
+"""
+
+import numpy as np
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query="Answer the following question based on the chart. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question. Think step by step before answering.",
+        gold_index=0,
+        choices=[line["label"]],
+        images=[line["image"]],
+    )
+
+
+task = LightevalTaskConfig(
+    name="chart_qa",
+    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[Metrics.exact_match],  # select your metric in Metrics
+)
+
+TASKS_TABLE = [task]
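A note on the metric in patch 1: Metrics.exact_match scores a generation as correct only when it matches the gold label verbatim, yet the prompt asks the model to think step by step and to end with an "Answer: $ANSWER" line, so raw generations will essentially never equal the bare label (patch 2 switches to an extractive metric, and also fixes the query to actually include the dataset question). A minimal standalone sketch of the difference, in plain Python; extract_answer is a hypothetical helper, not lighteval's implementation:

import re


def extract_answer(generation: str) -> str:
    # Hypothetical helper: pull $ANSWER out of the trailing
    # "Answer: $ANSWER" line; fall back to the raw text.
    match = re.search(r"^Answer:\s*(.+?)\s*$", generation, re.MULTILINE)
    return match.group(1) if match else generation.strip()


gold = "42"
generation = "The tallest bar reads 42.\nAnswer: 42"
print(generation == gold)                  # False: reasoning text defeats exact match
print(extract_answer(generation) == gold)  # True once the answer is extracted
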
From 19048afaf8ea9f74254a1e8e016057b618983750 Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Wed, 10 Sep 2025 18:00:44 -0700
Subject: [PATCH 2/3] .

---
 community_tasks/chart_qa_evals.py | 50 +++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
index b94b00f2d..183c58cb9 100644
--- a/community_tasks/chart_qa_evals.py
+++ b/community_tasks/chart_qa_evals.py
@@ -23,13 +23,23 @@
 # ruff: noqa: F405, F403, F401
 """
 Task to evaluate VLMs on HuggingFaceM4/ChartQA.
+
+Example evaluation:
+lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
 """
 
 import numpy as np
 
-from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
+from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.utils.extractive_match_utils import (
+    ExprExtractionConfig,
+    LatexExtractionConfig,
+)
+from lighteval.metrics.utils.metric_utils import SamplingMethod
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
+from lighteval.utils.language import Language
 
 
 def prompt_fn(line, task_name: str = None):
@@ -39,24 +49,54 @@ def prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query="Answer the following question based on the chart. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question. Think step by step before answering.",
+        query="Answer the following question. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question.\n\n"
+        + line["query"],
         gold_index=0,
         choices=[line["label"]],
         images=[line["image"]],
     )
 
 
+extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
+metric = SampleLevelMetric(
+    metric_name="extractive_match",
+    sample_level_fn=MultilingualExtractiveMatchMetric(
+        language=Language.ENGLISH,
+        gold_extraction_target=extraction_targets,
+        pred_extraction_target=extraction_targets,
+        precision=6,
+    ),
+    category=SamplingMethod.GENERATIVE,
+    corpus_level_fn=np.mean,
+    higher_is_better=True,
+)
+
 task = LightevalTaskConfig(
     name="chart_qa",
     prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
-    hf_avail_splits=["test"],
+    hf_avail_splits=["train", "val", "test"],
+    evaluation_splits=["test"],
+    hf_filter=lambda line: line["human_or_machine"] == 0,
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[metric],  # select your metric in Metrics
+)
+
+human_task = LightevalTaskConfig(
+    name="chart_qa:human",
+    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
+    hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[Metrics.exact_match],  # select your metric in Metrics
+    metrics=[metric],  # select your metric in Metrics
 )
 
-TASKS_TABLE = [task]
+TASKS_TABLE = [task, human_task]
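A note on the metric in patch 2: the extractive match parses a numeric or LaTeX answer out of both the gold label and the generation and compares the parsed values, with precision controlling how strictly numbers must agree (patch 3 below relaxes it from 6 to 4). A rough standalone illustration of significant-figure comparison, assuming that is what precision governs here; this is a sketch, not lighteval's implementation:

from math import floor, log10


def sig_round(x: float, sig: int) -> float:
    # Round x to `sig` significant figures.
    if x == 0:
        return 0.0
    return round(x, sig - 1 - floor(log10(abs(x))))


def numbers_match(pred: float, gold: float, precision: int) -> bool:
    # Treat two values as equal when they agree after rounding
    # to `precision` significant figures.
    return sig_round(pred, precision) == sig_round(gold, precision)


print(numbers_match(33.3333, 100 / 3, precision=4))  # True: both round to 33.33
print(numbers_match(33.3, 100 / 3, precision=4))     # False: 33.3 vs 33.33
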
From 2860bf38aa78f8cbe1160d9ed602820c59808e73 Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Wed, 10 Sep 2025 18:10:34 -0700
Subject: [PATCH 3/3] .

---
 community_tasks/chart_qa_evals.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
index 183c58cb9..ebe968551 100644
--- a/community_tasks/chart_qa_evals.py
+++ b/community_tasks/chart_qa_evals.py
@@ -24,14 +24,14 @@
 """
 Task to evaluate VLMs on HuggingFaceM4/ChartQA.
 
-Example evaluation:
-lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
+Example usage:
+lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa:human|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
 """
 
 import numpy as np
 
 from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.metrics import SampleLevelMetric
 from lighteval.metrics.utils.extractive_match_utils import (
     ExprExtractionConfig,
     LatexExtractionConfig,
 )
@@ -43,10 +43,6 @@
 
 
 def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
     return Doc(
         task_name=task_name,
         query="Answer the following question. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question.\n\n"
@@ -64,7 +60,7 @@
         language=Language.ENGLISH,
         gold_extraction_target=extraction_targets,
         pred_extraction_target=extraction_targets,
-        precision=6,
+        precision=4,
     ),
     category=SamplingMethod.GENERATIVE,
     corpus_level_fn=np.mean,
@@ -73,21 +69,20 @@
 
 task = LightevalTaskConfig(
     name="chart_qa",
-    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    prompt_function=prompt_fn,
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
     hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
-    hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[metric],  # select your metric in Metrics
+    metrics=[metric],
 )
 
 human_task = LightevalTaskConfig(
     name="chart_qa:human",
-    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    prompt_function=prompt_fn,
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
     hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
     hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[metric],  # select your metric in Metrics
+    metrics=[metric],
 )
 
 TASKS_TABLE = [task, human_task]
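After the series, the module defines two tasks: community|chart_qa evaluates the full ChartQA test split, while community|chart_qa:human keeps only human-authored questions via hf_filter (human_or_machine == 0; machine-generated questions presumably carry 1). A quick way to sanity-check what that filter keeps, assuming the column layout the task configs rely on (query, label, image, human_or_machine):

from datasets import load_dataset

# Load the split the tasks evaluate on and apply the same
# filter chart_qa:human uses, to compare subset sizes.
ds = load_dataset("HuggingFaceM4/ChartQA", split="test")
human = ds.filter(lambda line: line["human_or_machine"] == 0)
print(len(ds), len(human))  # all questions vs. the human-authored subset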