From 5b8917890ed74f4e06b8f0a09cee9fe0f3953bad Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Tue, 9 Sep 2025 17:01:20 -0700
Subject: [PATCH 1/3] .

---
 community_tasks/chart_qa_evals.py | 62 +++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 community_tasks/chart_qa_evals.py

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
new file mode 100644
index 000000000..b94b00f2d
--- /dev/null
+++ b/community_tasks/chart_qa_evals.py
@@ -0,0 +1,62 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# ruff: noqa: F405, F403, F401
+"""
+Task to evaluate VLMs on HuggingFaceM4/ChartQA.
+"""
+
+import numpy as np
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query="Answer the following question based on the chart. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question. Think step by step before answering.",
+        gold_index=0,
+        choices=[line["label"]],
+        images=[line["image"]],
+    )
+
+
+task = LightevalTaskConfig(
+    name="chart_qa",
+    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[Metrics.exact_match],  # select your metric in Metrics
+)
+
+TASKS_TABLE = [task]
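A note on the metric in patch 1: Metrics.exact_match scores a generation as correct only when it matches the gold label verbatim, yet the prompt asks the model to think step by step and to end with an "Answer: $ANSWER" line, so raw generations will essentially never equal the bare label (patch 2 switches to an extractive metric, and also fixes the query to actually include the dataset question). A minimal standalone sketch of the difference, in plain Python; extract_answer is a hypothetical helper, not lighteval's implementation:

import re


def extract_answer(generation: str) -> str:
    # Hypothetical helper: pull $ANSWER out of the trailing
    # "Answer: $ANSWER" line; fall back to the raw text.
    match = re.search(r"^Answer:\s*(.+?)\s*$", generation, re.MULTILINE)
    return match.group(1) if match else generation.strip()


gold = "42"
generation = "The tallest bar reads 42.\nAnswer: 42"
print(generation == gold)                  # False: reasoning text defeats exact match
print(extract_answer(generation) == gold)  # True once the answer is extracted
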
From 19048afaf8ea9f74254a1e8e016057b618983750 Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Wed, 10 Sep 2025 18:00:44 -0700
Subject: [PATCH 2/3] .

---
 community_tasks/chart_qa_evals.py | 50 +++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
index b94b00f2d..183c58cb9 100644
--- a/community_tasks/chart_qa_evals.py
+++ b/community_tasks/chart_qa_evals.py
@@ -23,13 +23,23 @@
 # ruff: noqa: F405, F403, F401
 """
 Task to evaluate VLMs on HuggingFaceM4/ChartQA.
+
+Example evaluation:
+lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
 """
 
 import numpy as np
 
-from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
+from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.utils.extractive_match_utils import (
+    ExprExtractionConfig,
+    LatexExtractionConfig,
+)
+from lighteval.metrics.utils.metric_utils import SamplingMethod
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
+from lighteval.utils.language import Language
 
 
 def prompt_fn(line, task_name: str = None):
@@ -39,24 +49,54 @@ def prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query="Answer the following question based on the chart. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question. Think step by step before answering.",
+        query="Answer the following question. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question.\n\n"
+        + line["query"],
         gold_index=0,
         choices=[line["label"]],
         images=[line["image"]],
     )
 
 
+extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
+metric = SampleLevelMetric(
+    metric_name="extractive_match",
+    sample_level_fn=MultilingualExtractiveMatchMetric(
+        language=Language.ENGLISH,
+        gold_extraction_target=extraction_targets,
+        pred_extraction_target=extraction_targets,
+        precision=6,
+    ),
+    category=SamplingMethod.GENERATIVE,
+    corpus_level_fn=np.mean,
+    higher_is_better=True,
+)
+
 task = LightevalTaskConfig(
     name="chart_qa",
     prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
-    hf_avail_splits=["test"],
+    hf_avail_splits=["train", "val", "test"],
+    evaluation_splits=["test"],
+    hf_filter=lambda line: line["human_or_machine"] == 0,
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[metric],  # select your metric in Metrics
+)
+
+human_task = LightevalTaskConfig(
+    name="chart_qa:human",
+    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
+    hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[Metrics.exact_match],  # select your metric in Metrics
+    metrics=[metric],  # select your metric in Metrics
 )
 
-TASKS_TABLE = [task]
+TASKS_TABLE = [task, human_task]
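A note on the metric in patch 2: the extractive match parses a numeric or LaTeX answer out of both the gold label and the generation and compares the parsed values, with precision controlling how strictly numbers must agree (patch 3 below relaxes it from 6 to 4). A rough standalone illustration of significant-figure comparison, assuming that is what precision governs here; this is a sketch, not lighteval's implementation:

from math import floor, log10


def sig_round(x: float, sig: int) -> float:
    # Round x to `sig` significant figures.
    if x == 0:
        return 0.0
    return round(x, sig - 1 - floor(log10(abs(x))))


def numbers_match(pred: float, gold: float, precision: int) -> bool:
    # Treat two values as equal when they agree after rounding
    # to `precision` significant figures.
    return sig_round(pred, precision) == sig_round(gold, precision)


print(numbers_match(33.3333, 100 / 3, precision=4))  # True: both round to 33.33
print(numbers_match(33.3, 100 / 3, precision=4))     # False: 33.3 vs 33.33
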
From 2860bf38aa78f8cbe1160d9ed602820c59808e73 Mon Sep 17 00:00:00 2001
From: Junhao Li
Date: Wed, 10 Sep 2025 18:10:34 -0700
Subject: [PATCH 3/3] .

---
 community_tasks/chart_qa_evals.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
index 183c58cb9..ebe968551 100644
--- a/community_tasks/chart_qa_evals.py
+++ b/community_tasks/chart_qa_evals.py
@@ -24,14 +24,14 @@
 """
 Task to evaluate VLMs on HuggingFaceM4/ChartQA.
 
-Example evaluation:
-lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
+Example usage:
+lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa:human|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
 """
 
 import numpy as np
 
 from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.metrics import SampleLevelMetric
 from lighteval.metrics.utils.extractive_match_utils import (
     ExprExtractionConfig,
     LatexExtractionConfig,
 )
@@ -43,10 +43,6 @@
 
 
 def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
     return Doc(
         task_name=task_name,
         query="Answer the following question. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question.\n\n"
@@ -64,7 +60,7 @@
         language=Language.ENGLISH,
         gold_extraction_target=extraction_targets,
         pred_extraction_target=extraction_targets,
-        precision=6,
+        precision=4,
     ),
     category=SamplingMethod.GENERATIVE,
     corpus_level_fn=np.mean,
@@ -73,21 +69,20 @@
 
 task = LightevalTaskConfig(
     name="chart_qa",
-    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    prompt_function=prompt_fn,
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
     hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
-    hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[metric],  # select your metric in Metrics
+    metrics=[metric],
 )
 
 human_task = LightevalTaskConfig(
     name="chart_qa:human",
-    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    prompt_function=prompt_fn,
     suite=["community"],
     hf_repo="HuggingFaceM4/ChartQA",
     hf_subset="default",
     hf_avail_splits=["train", "val", "test"],
     evaluation_splits=["test"],
     hf_filter=lambda line: line["human_or_machine"] == 0,
     few_shots_split=None,
     few_shots_select=None,
-    metrics=[metric],  # select your metric in Metrics
+    metrics=[metric],
 )
 
 TASKS_TABLE = [task, human_task]
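After the series, the module defines two tasks: community|chart_qa evaluates the full ChartQA test split, while community|chart_qa:human keeps only human-authored questions via hf_filter (human_or_machine == 0; machine-generated questions presumably carry 1). A quick way to sanity-check what that filter keeps, assuming the column layout the task configs rely on (query, label, image, human_or_machine):

from datasets import load_dataset

# Load the split the tasks evaluate on and apply the same
# filter chart_qa:human uses, to compare subset sizes.
ds = load_dataset("HuggingFaceM4/ChartQA", split="test")
human = ds.filter(lambda line: line["human_or_machine"] == 0)
print(len(ds), len(human))  # all questions vs. the human-authored subset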