open-compass · bodsul · May 1, 2025 · May 1, 2025 · May 2, 2025
diff --git a/run.py b/run.py
@@ -393,6 +393,8 @@ def main():
                         judge_kwargs['model'] = 'llama31-8b'
                     elif listinstr(['VideoMMLU_QA', 'VideoMMLU_CAP'], dataset_name):
                         judge_kwargs['model'] = 'qwen-72b'
+                    elif listinstr(['CAPTURE_real', 'CAPTURE_synthetic'], dataset_name):
+                        judge_kwargs['model'] = 'llama31-8b'
 
                 if RANK == 0:
                     logger.info(judge_kwargs)

diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
@@ -12,7 +12,7 @@
 from .image_vqa import (
     ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, VGRPBench, MMVet, MTVQADataset, TableVQABench,
     CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH, LogicVista, MME_CoT,
-    MMSci_Captioning, Physics_yale, TDBenchGrounding
+    MMSci_Captioning, Physics_yale, TDBenchGrounding, CAPTURE
 )
 
 from .image_ccocr import CCOCRDataset
@@ -158,8 +158,8 @@ def evaluate(self, eval_file, **judge_kwargs):
     CreationMMBenchDataset, ImageShortQADataset, MMAlignBench, OmniDocBench,
     VLM2Bench, VMCBenchDataset, EMMADataset, MME_CoT, MOAT, MedXpertQA_MM_test,
     LEGO, MMSci_Captioning, Physics_yale, MMIFEval, Spatial457, VisuLogic, CVBench,
-    CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding
-
+    CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding,
+    CAPTURE
 ]
 
 

diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
@@ -1937,3 +1937,50 @@ def build_prompt(self, line):
         msgs.extend([dict(type='image', value=p) for p in tgt_path])
         msgs.append(dict(type='text', value=question))
         return msgs
+
+
+class CAPTURE(ImageBaseDataset):
+    """CAPTURE datasets (``CAPTURE_real`` / ``CAPTURE_synthetic``), scored via ``CAPTURE_smape``."""
+    TYPE = ''
+    DATASET_URL = {'CAPTURE_real': '',
+                   'CAPTURE_synthetic': ''}
+    DATASET_MD5 = {'CAPTURE_real': None,
+                   'CAPTURE_synthetic': None}
+
+    def create_tsv_from_hf(self):
+        """No-op placeholder."""
+        pass
+
+    @classmethod
+    def evaluate(self, eval_file, **judge_kwargs):
+        """Extract a number from every prediction with a judge model, then score.
+
+        Extracted answers are cached in ``<stem>_<judge model><ext>`` and reused if
+        that file exists. Returns the ``CAPTURE_smape`` table, also dumped to ``<stem>_score.csv``.
+        """
+        from .utils.capture import CAPTURE_atomeval, CAPTURE_smape
+
+        judge_name = judge_kwargs['model']
+        # splitext keeps the extension's leading dot, so the record name gets no '..'.
+        stem, suffix = osp.splitext(eval_file)
+        record_file = f'{stem}_{judge_name}{suffix}'
+        score_file = f'{stem}_score.csv'
+        nproc = judge_kwargs.pop('nproc', 4)
+        system_prompt = (
+            "You are an answer extractor. When given someone's answer to "
+            "some question, you will only extract their final number answer "
+            "and will respond with just the number. If there is no exact "
+            "number answer, respond with -1"
+        )
+        if not osp.exists(record_file):
+            data = load(eval_file)
+            judge = build_judge(**judge_kwargs, system_prompt=system_prompt)
+            tups = [(judge, data.iloc[i]) for i in range(len(data))]
+            data['extracted_answer'] = track_progress_rich(
+                CAPTURE_atomeval, tups, nproc=nproc, chunksize=nproc)
+            dump(data, record_file)
+
+        # Score from the record file so cached and fresh runs take the same path.
+        score = CAPTURE_smape(load(record_file))
+        dump(score, score_file)
+        return score
diff --git a/vlmeval/dataset/utils/capture.py b/vlmeval/dataset/utils/capture.py
@@ -0,0 +1,139 @@
+from huggingface_hub import hf_hub_download
+import zipfile
+import os
+import json
+import tqdm
+from ...smp import *
+
+
+def create_csv_from_meta(meta_file, object_key, data_dir, out_file):
+    """Build a tab-separated dataset file from a CAPTURe metadata JSON file.
+
+    Each entry's image (``data_dir``/``image_file``) is encoded with
+    ``encode_image_file_to_base64`` and paired with a counting question about
+    ``entry[object_key]`` and the stringified ``ground_truth``; rows are sorted by ``image_file``.
+    """
+    with open(meta_file, "r") as fp:
+        meta = json.load(fp)
+
+    data = []
+    for entry in tqdm(meta):
+        image_file = entry["image_file"]
+        image = encode_image_file_to_base64(osp.join(data_dir, image_file))
+        object_name = entry[object_key]
+        question = (
+            f"Count the exact number of {object_name} in the image. "
+            f"Assume the pattern of {object_name} continues behind any "
+            f"black box. Provide the total number of {object_name} as if "
+            f"the black box were not there. Only count {object_name} that "
+            f"are visible within the frame (or would be visible without "
+            f"the occluding box). If {object_name} are partially in the "
+            f"frame (i.e. if any part of {object_name} are visible), "
+            f"count it. If the {object_name} would be partially in the "
+            f"frame without the occluding box, count it."
+        )
+        answer = str(entry["ground_truth"])
+        data.append(dict(image=image, question=question, answer=answer, image_file=image_file))
+    # Write once after the loop; doing it per entry rewrote the whole file each time.
+    columns = ["image", "question", "answer", "image_file"]  # explicit so an empty meta still sorts
+    df = pd.DataFrame(data, columns=columns).sort_values(by="image_file")
+    df.to_csv(out_file, index=True, index_label="index", sep="\t")
+
+
+def create_tsv_real():
+    """Download the real CAPTURe split and write ``CAPTURE_real.tsv``.
+
+    Images are unpacked to ``<LMUDataRoot>/capture/real_dataset``; the TSV is
+    written to ``<LMUDataRoot>/CAPTURE_real.tsv`` and its path is returned.
+    Extraction is skipped when ``real_dataset`` is already present.
+    """
+    data_root = LMUDataRoot()
+    data_dir = osp.join(data_root, "capture")
+    real_dir = osp.join(data_dir, "real_dataset")
+    # Only unpack once: os.rename raises if real_dataset already exists with content.
+    if not osp.isdir(real_dir):
+        os.makedirs(data_dir, exist_ok=True)
+        real_zip = hf_hub_download(
+            repo_id="atinp/CAPTURe", filename="real_dataset.zip", repo_type="dataset")
+        with zipfile.ZipFile(real_zip, "r") as zip_ref:
+            zip_ref.extractall(data_dir)
+        # rename the extracted folder (originally called dataset) to real_dataset
+        os.rename(osp.join(data_dir, "dataset"), real_dir)
+
+    real_meta = hf_hub_download(
+        repo_id="atinp/CAPTURe", filename="real_metadata.json", repo_type="dataset")
+    out_file = os.path.join(data_root, "CAPTURE_real.tsv")
+    create_csv_from_meta(real_meta, "object", real_dir, out_file)
+    return out_file
+
+
+def create_tsv_synthetic():
+    """Download the synthetic CAPTURe split and write ``CAPTURE_synthetic.tsv``.
+
+    The archive is extracted into ``<LMUDataRoot>/capture`` and the TSV is
+    written to ``<LMUDataRoot>/CAPTURE_synthetic.tsv``, whose path is returned.
+    """
+    # Both downloads come from the same Hugging Face dataset repo.
+    repo = dict(repo_id="atinp/CAPTURe", repo_type="dataset")
+    archive = hf_hub_download(filename="synthetic_dataset.zip", **repo)
+    root = LMUDataRoot()
+    extract_dir = osp.join(root, "capture")
+    os.makedirs(root, exist_ok=True)
+
+    # Unpack the archive under <root>/capture.
+    with zipfile.ZipFile(archive, "r") as zf:
+        zf.extractall(extract_dir)
+
+    meta_path = hf_hub_download(filename="synthetic_metadata.json", **repo)
+    tsv_path = os.path.join(root, "CAPTURE_synthetic.tsv")
+    image_dir = f"{extract_dir}/synthetic_dataset"
+    # "dot_shape" is the metadata key whose value names the object to count.
+    create_csv_from_meta(meta_path, "dot_shape", image_dir, tsv_path)
+    return tsv_path
+
+
+def safe_string_to_int(s):  # returns int(s), or -1 when s cannot be converted
+    try:
+        return int(s)
+    except (TypeError, ValueError):  # TypeError: non-string input such as None
+        return -1
+
+
+def CAPTURE_atomeval(model, line):
+    """Run ``model.generate_str`` on ``line["prediction"]``; return the reply as an int (-1 if invalid)."""
+    return safe_string_to_int(model.generate_str(line["prediction"]))
+
+
+def CAPTURE_smape(data):
+    """Score extracted counts with the mean symmetric absolute percentage error.
+
+    ``data`` needs an ``answer`` column (ground truth, int-convertible) and an
+    ``extracted_answer`` column. An extracted answer of -1 marks a failed
+    extraction: it is counted in ``skip`` and charged the maximum error of 100.
+
+    Returns a one-row DataFrame with columns ``SMAPE`` and ``skip``.
+    """
+    total_percentage_error = 0
+    skip = 0
+
+    for truth, answer in zip(data["answer"], data["extracted_answer"]):
+        ground_truth = int(truth)
+        if answer == -1:
+            skip += 1
+            total_percentage_error += 100
+            continue
+
+        # sMAPE term: |a - g| / (|a| + |g|) * 100.
+        denominator = abs(answer) + abs(ground_truth)
+        # Both values are 0 only for an exact match on zero: no error, and no 0/0.
+        if denominator != 0:
+            total_percentage_error += abs(answer - ground_truth) / denominator * 100
+
+    count = len(data)  # every row, skipped or not, contributes to the mean
+    smape = total_percentage_error / count if count != 0 else 0
+    return pd.DataFrame([dict(SMAPE=smape, skip=skip)])
+
+
+if __name__ == "__main__":
+    for build_tsv in (create_tsv_real, create_tsv_synthetic):  # real split first, then synthetic
+        build_tsv()