Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ def main():
judge_kwargs['model'] = 'llama31-8b'
elif listinstr(['VideoMMLU_QA', 'VideoMMLU_CAP'], dataset_name):
judge_kwargs['model'] = 'qwen-72b'
elif listinstr(['CAPTURE_real', 'CAPTURE_synthetic'], dataset_name):
judge_kwargs['model'] = 'llama31-8b'

if RANK == 0:
logger.info(judge_kwargs)
Expand Down
6 changes: 3 additions & 3 deletions vlmeval/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, VGRPBench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH, LogicVista, MME_CoT,
MMSci_Captioning, Physics_yale, TDBenchGrounding
MMSci_Captioning, Physics_yale, TDBenchGrounding, CAPTURE
)

from .image_ccocr import CCOCRDataset
Expand Down Expand Up @@ -158,8 +158,8 @@ def evaluate(self, eval_file, **judge_kwargs):
CreationMMBenchDataset, ImageShortQADataset, MMAlignBench, OmniDocBench,
VLM2Bench, VMCBenchDataset, EMMADataset, MME_CoT, MOAT, MedXpertQA_MM_test,
LEGO, MMSci_Captioning, Physics_yale, MMIFEval, Spatial457, VisuLogic, CVBench,
CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding

CMMU_MCQ, PathVQA_VAL, PathVQA_TEST, PathMMU_VAL, PathMMU_TEST, TDBench, TDBenchGrounding,
CAPTURE
]


Expand Down
47 changes: 47 additions & 0 deletions vlmeval/dataset/image_vqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1937,3 +1937,50 @@ def build_prompt(self, line):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
msgs.append(dict(type='text', value=question))
return msgs


class CAPTURE(ImageBaseDataset):
    """Dataset wrapper for the CAPTURE_real / CAPTURE_synthetic counting sets.

    Evaluation uses a judge model to pull the final number out of each
    free-form prediction and then scores those numbers with sMAPE.
    """

    TYPE = ''
    # Both splits are registered with an empty URL and no checksum.
    DATASET_URL = {'CAPTURE_real': '',
                   'CAPTURE_synthetic': ''}
    DATASET_MD5 = {'CAPTURE_real': None,
                   'CAPTURE_synthetic': None}

    def create_tsv_from_hf(self):
        """Placeholder hook; currently does nothing."""
        pass

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Extract numeric answers with a judge model and compute sMAPE.

        Args:
            eval_file: Path to the prediction file; it is read with ``load``
                and each row's ``prediction`` is sent to the judge.
            **judge_kwargs: Forwarded to ``build_judge``. ``model`` is
                required (it names the judge and is part of the record file
                name); ``nproc`` (default 4) is removed and used as the
                worker count.

        Returns:
            The score table produced by ``CAPTURE_smape``; it is also dumped
            next to ``eval_file`` as ``<stem>_score.csv``.

        Raises:
            KeyError: If ``model`` is missing from ``judge_kwargs``.
        """
        # NOTE: this is a classmethod, so the first argument is the class
        # itself even though it is named ``self``.
        from .utils.capture import CAPTURE_atomeval, CAPTURE_smape

        judge_name = judge_kwargs['model']
        # Split off only the trailing extension. The previous code kept the
        # leading dot in the suffix and then added another one, producing
        # names like ``x_judge..xlsx``, and used ``str.replace``, which
        # rewrites every occurrence of the suffix in the path.
        stem, suffix = osp.splitext(eval_file)
        record_file = f'{stem}_{judge_name}{suffix}'
        score_file = f'{stem}_score.csv'
        nproc = judge_kwargs.pop('nproc', 4)
        system_prompt = (
            "You are an answer extractor. When given someone's answer to "
            "some question, you will only extract their final number answer "
            "and will respond with just the number. If there is no exact "
            "number answer, respond with -1"
        )

        # The record file doubles as a cache: judge calls are skipped when
        # it already exists.
        if not osp.exists(record_file):
            data = load(eval_file)
            judge = build_judge(**judge_kwargs, system_prompt=system_prompt)
            tups = [(judge, data.iloc[i]) for i in range(len(data))]

            extracted_answers = track_progress_rich(
                CAPTURE_atomeval,
                tups,
                nproc=nproc,
                chunksize=nproc,
            )
            data['extracted_answer'] = extracted_answers
            dump(data, record_file)

        # Always score from the persisted record so cached and fresh runs
        # go through the same path.
        data = load(record_file)
        score = CAPTURE_smape(data)
        dump(score, score_file)
        return score
139 changes: 139 additions & 0 deletions vlmeval/dataset/utils/capture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from huggingface_hub import hf_hub_download
import zipfile
import os
import json
import tqdm
from ...smp import *


def create_csv_from_meta(meta_file, object_key, data_dir, out_file):
    """Build a tab-separated dataset file from a CAPTURE metadata JSON file.

    Args:
        meta_file: Path to a JSON file containing a sequence of entries.
            Each entry is read for ``"image_file"``, ``object_key`` and
            ``"ground_truth"``.
        object_key: Key in each entry whose value names the objects to count;
            it is interpolated into the question text.
        data_dir: Directory that each entry's ``image_file`` is joined onto.
        out_file: Destination path of the tab-separated output.

    The output has the columns ``image``, ``question``, ``answer`` and
    ``image_file`` plus an ``index`` column, with rows sorted by
    ``image_file``.
    """
    # Imported locally under a private alias: the module-level
    # ``import tqdm`` binds the module object, which is not callable, and
    # whether the star import from ``smp`` rebinds the name to the
    # progress-bar class cannot be determined from this file.
    from tqdm import tqdm as _progress_bar

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(meta_file, "r", encoding="utf-8") as fp:
        meta = json.load(fp)

    rows = []
    for entry in _progress_bar(meta):
        image_file = entry["image_file"]
        image = encode_image_file_to_base64(osp.join(data_dir, image_file))
        object_name = entry[object_key]
        question = (
            f"Count the exact number of {object_name} in the image. "
            f"Assume the pattern of {object_name} continues behind any "
            f"black box. Provide the total number of {object_name} as if "
            f"the black box were not there. Only count {object_name} that "
            f"are visible within the frame (or would be visible without "
            f"the occluding box). If {object_name} are partially in the "
            f"frame (i.e. if any part of {object_name} are visible), "
            f"count it. If the {object_name} would be partially in the "
            f"frame without the occluding box, count it."
        )
        rows.append(
            dict(
                image=image,
                question=question,
                answer=str(entry["ground_truth"]),
                image_file=image_file,
            )
        )

    # The written ``index`` column holds each row's position in the metadata
    # file (the frame is sorted without resetting its index).
    df = pd.DataFrame(rows).sort_values(by="image_file")
    df.to_csv(out_file, index=True, index_label="index", sep="\t")


def create_tsv_real():
    """Download the real CAPTURE split and write ``CAPTURE_real.tsv``.

    The image archive is extracted under ``<LMUDataRoot>/capture`` and its
    top-level ``dataset`` folder is renamed to ``real_dataset``. The
    metadata JSON is then converted with ``create_csv_from_meta``.

    Returns:
        Path of the generated TSV file inside the LMU data root.
    """
    data_root = LMUDataRoot()
    data_dir = osp.join(data_root, "capture")
    image_dir = osp.join(data_dir, "real_dataset")
    os.makedirs(data_root, exist_ok=True)

    # Only download/extract when the renamed folder is missing. Re-running
    # used to fail because ``os.rename`` cannot move ``dataset`` onto an
    # already populated ``real_dataset`` directory.
    if not osp.isdir(image_dir):
        real_zip = hf_hub_download(
            repo_id="atinp/CAPTURe",
            filename="real_dataset.zip",
            repo_type="dataset",
        )
        with zipfile.ZipFile(real_zip, "r") as zip_ref:
            zip_ref.extractall(data_dir)
        # The archive's top-level folder is called ``dataset``; rename it
        # so it is distinguishable from the synthetic split.
        os.rename(osp.join(data_dir, "dataset"), image_dir)

    real_meta = hf_hub_download(
        repo_id="atinp/CAPTURe",
        filename="real_metadata.json",
        repo_type="dataset",
    )
    out_file = os.path.join(data_root, "CAPTURE_real.tsv")
    create_csv_from_meta(real_meta, "object", image_dir, out_file)
    return out_file


def create_tsv_synthetic():
    """Download the synthetic CAPTURE split and write ``CAPTURE_synthetic.tsv``.

    The image archive is unpacked into ``<LMUDataRoot>/capture`` and the
    metadata JSON is converted with ``create_csv_from_meta`` using the
    ``dot_shape`` field as the object name.

    Returns:
        Path of the generated TSV file inside the LMU data root.
    """
    hub_repo = dict(repo_id="atinp/CAPTURe", repo_type="dataset")

    archive_path = hf_hub_download(filename="synthetic_dataset.zip", **hub_repo)

    root = LMUDataRoot()
    capture_dir = osp.join(root, "capture")
    os.makedirs(root, exist_ok=True)

    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(capture_dir)

    metadata_path = hf_hub_download(
        filename="synthetic_metadata.json", **hub_repo
    )
    tsv_path = os.path.join(root, "CAPTURE_synthetic.tsv")
    # Images are read from the ``synthetic_dataset`` folder inside the
    # extraction directory (presumably the archive's top-level folder).
    images_dir = f"{capture_dir}/synthetic_dataset"
    create_csv_from_meta(metadata_path, "dot_shape", images_dir, tsv_path)
    return tsv_path


def safe_string_to_int(s):
    """Convert ``s`` to ``int``, returning ``-1`` when that is impossible.

    Args:
        s: Value to convert, normally the judge model's reply string.

    Returns:
        ``int(s)`` on success, otherwise the sentinel ``-1``.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers ``None`` and other non-numeric objects, and
        # OverflowError covers infinite floats; both used to escape and
        # abort the whole evaluation instead of yielding the sentinel.
        return -1


def CAPTURE_atomeval(model, line):
    """Have the judge model reduce one prediction to an integer.

    Args:
        model: Judge object exposing ``generate_str``.
        line: Record whose ``"prediction"`` entry is sent to the judge.

    Returns:
        The judge's reply converted by ``safe_string_to_int``.
    """
    prediction_text = line["prediction"]
    judge_reply = model.generate_str(prediction_text)
    return safe_string_to_int(judge_reply)


def CAPTURE_smape(data):
    """Score extracted counts with symmetric mean absolute percentage error.

    Args:
        data: DataFrame with an ``answer`` column (ground-truth count,
            convertible with ``int``) and an ``extracted_answer`` column
            (numeric count, where ``-1`` means no number was extracted).

    Returns:
        One-row DataFrame with columns ``SMAPE`` (mean per-row error in
        percent, ``0`` when there are no rows) and ``skip`` (number of rows
        whose extracted answer was ``-1``).
    """
    total_percentage_error = 0
    count = 0
    skip = 0

    for _, row in data.iterrows():
        ground_truth = int(row["answer"])
        answer = row["extracted_answer"]
        count += 1

        if answer == -1:
            # No usable number: counted as the maximum error of 100%.
            skip += 1
            total_percentage_error += 100
            continue

        denominator = abs(answer) + abs(ground_truth)
        if denominator == 0:
            # Prediction and ground truth are both 0: an exact match, so
            # the error is 0%. Dividing here would raise or produce NaN.
            continue

        # sMAPE term: |pred - gt| / (|pred| + |gt|), as a percentage.
        total_percentage_error += abs(answer - ground_truth) / denominator * 100

    # Mean of the per-row sMAPE terms.
    smape = total_percentage_error / count if count != 0 else 0
    return pd.DataFrame([dict(SMAPE=smape, skip=skip)])


if __name__ == "__main__":
    # Script entry point: build the real split first, then the synthetic one.
    for build_split in (create_tsv_real, create_tsv_synthetic):
        build_split()