diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..6e9f68dae
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,9 @@
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.tif filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+*.svg filter=lfs diff=lfs merge=lfs -text
diff --git a/src/rai_bench/README.md b/src/rai_bench/README.md
index 739a228ea..6373a3b0c 100644
--- a/src/rai_bench/README.md
+++ b/src/rai_bench/README.md
@@ -172,6 +172,9 @@ The VLM Benchmark is a benchmark for VLM models. It includes a set of tasks cont
 
 To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/setup/tracing.md) document.
 
+> [!IMPORTANT]
+> If you are going to use Langfuse for tracing results, run `export LANGFUSE_MAX_EVENT_SIZE_BYTES=20000000`. By default, the maximum tracing event size is 1MB, but some tasks exceed this limit.
+
 To run the benchmark:
 
 ```bash
diff --git a/src/rai_bench/rai_bench/__init__.py b/src/rai_bench/rai_bench/__init__.py
index 9ca566649..e5b2f0cdd 100644
--- a/src/rai_bench/rai_bench/__init__.py
+++ b/src/rai_bench/rai_bench/__init__.py
@@ -14,6 +14,7 @@
 from .test_models import (
     ManipulationO3DEBenchmarkConfig,
     ToolCallingAgentBenchmarkConfig,
+    VLMBenchmarkConfig,
     test_dual_agents,
     test_models,
 )
@@ -28,6 +29,7 @@
 __all__ = [
     "ManipulationO3DEBenchmarkConfig",
     "ToolCallingAgentBenchmarkConfig",
+    "VLMBenchmarkConfig",
     "define_benchmark_logger",
     "get_llm_for_benchmark",
     "parse_manipulation_o3de_benchmark_args",
diff --git a/src/rai_bench/rai_bench/base_benchmark.py b/src/rai_bench/rai_bench/base_benchmark.py
index d8024715b..fda87ae59 100644
--- a/src/rai_bench/rai_bench/base_benchmark.py
+++ b/src/rai_bench/rai_bench/base_benchmark.py
@@ -35,6 +35,54 @@ class RunSummary(BaseModel):
     total_tasks: int = Field(..., description="Total number of executed tasks.")
 
 
+class ModelSummary(BaseModel):
+    model_name: str = Field(..., description="Name of the LLM.")
+    avg_success_rate: float = Field(
+        ...,
+        description="Percentage of successfully completed tasks across all repeats.",
+    )
+    std_success_rate: float = Field(
+        ...,
+        description="Standard deviation of success rate across all repeats for this model.",
+    )
+    avg_total_tasks: float = Field(
+        ..., description="Average number of tasks executed across all repeats."
+    )
+    avg_time: float = Field(
+        ..., description="Average time taken across all tasks and repeats."
+    )
+    std_time: float = Field(
+        ...,
+        description="Standard deviation of time taken across all repeats for this model.",
+    )
+
+    repeats: int = Field(
+        ..., description="Total number of repeats for the model for each task."
+    )
+
+
+class TasksSummary(BaseModel):
+    model_name: str = Field(..., description="Name of the LLM.")
+    task_id: str = Field(..., description="Unique identifier for the task.")
+    task_prompt: str = Field(
+        ..., description="The task prompt that identifies the task."
+    )
+    avg_success_rate: float = Field(
+        ..., description="Average result for the task across all repeats."
+    )
+    std_success_rate: float = Field(
+        ..., description="Standard deviation of the success rate across all repeats."
+    )
+    avg_time: float = Field(
+        ..., description="Average time taken across all repeats for one task."
+ ) + std_time: float = Field( + ..., + description="Standard deviation of the time taken across all repeats for one task.", + ) + repeats: int = Field(..., description="Total number of repeats for task.") + + class TimeoutException(Exception): pass diff --git a/src/rai_bench/rai_bench/test_models.py b/src/rai_bench/rai_bench/test_models.py index 8be2db7e4..6fab72a00 100644 --- a/src/rai_bench/rai_bench/test_models.py +++ b/src/rai_bench/rai_bench/test_models.py @@ -11,24 +11,36 @@ # # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # # See the License for the specific language governing permissions and # # limitations under the License. +import csv import uuid from abc import abstractmethod from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Literal +import numpy as np from git import Optional from langchain.chat_models.base import BaseChatModel from pydantic import BaseModel import rai_bench.manipulation_o3de as manipulation_o3de import rai_bench.tool_calling_agent as tool_calling_agent +import rai_bench.vlm_benchmark as vlm_benchmark +from rai_bench.base_benchmark import ModelSummary, RunSummary, TasksSummary +from rai_bench.results_processing.data_loading import ( + DETAILED_FILE_NAME, + SUMMARY_FILE_NAME, +) from rai_bench.utils import ( define_benchmark_logger, get_llm_for_benchmark, get_llm_model_name, ) +MODEL_SUMMARY_FILE_NAME = "model_summary.csv" +TASKS_SUMMARY_FILE_NAME = "tasks_summary.csv" +BENCHMARK_SUMMARY = "benchmark_summary.csv" + class BenchmarkConfig(BaseModel): repeats: int = 1 @@ -77,6 +89,194 @@ def name(self) -> str: return "tool_calling_agent" +class VLMBenchmarkConfig(BenchmarkConfig): + complexities: List[Literal["easy", "medium", "hard"]] = ["easy", "medium", "hard"] + task_types: List[ + Literal[ + "bool_response_image_task", + "quantity_response_image_task", + "multiple_choice_image_task", + ] + ] = [ + "bool_response_image_task", + "quantity_response_image_task", + "multiple_choice_image_task", + ] + + @property + def name(self) -> str: + return "vlm" + + +def merge_model_repeats_summary( + bench_name: str, model_name: str, run_dir: Path +) -> None: + """Merge summary results across all repeats for a single model. 
+ + Parameters + ---------- + bench_name : str + Name of the benchmark + model_name : str + Name of the model + run_dir : Path + Directory containing the benchmark run results + """ + model_dir = run_dir / bench_name / model_name + if not model_dir.exists(): + return + + summaries: List[RunSummary] = [] + for repeat_dir in model_dir.iterdir(): + if repeat_dir.is_dir() and repeat_dir.name.isdigit(): + summary_file = repeat_dir / SUMMARY_FILE_NAME + if summary_file.exists(): + with open(summary_file, "r") as f: + reader = csv.DictReader(f) + for row in reader: + summaries.append(RunSummary.model_validate(row)) + + if not summaries: + return + + success_rates = [s.success_rate for s in summaries] + times = [s.avg_time for s in summaries] + total_tasks_list = [s.total_tasks for s in summaries] + + avg_success_rate = np.mean(success_rates) + std_success_rate = np.std(success_rates) + avg_time = np.mean(times) + std_time = np.std(times) + total_tasks = np.mean(total_tasks_list) + + merged_summary = ModelSummary( + model_name=model_name, + avg_success_rate=round(float(avg_success_rate), 2), + std_success_rate=round(float(std_success_rate), 3), + avg_time=round(float(avg_time), 3), + std_time=round(float(std_time), 3), + avg_total_tasks=round(float(total_tasks), 3), + repeats=len(summaries), + ) + + merged_file = model_dir / MODEL_SUMMARY_FILE_NAME + with open(merged_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys()) + writer.writeheader() + writer.writerow(merged_summary.model_dump()) + + +def merge_benchmark_summary( + bench_name: str, run_dir: Path, model_names: List[str] +) -> None: + """Merge summary results across all models for a single benchmark. + + Parameters + ---------- + bench_name : str + Name of the benchmark + run_dir : Path + Directory containing the benchmark run results + model_names : List[str] + List of model names to include in the summary + """ + bench_dir = run_dir / bench_name + if not bench_dir.exists(): + return + + all_summaries: List[ModelSummary] = [] + for model_name in model_names: + model_dir = bench_dir / model_name + merged_file = model_dir / MODEL_SUMMARY_FILE_NAME + + if merged_file.exists(): + with open(merged_file, "r") as f: + reader = csv.DictReader(f) + for row in reader: + all_summaries.append(ModelSummary.model_validate(row)) + + if not all_summaries: + return + + benchmark_summary_file = bench_dir / BENCHMARK_SUMMARY + with open(benchmark_summary_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys()) + writer.writeheader() + for summary in all_summaries: + writer.writerow(summary.model_dump()) + + +def merge_tasks_summary(bench_name: str, model_name: str, run_dir: Path) -> None: + """Merge task results across all repeats for a single model, aggregating by task. 
+ + Parameters + ---------- + bench_name : str + Name of the benchmark + model_name : str + Name of the model + run_dir : Path + Directory containing the benchmark run results + """ + model_dir = run_dir / bench_name / model_name + if not model_dir.exists(): + return + + task_data_by_id: Dict[str, Dict[str, Any]] = {} + + for repeat_dir in model_dir.iterdir(): + if repeat_dir.is_dir() and repeat_dir.name.isdigit(): + results_file = repeat_dir / DETAILED_FILE_NAME + if results_file.exists(): + with open(results_file, "r") as f: + reader = csv.DictReader(f) + for row in reader: + task_id = row["task_id"] + task_prompt = row["task_prompt"] + score = float(row["score"]) + total_time = float(row["total_time"]) + + if task_id not in task_data_by_id: + task_data_by_id[task_id] = { + "scores": [], + "times": [], + "task_prompt": task_prompt, + } + + task_data_by_id[task_id]["scores"].append(score) + task_data_by_id[task_id]["times"].append(total_time) + + if not task_data_by_id: + return + + # Calculate statistics for each task + task_summaries: List[TasksSummary] = [] + for task_id, data in task_data_by_id.items(): + scores = np.array(data["scores"]) + times = np.array(data["times"]) + task_prompt = data["task_prompt"] + + task_summary = TasksSummary( + model_name=model_name, + task_id=task_id, + task_prompt=task_prompt, + avg_success_rate=round(float(scores.mean()), 3), + std_success_rate=round(float(scores.std()), 3), + avg_time=round(float(times.mean()), 3), + std_time=round(float(times.std()), 3), + repeats=len(scores), + ) + task_summaries.append(task_summary) + + tasks_summary_file = model_dir / TASKS_SUMMARY_FILE_NAME + with open(tasks_summary_file, "w", newline="") as f: + if task_summaries: + writer = csv.DictWriter(f, fieldnames=TasksSummary.model_fields.keys()) + writer.writeheader() + for task_summary in task_summaries: + writer.writerow(task_summary.model_dump()) + + def test_dual_agents( multimodal_llms: List[BaseChatModel], tool_calling_models: List[BaseChatModel], @@ -163,6 +363,7 @@ def test_models( # for each bench configuration seperate run folder now = datetime.now() run_name = f"run_{now.strftime('%Y-%m-%d_%H-%M-%S')}" + run_dir = Path(out_dir) / run_name for i, model_name in enumerate(model_names): for u in range(bench_conf.repeats): curr_out_dir = ( @@ -211,8 +412,32 @@ def test_models( experiment_id=experiment_id, bench_logger=bench_logger, ) + + elif isinstance(bench_conf, VLMBenchmarkConfig): + vlm_tasks = vlm_benchmark.get_spatial_tasks() + vlm_benchmark.run_benchmark( + llm=llm, + out_dir=Path(curr_out_dir), + tasks=vlm_tasks, + bench_logger=bench_logger, + ) + except Exception as e: bench_logger.critical(f"BENCHMARK RUN FAILED: {e}") bench_logger.critical( f"{bench_conf.name} benchmark for {model_name}, vendor: {vendors[i]}, execution number: {u + 1}" ) + merge_results_logger = define_benchmark_logger(out_dir=Path(out_dir)) + merge_results_logger.info( + f"Merging summaries for benchmark: {bench_conf.name}" + ) + + for model_name in model_names: + merge_model_repeats_summary(bench_conf.name, model_name, run_dir) + merge_tasks_summary(bench_conf.name, model_name, run_dir) + + merge_benchmark_summary(bench_conf.name, run_dir, model_names) + + merge_results_logger.info( + f"Summary merging completed for benchmark: {bench_conf.name}" + ) diff --git a/src/rai_bench/rai_bench/vlm_benchmark/benchmark.py b/src/rai_bench/rai_bench/vlm_benchmark/benchmark.py index 8f1c9d699..c2c1f1836 100644 --- a/src/rai_bench/rai_bench/vlm_benchmark/benchmark.py +++ 
b/src/rai_bench/rai_bench/vlm_benchmark/benchmark.py @@ -133,7 +133,7 @@ def run_next(self, agent: CompiledStateGraph, experiment_id: uuid.UUID) -> None: score = task.validate(output=structured_output) else: errors.append(f"Not valid structured output: {type(structured_output)}") - score = False + score = 0 te = time.perf_counter() total_time = te - ts @@ -141,6 +141,7 @@ def run_next(self, agent: CompiledStateGraph, experiment_id: uuid.UUID) -> None: self.logger.info(f"TASK SCORE: {score}, TOTAL TIME: {total_time:.3f}") task_result = TaskResult( + task_id=task.task_id, task_prompt=task.get_prompt(), system_prompt=task.get_system_prompt(), type=task.type, diff --git a/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py b/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py index 97f769b93..83d303152 100644 --- a/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py +++ b/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import hashlib import logging from abc import ABC, abstractmethod -from typing import Generic, List, Literal, Optional, TypeVar +from typing import Any, Generic, List, Literal, Optional, TypeVar from langchain_core.messages import BaseMessage from langchain_core.runnables.config import DEFAULT_RECURSION_LIMIT -from pydantic import BaseModel, ConfigDict, ValidationError +from pydantic import BaseModel, ConfigDict, Field, ValidationError, computed_field loggers_type = logging.Logger @@ -28,6 +29,40 @@ IMAGE_REASONING_SYSTEM_PROMPT = "You are a helpful and knowledgeable AI assistant that specializes in interpreting and analyzing visual content. Your task is to answer questions based on the images provided to you. Please response in requested structured output format." +class TaskValidationError(Exception): + pass + + +AnswerT = TypeVar("AnswerT") + + +class ImageReasoningTaskInput(BaseModel, Generic[AnswerT]): + """Base input for an image reasoning task.""" + + question: str = Field(..., description="The question to be answered.") + images_paths: List[str] = Field( + ..., + description="List of image file paths to be used for answering the question.", + ) + expected_answer: AnswerT = Field( + ..., description="The expected answer to the question." + ) + + @computed_field + @property + def task_id(self) -> str: + """Unique identifier for the task based on question and image paths.""" + content = f"{self.question}|{sorted(self.images_paths)}" + return hashlib.sha256(content.encode()).hexdigest() + + +class ImageReasoningAnswer(BaseModel, Generic[AnswerT]): + """Base answer for an image reasoning task.""" + + answer: AnswerT = Field(..., description="The answer to the question.") + justification: str = Field(..., description="Justification for the answer.") + + class LangchainRawOutputModel(BaseModel): """ A Pydantic model for wrapping Langchain message parsing results from a structured output agent. 
See documentation for more details: @@ -47,20 +82,17 @@ class LangchainRawOutputModel(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) raw: BaseMessage - parsed: BaseModel + parsed: ImageReasoningAnswer[Any] parsing_error: Optional[BaseException] -class TaskValidationError(Exception): - pass - - -class ImageReasoningTask(ABC, Generic[BaseModelT]): +class ImageReasoningTask(ABC, Generic[AnswerT]): complexity: Literal["easy", "medium", "hard"] recursion_limit: int = DEFAULT_RECURSION_LIMIT def __init__( self, + task_input: ImageReasoningTaskInput[AnswerT], logger: loggers_type | None = None, ) -> None: """ @@ -78,15 +110,35 @@ def __init__( self.logger = logger else: self.logger = logging.getLogger(__name__) - self.question: str - self.images_paths: List[str] + + self._task_input = task_input + + @property + def question(self) -> str: + """The question to be answered.""" + return self._task_input.question + + @property + def images_paths(self) -> List[str]: + """List of image file paths.""" + return self._task_input.images_paths + + @property + def expected_answer(self) -> AnswerT: + """The expected answer to the question.""" + return self._task_input.expected_answer + + @property + def task_id(self) -> str: + """Unique identifier for the task.""" + return self._task_input.task_id def set_logger(self, logger: loggers_type): self.logger = logger @property @abstractmethod - def structured_output(self) -> type[BaseModelT]: + def structured_output(self) -> type[ImageReasoningAnswer[AnswerT]]: """Structured output that agent should return.""" pass @@ -118,7 +170,7 @@ def get_prompt(self) -> str: pass @abstractmethod - def validate(self, output: BaseModelT) -> bool: + def validate(self, output: ImageReasoningAnswer[AnswerT]) -> float: """Validate result of the task.""" pass @@ -135,7 +187,7 @@ def get_images(self) -> List[str]: def get_structured_output_from_messages( self, messages: List[BaseMessage] - ) -> BaseModelT | None: + ) -> ImageReasoningAnswer[AnswerT] | None: """Extract and validate structured output from a list of messages. 
Iterates through messages in reverse order, attempting to find the message that is @@ -167,8 +219,11 @@ def get_structured_output_from_messages( ) parsed = validated_message.parsed - if isinstance(parsed, self.structured_output): - return parsed + expected_output_type = self.structured_output + parsed_valid_output = expected_output_type.model_validate( + parsed.model_dump() + ) + return parsed_valid_output except ValidationError: continue return None diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_1.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_1.jpg index 7a2026fd2..e04af6e4f 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_1.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_1.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_10.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_10.png new file mode 100644 index 000000000..28e23b70b --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_10.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44cd817356eb6fc8bd90e4f4ad04a31f8ba2a7c441f8fa2433c24188bd42969d +size 222920 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_11.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_11.png new file mode 100644 index 000000000..9515b3224 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_11.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece7ce624db2cbe081eacd503de97501b303a2c38c6d2902fefdd577d1046ebd +size 497598 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_12.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_12.png new file mode 100644 index 000000000..1137aa7f5 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_12.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389dc7b475f338bd4bce0aa477127e4e601075d2e5a545cdc414f506c78e0a0f +size 1139409 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_13.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_13.png new file mode 100644 index 000000000..067a2cfe3 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_13.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd24939958439f2d2b193ef98306382b0eb4700be22ddf964dacf422043eccc +size 232378 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_14.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_14.png new file mode 100644 index 000000000..4a91c9354 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_14.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d158ca524cb0faa8a3bed64ae46f05f488a9597abd332a9810d800757aaeddc +size 441519 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_15.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_15.png new file mode 100644 index 000000000..91cf34dec --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_15.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ea1c58a154bc565bb74f2c91f2cec6c95613a5772c65eb99cdd86bf4982bed +size 679496 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_16.png 
b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_16.png new file mode 100644 index 000000000..a254669d5 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_16.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:923a9ab71e68715d7c927e5c218ba406cc47f37e6145ccf4d3dc19512d040fe9 +size 6205972 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_17.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_17.png new file mode 100644 index 000000000..8acad7451 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_17.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd337c10f0c662bfedbd6a5137d6e52decc7d2937134b222d76f7f34f588228 +size 2506357 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_2.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_2.jpg index 663af0a03..c5d742f82 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_2.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_2.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_3.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_3.jpg index 2f1eac5de..bffe4b8f5 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_3.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_3.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_4.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_4.jpg index c98bba4cb..327c18666 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_4.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_4.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_5.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_5.jpg index aa54a0660..1a00c6f16 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_5.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_5.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_6.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_6.jpg index 8789788dd..4283a041e 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_6.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_6.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_7.jpg b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_7.jpg index 04f6778a7..5ffb9494e 100644 Binary files a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_7.jpg and b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_7.jpg differ diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_8.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_8.png new file mode 100644 index 000000000..8ff32a372 --- /dev/null +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d515a1ebf1849ccdc7d3467c5eaf4483d8300fb672a56a928ea074b55ab8f384 +size 510974 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_9.png b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_9.png new file mode 100644 index 000000000..90ce93490 --- /dev/null +++ 
b/src/rai_bench/rai_bench/vlm_benchmark/predefined/images/image_9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a84ef5c9a4ec76c22b665c97be9c17610dee314bd9cfcba087991a8fcc6415 +size 1195412 diff --git a/src/rai_bench/rai_bench/vlm_benchmark/predefined/tasks.py b/src/rai_bench/rai_bench/vlm_benchmark/predefined/tasks.py index bd1d9cd61..4bd47503d 100644 --- a/src/rai_bench/rai_bench/vlm_benchmark/predefined/tasks.py +++ b/src/rai_bench/rai_bench/vlm_benchmark/predefined/tasks.py @@ -12,101 +12,546 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, cast - -from pydantic import BaseModel +from typing import Any, List from rai_bench.vlm_benchmark.interfaces import ImageReasoningTask -from rai_bench.vlm_benchmark.tasks.tasks import BoolImageTask, BoolImageTaskInput +from rai_bench.vlm_benchmark.tasks.tasks import ( + BoolImageTask, + BoolImageTaskInput, + MultipleChoiceImageTask, + MultipleChoiceImageTaskInput, + QuantityImageTask, + QuantityImageTaskInput, +) IMG_PATH = "src/rai_bench/rai_bench/vlm_benchmark/predefined/images/" -true_response_inputs: List[BoolImageTaskInput] = [ - BoolImageTaskInput( - question="Is the door on the left from the desk?", - images_paths=[IMG_PATH + "image_1.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Is the light on in the room?", - images_paths=[IMG_PATH + "image_2.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Do you see the plant?", - images_paths=[IMG_PATH + "image_2.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Are there any pictures on the wall?", - images_paths=[IMG_PATH + "image_3.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Are there 3 pictures on the wall?", - images_paths=[IMG_PATH + "image_4.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Is there a plant behind the rack?", - images_paths=[IMG_PATH + "image_5.jpg"], - expected_answer=True, - ), - BoolImageTaskInput( - question="Is there a pillow on the armchair?", - images_paths=[IMG_PATH + "image_7.jpg"], - expected_answer=True, - ), -] -false_response_inputs: List[BoolImageTaskInput] = [ - BoolImageTaskInput( - question="Is the door open?", - images_paths=[IMG_PATH + "image_1.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Is someone in the room?", - images_paths=[IMG_PATH + "image_1.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Do you see the plant?", - images_paths=[IMG_PATH + "image_3.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Are there 4 pictures on the wall?", - images_paths=[IMG_PATH + "image_4.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Is there a rack on the left from the sofa?", - images_paths=[IMG_PATH + "image_4.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Is there a plant on the right from the window?", - images_paths=[IMG_PATH + "image_6.jpg"], - expected_answer=False, - ), - BoolImageTaskInput( - question="Is there a red pillow on the armchair?", - images_paths=[IMG_PATH + "image_7.jpg"], - expected_answer=False, - ), -] - - -def get_spatial_tasks() -> List[ImageReasoningTask[BaseModel]]: - true_tasks = [ - BoolImageTask( - task_input=input_item, - ) - for input_item in true_response_inputs - ] - false_tasks = [ - BoolImageTask( - task_input=input_item, - ) - for input_item in false_response_inputs - ] - return 
cast(List[ImageReasoningTask[BaseModel]], true_tasks + false_tasks) + + +image_1 = IMG_PATH + "image_1.jpg" + +image_1_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is the door on the left from the desk?", + images_paths=[image_1], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is the door open?", + images_paths=[image_1], + expected_answer=False, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is someone in the room?", + images_paths=[image_1], + expected_answer=False, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_1], + options=[ + "gauge", + "bed", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + ], + expected_answer=["bed", "lamp with a shade", "door", "desk"], + ) + ), +] + +image_2 = IMG_PATH + "image_2.jpg" + +image_2_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is the light on in the room?", + images_paths=[image_2], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Do you see the plant?", + images_paths=[image_2], + expected_answer=True, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many windows are visible in the image?", + images_paths=[image_2], + expected_answer=2, + ) + ), +] + +image_3 = IMG_PATH + "image_3.jpg" + +image_3_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Are there any pictures on the wall?", + images_paths=[image_3], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Do you see the plant?", + images_paths=[image_3], + expected_answer=False, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many pictures are on the wall?", + images_paths=[image_3], + expected_answer=3, + ) + ), +] + +image_4 = IMG_PATH + "image_4.jpg" + +image_4_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Are there 3 pictures on the wall?", + images_paths=[image_4], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Are there 4 pictures on the wall?", + images_paths=[image_4], + expected_answer=False, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is there a rack on the left from the sofa?", + images_paths=[image_4], + expected_answer=False, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many pictures are on the wall?", + images_paths=[image_4], + expected_answer=3, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_4], + options=[ + "sofa", + "gauge", + "bed", + "armchair", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + "dresser", + "TV", + ], + expected_answer=["sofa", "plant", "TV", "dresser"], + ) + ), +] + +image_5 = IMG_PATH + "image_5.jpg" + +image_5_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is there a plant behind the rack?", + images_paths=[image_5], + expected_answer=True, + ) + ), +] + +image_6 = IMG_PATH + "image_6.jpg" + +image_6_tasks: List[ImageReasoningTask[Any]] = [ + 
BoolImageTask( + task_input=BoolImageTaskInput( + question="Is there a plant on the right from the window?", + images_paths=[image_6], + expected_answer=False, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many pictures are on the wall?", + images_paths=[image_6], + expected_answer=2, + ) + ), +] + +image_7 = IMG_PATH + "image_7.jpg" + +image_7_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is there a pillow on the armchair?", + images_paths=[image_7], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is there a red pillow on the armchair?", + images_paths=[image_7], + expected_answer=False, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_7], + options=[ + "sofa", + "gauge", + "armchair", + "barrels", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + ], + expected_answer=["sofa", "armchair", "door", "desk", "plant"], + ) + ), +] + +image_8 = IMG_PATH + "image_8.png" + +image_8_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Do you see something anomalous in the image?", + images_paths=[image_8], + expected_answer=True, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many people are in the image?", + images_paths=[image_8], + expected_answer=1, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_8], + options=[ + "sofa", + "gauge", + "bed", + "armchair", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + "dresser", + ], + expected_answer=["human", "boxes"], + ) + ), +] + +image_9 = IMG_PATH + "image_9.png" + +image_9_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Do you see something anomalous in the image?", + images_paths=[image_9], + expected_answer=True, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many people are in the image?", + images_paths=[image_9], + expected_answer=1, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_9], + options=[ + "sofa", + "gauge", + "bed", + "armchair", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + "dresser", + ], + expected_answer=["boxes", "human", "roll container"], + ) + ), +] + +image_10 = IMG_PATH + "image_10.png" + +image_10_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Do you see something anomalous in the image?", + images_paths=[image_10], + expected_answer=True, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many people are in the image?", + images_paths=[image_10], + expected_answer=1, + ) + ), +] + +image_11 = IMG_PATH + "image_11.png" + +image_11_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is any person in the image?", + images_paths=[image_11], + expected_answer=True, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many barrels are in the image?", + images_paths=[image_11], + expected_answer=6, + ) + ), + QuantityImageTask( + 
task_input=QuantityImageTaskInput( + question="How many fallen barrels are in the image?", + images_paths=[image_11], + expected_answer=2, + ) + ), +] + +image_12 = IMG_PATH + "image_12.png" + +image_12_tasks: List[ImageReasoningTask[Any]] = [ + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many fallen barrels are in the image?", + images_paths=[image_12], + expected_answer=1, + ), + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many people are in the image?", + images_paths=[image_12], + expected_answer=0, + ) + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_12], + options=[ + "sofa", + "gauge", + "bed", + "armchair", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + "dresser", + ], + expected_answer=["barrels", "boxes", "roll container"], + ), + ), +] + +image_13 = IMG_PATH + "image_13.png" + +image_13_tasks: List[ImageReasoningTask[Any]] = [ + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many fallen barrels are in the image?", + images_paths=[image_13], + expected_answer=1, + ) + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many barrels are in the image?", + images_paths=[image_13], + expected_answer=6, + ) + ), +] + +image_14 = IMG_PATH + "image_14.png" + +image_14_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is gauge level high?", + images_paths=[image_14], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is gauge level low?", + images_paths=[image_14], + expected_answer=False, + ), + ), + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many barrels are in the image?", + images_paths=[image_14], + expected_answer=4, + ), + ), + MultipleChoiceImageTask( + task_input=MultipleChoiceImageTaskInput( + question="What is in the image?", + images_paths=[image_14], + options=[ + "sofa", + "gauge", + "bed", + "armchair", + "barrels", + "lamp with a shade", + "door", + "boxes", + "human", + "desk", + "plant", + "roll container", + "dresser", + ], + expected_answer=["gauge", "barrels"], + ) + ), +] + +image_15 = IMG_PATH + "image_15.png" +image_15_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is worker wearing a vest?", + images_paths=[image_15], + expected_answer=True, + ) + ), + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is worker wearing a helmet?", + images_paths=[image_15], + expected_answer=False, + ) + ), +] + +image_16 = IMG_PATH + "image_16.png" + +image_16_tasks: List[ImageReasoningTask[Any]] = [ + BoolImageTask( + task_input=BoolImageTaskInput( + question="Is worker wearing a helmet?", + images_paths=[image_16], + expected_answer=False, + ), + ) +] + +image_17 = IMG_PATH + "image_17.png" + + +image_17_tasks: List[ImageReasoningTask[Any]] = [ + QuantityImageTask( + task_input=QuantityImageTaskInput( + question="How many damaged boxes are in the image?", + images_paths=[image_17], + expected_answer=2, + ) + ), +] + + +def get_spatial_tasks() -> List[ImageReasoningTask[Any]]: + """ + Return a flat list with all predefined spatial image reasoning tasks + declared in this module. 
+ """ + all_tasks: List[ImageReasoningTask[Any]] = [] + for task_group in [ + image_1_tasks, + image_2_tasks, + image_3_tasks, + image_4_tasks, + image_5_tasks, + image_6_tasks, + image_7_tasks, + image_8_tasks, + image_9_tasks, + image_10_tasks, + image_11_tasks, + image_12_tasks, + image_13_tasks, + image_14_tasks, + image_15_tasks, + image_16_tasks, + image_17_tasks, + ]: + all_tasks.extend(task_group) + return all_tasks diff --git a/src/rai_bench/rai_bench/vlm_benchmark/results_tracking.py b/src/rai_bench/rai_bench/vlm_benchmark/results_tracking.py index 5d8b77b71..2e5a649d2 100644 --- a/src/rai_bench/rai_bench/vlm_benchmark/results_tracking.py +++ b/src/rai_bench/rai_bench/vlm_benchmark/results_tracking.py @@ -19,6 +19,7 @@ class TaskResult(BaseModel): + task_id: str = Field(..., description="Unique identifier for the task object.") task_prompt: str = Field(..., description="The task prompt.") system_prompt: str = Field(..., description="The system prompt.") complexity: str = Field(..., description="Complexity of the task.") diff --git a/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py b/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py index 639b50400..5d1a6dffe 100644 --- a/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py +++ b/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py @@ -14,35 +14,50 @@ import logging -from typing import List +from typing import List, Type -from pydantic import BaseModel, Field +from pydantic import Field from rai.messages import preprocess_image -from rai_bench.vlm_benchmark.interfaces import ImageReasoningTask +from rai_bench.vlm_benchmark.interfaces import ( + ImageReasoningAnswer, + ImageReasoningTask, + ImageReasoningTaskInput, +) loggers_type = logging.Logger -class BoolAnswerWithJustification(BaseModel): +class BoolAnswerWithJustification(ImageReasoningAnswer[bool]): """A boolean answer to the user question along with justification for the answer.""" - answer: bool - justification: str +class QuantityAnswerWithJustification(ImageReasoningAnswer[int]): + """A quantity answer telling the number of objects to the user question along with justification for the answer.""" -class BoolImageTaskInput(BaseModel): - question: str = Field(..., description="The question to be answered.") - images_paths: List[str] = Field( + +class MultipleChoiceAnswerWithJustification(ImageReasoningAnswer[List[str]]): + """A multiple choice answer to the user question along with justification for the answer.""" + + +class BoolImageTaskInput(ImageReasoningTaskInput[bool]): + """Input for a task that requires a boolean answer to a question about an image.""" + + +class QuantityImageTaskInput(ImageReasoningTaskInput[int]): + """Input for a task that requires counting objects in an image.""" + + +class MultipleChoiceImageTaskInput(ImageReasoningTaskInput[List[str]]): + """Input for a task that requires selecting one or more answers from a list of options.""" + + options: List[str] = Field( ..., - description="List of image file paths to be used for answering the question.", - ) - expected_answer: bool = Field( - ..., description="The expected answer to the question." 
+ description="List of possible answers to the question.", ) -class BoolImageTask(ImageReasoningTask[BoolAnswerWithJustification]): +class BoolImageTask(ImageReasoningTask[bool]): complexity = "easy" def __init__( @@ -51,11 +66,9 @@ def __init__( logger: loggers_type | None = None, ) -> None: super().__init__( + task_input=task_input, logger=logger, ) - self.question = task_input.question - self.images_paths = task_input.images_paths - self.expected_answer = task_input.expected_answer @property def structured_output(self) -> type[BoolAnswerWithJustification]: @@ -72,5 +85,81 @@ def get_images(self): images = [preprocess_image(image_path) for image_path in self.images_paths] return images - def validate(self, output: BoolAnswerWithJustification) -> bool: - return output.answer == self.expected_answer + def validate(self, output: ImageReasoningAnswer[bool]) -> float: + return float(output.answer == self.expected_answer) + + +class QuantityImageTask(ImageReasoningTask[int]): + """A task that requires counting objects in an image.""" + + complexity = "medium" + + def __init__( + self, + task_input: QuantityImageTaskInput, + logger: loggers_type | None = None, + ) -> None: + super().__init__(task_input=task_input, logger=logger) + + @property + def type(self) -> str: + return "quantity_response_image_task" + + @property + def structured_output(self) -> Type[QuantityAnswerWithJustification]: + return QuantityAnswerWithJustification + + def validate(self, output: ImageReasoningAnswer[int]) -> float: + return float(output.answer == self.expected_answer) + + def get_prompt(self) -> str: + return self.question + + def get_images(self): + images = [preprocess_image(image_path) for image_path in self.images_paths] + return images + + +class MultipleChoiceImageTask(ImageReasoningTask[List[str]]): + """A task that requires selecting one or more answers from a set of options.""" + + complexity = "hard" + + def __init__( + self, + task_input: MultipleChoiceImageTaskInput, + logger: loggers_type | None = None, + ) -> None: + super().__init__(task_input=task_input, logger=logger) + self.options = task_input.options + + @property + def type(self) -> str: + return "multiple_choice_response_image_task" + + @property + def structured_output(self) -> Type[MultipleChoiceAnswerWithJustification]: + return MultipleChoiceAnswerWithJustification + + def validate(self, output: ImageReasoningAnswer[List[str]]) -> float: + answers_processed = set([answer.casefold() for answer in output.answer]) + expected_processed = set([answer.casefold() for answer in self.expected_answer]) + + if not answers_processed.issubset(expected_processed): + return 0.0 + + correct_count = len(answers_processed.intersection(expected_processed)) + total_expected = len(expected_processed) + + return float(correct_count / total_expected) if total_expected > 0 else 0.0 + + def get_prompt(self) -> str: + return ( + self.question + + " Choose one or more answers from the options: " + + ", ".join(self.options) + ) + + def get_images(self): + images = [preprocess_image(image_path) for image_path in self.images_paths] + return images