16 commits
e3fdd47
feat: interfaces for quantity and multiple choice tasks
MagdalenaKotynia Aug 12, 2025
143e7c3
feat: created new tasks with warehouse simulation images
MagdalenaKotynia Aug 12, 2025
e5652c0
feat: added vlm bench config to test many models
MagdalenaKotynia Aug 12, 2025
2b2e360
chore: moved old image files from vlm benchmark to lfs
MagdalenaKotynia Aug 12, 2025
d566994
refactor: extracted common logic for vlm tasks inputs and answers to …
MagdalenaKotynia Aug 12, 2025
02b459b
docs: added info about increasing event size for Langfuse tracing of …
MagdalenaKotynia Aug 12, 2025
466ec59
feat: added merging results summaries across all repeats and all models
MagdalenaKotynia Aug 12, 2025
1adc3d4
fix: fixed typing after refactor extracting common logic for vlm task…
MagdalenaKotynia Aug 13, 2025
f791e85
refactor: reorganized tasks order by images
MagdalenaKotynia Aug 13, 2025
fe1b5a8
chore: removed unused comment
MagdalenaKotynia Aug 13, 2025
90d163b
refactor: created separate structure for storing summary for the mode…
MagdalenaKotynia Aug 14, 2025
01179b6
fix: fixed validation of parsed llm output
MagdalenaKotynia Aug 14, 2025
30c6b6a
refactor: added task id, refactor of storing task_input variables
MagdalenaKotynia Aug 18, 2025
084b484
fix: aggregate tasks repeats results by the task id
MagdalenaKotynia Aug 18, 2025
8e3c01c
refactor: renamed csv storing model summary
MagdalenaKotynia Aug 18, 2025
5417762
feat: added std success rate and std time for model across all repeats
MagdalenaKotynia Aug 18, 2025
9 changes: 9 additions & 0 deletions .gitattributes
@@ -0,0 +1,9 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
*.bmp filter=lfs diff=lfs merge=lfs -text
*.tif filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
*.svg filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions src/rai_bench/README.md
@@ -171,6 +171,9 @@ The VLM Benchmark is a benchmark for VLM models. It includes a set of tasks cont

To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.

> [!IMPORTANT]
> If you are going to use Langfuse for tracing results, run `export LANGFUSE_MAX_EVENT_SIZE_BYTES=20000000`. By default, the maximum tracing event size is 1 MB, but some tasks exceed this limit.

To run the benchmark:

```bash
2 changes: 2 additions & 0 deletions src/rai_bench/rai_bench/__init__.py
@@ -14,6 +14,7 @@
from .test_models import (
ManipulationO3DEBenchmarkConfig,
ToolCallingAgentBenchmarkConfig,
VLMBenchmarkConfig,
test_dual_agents,
test_models,
)
@@ -28,6 +29,7 @@
__all__ = [
"ManipulationO3DEBenchmarkConfig",
"ToolCallingAgentBenchmarkConfig",
"VLMBenchmarkConfig",
"define_benchmark_logger",
"get_llm_for_benchmark",
"parse_manipulation_o3de_benchmark_args",
48 changes: 48 additions & 0 deletions src/rai_bench/rai_bench/base_benchmark.py
@@ -35,6 +35,54 @@ class RunSummary(BaseModel):
total_tasks: int = Field(..., description="Total number of executed tasks.")


class ModelSummary(BaseModel):
model_name: str = Field(..., description="Name of the LLM.")
avg_success_rate: float = Field(
...,
description="Percentage of successfully completed tasks across all repeats.",
)
std_success_rate: float = Field(
...,
description="Standard deviation of success rate across all repeats for this model.",
)
avg_total_tasks: float = Field(
..., description="Average number of tasks executed through all repeats."
)
avg_time: float = Field(
..., description="Average time taken across all tasks and repeats."
)
std_time: float = Field(
...,
description="Standard deviation of time taken across all repeats for this model.",
)

repeats: int = Field(
..., description="Total number of repeats for the model for each task."
)


class TasksSummary(BaseModel):
model_name: str = Field(..., description="Name of the LLM.")
task_id: str = Field(..., description="Unique identifier for the task.")
task_prompt: str = Field(
..., description="The task prompt that identifies the task."
)
avg_success_rate: float = Field(
..., description="Average result for task across all repeats."
)
std_success_rate: float = Field(
..., description="Standard deviation of the success rate across all repeats."
)
avg_time: float = Field(
..., description="Average time taken across all repeats for one task."
)
std_time: float = Field(
...,
description="Standard deviation of the time taken across all repeats for one task.",
)
repeats: int = Field(..., description="Total number of repeats for task.")


class TimeoutException(Exception):
pass

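The two summary models above define the CSV schemas produced by the merge helpers added to test_models.py below. A minimal sketch of reading a merged model summary back in (the path is hypothetical; the file name and directory layout follow the constants and helpers introduced further down):

```python
import csv
from pathlib import Path

from rai_bench.base_benchmark import ModelSummary

# Hypothetical run folder; the layout <run_dir>/<benchmark>/<model>/model_summary.csv
# is assumed from merge_model_repeats_summary below.
summary_file = Path("experiments/run_2025-08-18_12-00-00/vlm/gpt-4o/model_summary.csv")

with open(summary_file, "r") as f:
    for row in csv.DictReader(f):
        # pydantic coerces the CSV string values back to float/int
        summary = ModelSummary.model_validate(row)
        print(summary.model_name, summary.avg_success_rate, summary.std_success_rate)
```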
225 changes: 225 additions & 0 deletions src/rai_bench/rai_bench/test_models.py
@@ -11,24 +11,36 @@
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
import csv
import uuid
from abc import abstractmethod
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Literal

import numpy as np
from typing import Optional
from langchain.chat_models.base import BaseChatModel
from pydantic import BaseModel

import rai_bench.manipulation_o3de as manipulation_o3de
import rai_bench.tool_calling_agent as tool_calling_agent
import rai_bench.vlm_benchmark as vlm_benchmark
from rai_bench.base_benchmark import ModelSummary, RunSummary, TasksSummary
from rai_bench.results_processing.data_loading import (
DETAILED_FILE_NAME,
SUMMARY_FILE_NAME,
)
from rai_bench.utils import (
define_benchmark_logger,
get_llm_for_benchmark,
get_llm_model_name,
)

MODEL_SUMMARY_FILE_NAME = "model_summary.csv"
TASKS_SUMMARY_FILE_NAME = "tasks_summary.csv"
BENCHMARK_SUMMARY = "benchmark_summary.csv"


class BenchmarkConfig(BaseModel):
repeats: int = 1
@@ -77,6 +89,194 @@ def name(self) -> str:
return "tool_calling_agent"


class VLMBenchmarkConfig(BenchmarkConfig):
complexities: List[Literal["easy", "medium", "hard"]] = ["easy", "medium", "hard"]
task_types: List[
Literal[
"bool_response_image_task",
"quantity_response_image_task",
"multiple_choice_image_task",
]
] = [
"bool_response_image_task",
"quantity_response_image_task",
"multiple_choice_image_task",
]

@property
def name(self) -> str:
return "vlm"


def merge_model_repeats_summary(
bench_name: str, model_name: str, run_dir: Path
) -> None:
"""Merge summary results across all repeats for a single model.

Parameters
----------
bench_name : str
Name of the benchmark
model_name : str
Name of the model
run_dir : Path
Directory containing the benchmark run results
"""
model_dir = run_dir / bench_name / model_name
if not model_dir.exists():
return

summaries: List[RunSummary] = []
for repeat_dir in model_dir.iterdir():
if repeat_dir.is_dir() and repeat_dir.name.isdigit():
summary_file = repeat_dir / SUMMARY_FILE_NAME
if summary_file.exists():
with open(summary_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
summaries.append(RunSummary.model_validate(row))

if not summaries:
return

success_rates = [s.success_rate for s in summaries]
times = [s.avg_time for s in summaries]
total_tasks_list = [s.total_tasks for s in summaries]

avg_success_rate = np.mean(success_rates)
std_success_rate = np.std(success_rates)
avg_time = np.mean(times)
std_time = np.std(times)
total_tasks = np.mean(total_tasks_list)

merged_summary = ModelSummary(
model_name=model_name,
avg_success_rate=round(float(avg_success_rate), 2),
std_success_rate=round(float(std_success_rate), 3),
avg_time=round(float(avg_time), 3),
std_time=round(float(std_time), 3),
avg_total_tasks=round(float(total_tasks), 3),
repeats=len(summaries),
)

merged_file = model_dir / MODEL_SUMMARY_FILE_NAME
with open(merged_file, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys())
writer.writeheader()
writer.writerow(merged_summary.model_dump())


def merge_benchmark_summary(
bench_name: str, run_dir: Path, model_names: List[str]
) -> None:
"""Merge summary results across all models for a single benchmark.

Parameters
----------
bench_name : str
Name of the benchmark
run_dir : Path
Directory containing the benchmark run results
model_names : List[str]
List of model names to include in the summary
"""
bench_dir = run_dir / bench_name
if not bench_dir.exists():
return

all_summaries: List[ModelSummary] = []
for model_name in model_names:
model_dir = bench_dir / model_name
merged_file = model_dir / MODEL_SUMMARY_FILE_NAME

if merged_file.exists():
with open(merged_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
all_summaries.append(ModelSummary.model_validate(row))

if not all_summaries:
return

benchmark_summary_file = bench_dir / BENCHMARK_SUMMARY
with open(benchmark_summary_file, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys())
writer.writeheader()
for summary in all_summaries:
writer.writerow(summary.model_dump())


def merge_tasks_summary(bench_name: str, model_name: str, run_dir: Path) -> None:
"""Merge task results across all repeats for a single model, aggregating by task.

Parameters
----------
bench_name : str
Name of the benchmark
model_name : str
Name of the model
run_dir : Path
Directory containing the benchmark run results
"""
model_dir = run_dir / bench_name / model_name
if not model_dir.exists():
return

task_data_by_id: Dict[str, Dict[str, Any]] = {}

for repeat_dir in model_dir.iterdir():
if repeat_dir.is_dir() and repeat_dir.name.isdigit():
results_file = repeat_dir / DETAILED_FILE_NAME
if results_file.exists():
with open(results_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
task_id = row["task_id"]
task_prompt = row["task_prompt"]
score = float(row["score"])
total_time = float(row["total_time"])

if task_id not in task_data_by_id:
task_data_by_id[task_id] = {
"scores": [],
"times": [],
"task_prompt": task_prompt,
}

task_data_by_id[task_id]["scores"].append(score)
task_data_by_id[task_id]["times"].append(total_time)

if not task_data_by_id:
return

# Calculate statistics for each task
task_summaries: List[TasksSummary] = []
for task_id, data in task_data_by_id.items():
scores = np.array(data["scores"])
times = np.array(data["times"])
task_prompt = data["task_prompt"]

task_summary = TasksSummary(
model_name=model_name,
task_id=task_id,
task_prompt=task_prompt,
avg_success_rate=round(float(scores.mean()), 3),
std_success_rate=round(float(scores.std()), 3),
avg_time=round(float(times.mean()), 3),
std_time=round(float(times.std()), 3),
repeats=len(scores),
)
task_summaries.append(task_summary)

tasks_summary_file = model_dir / TASKS_SUMMARY_FILE_NAME
with open(tasks_summary_file, "w", newline="") as f:
if task_summaries:
writer = csv.DictWriter(f, fieldnames=TasksSummary.model_fields.keys())
writer.writeheader()
for task_summary in task_summaries:
writer.writerow(task_summary.model_dump())
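
The three merge helpers above are invoked at the end of test_models further down in this diff; a standalone sketch of the same flow with a hypothetical run directory and model list:

```python
from pathlib import Path

run_dir = Path("experiments/run_2025-08-18_12-00-00")  # hypothetical run folder
model_names = ["gpt-4o", "llava:7b"]                    # hypothetical models

for model_name in model_names:
    # per-model aggregation over the numbered repeat folders
    # -> model_summary.csv and tasks_summary.csv
    merge_model_repeats_summary("vlm", model_name, run_dir)
    merge_tasks_summary("vlm", model_name, run_dir)

# cross-model aggregation -> benchmark_summary.csv under <run_dir>/vlm/
merge_benchmark_summary("vlm", run_dir, model_names)
```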


def test_dual_agents(
multimodal_llms: List[BaseChatModel],
tool_calling_models: List[BaseChatModel],
@@ -163,6 +363,7 @@ def test_models(
# for each bench configuration a separate run folder
now = datetime.now()
run_name = f"run_{now.strftime('%Y-%m-%d_%H-%M-%S')}"
run_dir = Path(out_dir) / run_name
for i, model_name in enumerate(model_names):
for u in range(bench_conf.repeats):
curr_out_dir = (
@@ -211,8 +412,32 @@ def test_models(
experiment_id=experiment_id,
bench_logger=bench_logger,
)

elif isinstance(bench_conf, VLMBenchmarkConfig):
vlm_tasks = vlm_benchmark.get_spatial_tasks()
vlm_benchmark.run_benchmark(
llm=llm,
out_dir=Path(curr_out_dir),
tasks=vlm_tasks,
bench_logger=bench_logger,
)

except Exception as e:
bench_logger.critical(f"BENCHMARK RUN FAILED: {e}")
bench_logger.critical(
f"{bench_conf.name} benchmark for {model_name}, vendor: {vendors[i]}, execution number: {u + 1}"
)
merge_results_logger = define_benchmark_logger(out_dir=Path(out_dir))
merge_results_logger.info(
f"Merging summaries for benchmark: {bench_conf.name}"
)

for model_name in model_names:
merge_model_repeats_summary(bench_conf.name, model_name, run_dir)
merge_tasks_summary(bench_conf.name, model_name, run_dir)

merge_benchmark_summary(bench_conf.name, run_dir, model_names)

merge_results_logger.info(
f"Summary merging completed for benchmark: {bench_conf.name}"
)
3 changes: 2 additions & 1 deletion src/rai_bench/rai_bench/vlm_benchmark/benchmark.py
@@ -133,14 +133,15 @@ def run_next(self, agent: CompiledStateGraph, experiment_id: uuid.UUID) -> None:
score = task.validate(output=structured_output)
else:
errors.append(f"Not valid structured output: {type(structured_output)}")
score = False
score = 0

te = time.perf_counter()
total_time = te - ts

self.logger.info(f"TASK SCORE: {score}, TOTAL TIME: {total_time:.3f}")

task_result = TaskResult(
task_id=task.task_id,
task_prompt=task.get_prompt(),
system_prompt=task.get_system_prompt(),
type=task.type,