16 commits
e3fdd47
feat: interfaces for quantity and multiple choice tasks
MagdalenaKotynia Aug 12, 2025
143e7c3
feat: created new tasks with warehouse simulation images
MagdalenaKotynia Aug 12, 2025
e5652c0
feat: added vlm bench config to test many models
MagdalenaKotynia Aug 12, 2025
2b2e360
chore: moved old image files from vlm benchmark to lfs
MagdalenaKotynia Aug 12, 2025
d566994
refactor: extracted common logic for vlm tasks inputs and answers to …
MagdalenaKotynia Aug 12, 2025
02b459b
docs: added info about increasing event size for Langfuse tracing of …
MagdalenaKotynia Aug 12, 2025
466ec59
feat: added merging results summaries across all repeats and all models
MagdalenaKotynia Aug 12, 2025
1adc3d4
fix: fixed typing after refactor extracting common logic for vlm task…
MagdalenaKotynia Aug 13, 2025
f791e85
refactor: reorganized tasks order by images
MagdalenaKotynia Aug 13, 2025
fe1b5a8
chore: removed unused comment
MagdalenaKotynia Aug 13, 2025
90d163b
refactor: created separate structure for storing summary for the mode…
MagdalenaKotynia Aug 14, 2025
01179b6
fix: fixed validation of parsed llm output
MagdalenaKotynia Aug 14, 2025
30c6b6a
refactor: added task id, refactor of storing task_input variables
MagdalenaKotynia Aug 18, 2025
084b484
fix: aggregate tasks repeats results by the task id
MagdalenaKotynia Aug 18, 2025
8e3c01c
refactor: renamed csv storing model summary
MagdalenaKotynia Aug 18, 2025
5417762
feat: added std success rate and std time for model across all repeats
MagdalenaKotynia Aug 18, 2025
9 changes: 9 additions & 0 deletions .gitattributes
@@ -0,0 +1,9 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
*.bmp filter=lfs diff=lfs merge=lfs -text
*.tif filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
*.svg filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions src/rai_bench/README.md
@@ -171,6 +171,9 @@ The VLM Benchmark is a benchmark for VLM models. It includes a set of tasks cont

To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.

> [!IMPORTANT]
> If you are going to use Langfuse for tracing results, run `export LANGFUSE_MAX_EVENT_SIZE_BYTES=20000000`. By default, the maximum tracing event size is 1 MB, but some tasks exceed this limit.

To run the benchmark:

```bash
2 changes: 2 additions & 0 deletions src/rai_bench/rai_bench/__init__.py
@@ -14,6 +14,7 @@
from .test_models import (
ManipulationO3DEBenchmarkConfig,
ToolCallingAgentBenchmarkConfig,
VLMBenchmarkConfig,
test_dual_agents,
test_models,
)
@@ -28,6 +29,7 @@
__all__ = [
"ManipulationO3DEBenchmarkConfig",
"ToolCallingAgentBenchmarkConfig",
"VLMBenchmarkConfig",
"define_benchmark_logger",
"get_llm_for_benchmark",
"parse_manipulation_o3de_benchmark_args",
48 changes: 48 additions & 0 deletions src/rai_bench/rai_bench/base_benchmark.py
@@ -35,6 +35,54 @@ class RunSummary(BaseModel):
total_tasks: int = Field(..., description="Total number of executed tasks.")


class ModelSummary(BaseModel):
model_name: str = Field(..., description="Name of the LLM.")
avg_success_rate: float = Field(
...,
description="Percentage of successfully completed tasks across all repeats.",
)
std_success_rate: float = Field(
...,
description="Standard deviation of success rate across all repeats for this model.",
)
avg_total_tasks: float = Field(
..., description="Average number of tasks executed through all repeats."
)
avg_time: float = Field(
..., description="Average time taken across all tasks and repeats."
)
std_time: float = Field(
...,
description="Standard deviation of time taken across all repeats for this model.",
)

repeats: int = Field(
..., description="Total number of repeats for the model for each task."
)


class TasksSummary(BaseModel):
model_name: str = Field(..., description="Name of the LLM.")
task_id: str = Field(..., description="Unique identifier for the task.")
task_prompt: str = Field(
..., description="The task prompt that identifies the task."
)
avg_success_rate: float = Field(
..., description="Average result for task across all repeats."
)
std_success_rate: float = Field(
..., description="Standard deviation of the success rate across all repeats."
)
avg_time: float = Field(
..., description="Average time taken across all repeats for one task."
)
std_time: float = Field(
...,
description="Standard deviation of the time taken across all repeats for one task.",
)
repeats: int = Field(..., description="Total number of repeats for task.")


class TimeoutException(Exception):
pass

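The two summary models above define the CSV schemas produced by the merge helpers added to test_models.py below. A minimal sketch of reading a merged model summary back in (the path is hypothetical; the file name and directory layout follow the constants and helpers introduced further down):

```python
import csv
from pathlib import Path

from rai_bench.base_benchmark import ModelSummary

# Hypothetical run folder; the layout <run_dir>/<benchmark>/<model>/model_summary.csv
# is assumed from merge_model_repeats_summary below.
summary_file = Path("experiments/run_2025-08-18_12-00-00/vlm/gpt-4o/model_summary.csv")

with open(summary_file, "r") as f:
    for row in csv.DictReader(f):
        # pydantic coerces the CSV string values back to float/int
        summary = ModelSummary.model_validate(row)
        print(summary.model_name, summary.avg_success_rate, summary.std_success_rate)
```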
225 changes: 225 additions & 0 deletions src/rai_bench/rai_bench/test_models.py
@@ -11,24 +11,36 @@
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
import csv
import uuid
from abc import abstractmethod
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Literal

import numpy as np
from typing import Optional
from langchain.chat_models.base import BaseChatModel
from pydantic import BaseModel

import rai_bench.manipulation_o3de as manipulation_o3de
import rai_bench.tool_calling_agent as tool_calling_agent
import rai_bench.vlm_benchmark as vlm_benchmark
from rai_bench.base_benchmark import ModelSummary, RunSummary, TasksSummary
from rai_bench.results_processing.data_loading import (
DETAILED_FILE_NAME,
SUMMARY_FILE_NAME,
)
from rai_bench.utils import (
define_benchmark_logger,
get_llm_for_benchmark,
get_llm_model_name,
)

MODEL_SUMMARY_FILE_NAME = "model_summary.csv"
TASKS_SUMMARY_FILE_NAME = "tasks_summary.csv"
BENCHMARK_SUMMARY = "benchmark_summary.csv"


class BenchmarkConfig(BaseModel):
repeats: int = 1
@@ -77,6 +89,194 @@ def name(self) -> str:
return "tool_calling_agent"


class VLMBenchmarkConfig(BenchmarkConfig):
complexities: List[Literal["easy", "medium", "hard"]] = ["easy", "medium", "hard"]
task_types: List[
Literal[
"bool_response_image_task",
"quantity_response_image_task",
"multiple_choice_image_task",
]
] = [
"bool_response_image_task",
"quantity_response_image_task",
"multiple_choice_image_task",
]

@property
def name(self) -> str:
return "vlm"


def merge_model_repeats_summary(
bench_name: str, model_name: str, run_dir: Path
) -> None:
"""Merge summary results across all repeats for a single model.

Parameters
----------
bench_name : str
Name of the benchmark
model_name : str
Name of the model
run_dir : Path
Directory containing the benchmark run results
"""
model_dir = run_dir / bench_name / model_name
if not model_dir.exists():
return

summaries: List[RunSummary] = []
for repeat_dir in model_dir.iterdir():
if repeat_dir.is_dir() and repeat_dir.name.isdigit():
summary_file = repeat_dir / SUMMARY_FILE_NAME
if summary_file.exists():
with open(summary_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
summaries.append(RunSummary.model_validate(row))

if not summaries:
return

success_rates = [s.success_rate for s in summaries]
times = [s.avg_time for s in summaries]
total_tasks_list = [s.total_tasks for s in summaries]

avg_success_rate = np.mean(success_rates)
std_success_rate = np.std(success_rates)
avg_time = np.mean(times)
std_time = np.std(times)
total_tasks = np.mean(total_tasks_list)

merged_summary = ModelSummary(
model_name=model_name,
avg_success_rate=round(float(avg_success_rate), 2),
std_success_rate=round(float(std_success_rate), 3),
avg_time=round(float(avg_time), 3),
std_time=round(float(std_time), 3),
avg_total_tasks=round(float(total_tasks), 3),
repeats=len(summaries),
)

merged_file = model_dir / MODEL_SUMMARY_FILE_NAME
with open(merged_file, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys())
writer.writeheader()
writer.writerow(merged_summary.model_dump())


def merge_benchmark_summary(
bench_name: str, run_dir: Path, model_names: List[str]
) -> None:
"""Merge summary results across all models for a single benchmark.

Parameters
----------
bench_name : str
Name of the benchmark
run_dir : Path
Directory containing the benchmark run results
model_names : List[str]
List of model names to include in the summary
"""
bench_dir = run_dir / bench_name
if not bench_dir.exists():
return

all_summaries: List[ModelSummary] = []
for model_name in model_names:
model_dir = bench_dir / model_name
merged_file = model_dir / MODEL_SUMMARY_FILE_NAME

if merged_file.exists():
with open(merged_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
all_summaries.append(ModelSummary.model_validate(row))

if not all_summaries:
return

benchmark_summary_file = bench_dir / BENCHMARK_SUMMARY
with open(benchmark_summary_file, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=ModelSummary.model_fields.keys())
writer.writeheader()
for summary in all_summaries:
writer.writerow(summary.model_dump())


def merge_tasks_summary(bench_name: str, model_name: str, run_dir: Path) -> None:
"""Merge task results across all repeats for a single model, aggregating by task.

Parameters
----------
bench_name : str
Name of the benchmark
model_name : str
Name of the model
run_dir : Path
Directory containing the benchmark run results
"""
model_dir = run_dir / bench_name / model_name
if not model_dir.exists():
return

task_data_by_id: Dict[str, Dict[str, Any]] = {}

for repeat_dir in model_dir.iterdir():
if repeat_dir.is_dir() and repeat_dir.name.isdigit():
results_file = repeat_dir / DETAILED_FILE_NAME
if results_file.exists():
with open(results_file, "r") as f:
reader = csv.DictReader(f)
for row in reader:
task_id = row["task_id"]
task_prompt = row["task_prompt"]
score = float(row["score"])
total_time = float(row["total_time"])

if task_id not in task_data_by_id:
task_data_by_id[task_id] = {
"scores": [],
"times": [],
"task_prompt": task_prompt,
}

task_data_by_id[task_id]["scores"].append(score)
task_data_by_id[task_id]["times"].append(total_time)

if not task_data_by_id:
return

# Calculate statistics for each task
task_summaries: List[TasksSummary] = []
for task_id, data in task_data_by_id.items():
scores = np.array(data["scores"])
times = np.array(data["times"])
task_prompt = data["task_prompt"]

task_summary = TasksSummary(
model_name=model_name,
task_id=task_id,
task_prompt=task_prompt,
avg_success_rate=round(float(scores.mean()), 3),
std_success_rate=round(float(scores.std()), 3),
avg_time=round(float(times.mean()), 3),
std_time=round(float(times.std()), 3),
repeats=len(scores),
)
task_summaries.append(task_summary)

tasks_summary_file = model_dir / TASKS_SUMMARY_FILE_NAME
with open(tasks_summary_file, "w", newline="") as f:
if task_summaries:
writer = csv.DictWriter(f, fieldnames=TasksSummary.model_fields.keys())
writer.writeheader()
for task_summary in task_summaries:
writer.writerow(task_summary.model_dump())
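
The three merge helpers above are invoked at the end of test_models further down in this diff; a standalone sketch of the same flow with a hypothetical run directory and model list:

```python
from pathlib import Path

run_dir = Path("experiments/run_2025-08-18_12-00-00")  # hypothetical run folder
model_names = ["gpt-4o", "llava:7b"]                    # hypothetical models

for model_name in model_names:
    # per-model aggregation over the numbered repeat folders
    # -> model_summary.csv and tasks_summary.csv
    merge_model_repeats_summary("vlm", model_name, run_dir)
    merge_tasks_summary("vlm", model_name, run_dir)

# cross-model aggregation -> benchmark_summary.csv under <run_dir>/vlm/
merge_benchmark_summary("vlm", run_dir, model_names)
```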


def test_dual_agents(
multimodal_llms: List[BaseChatModel],
tool_calling_models: List[BaseChatModel],
@@ -163,6 +363,7 @@ def test_models(
# for each bench configuration a separate run folder
now = datetime.now()
run_name = f"run_{now.strftime('%Y-%m-%d_%H-%M-%S')}"
run_dir = Path(out_dir) / run_name
for i, model_name in enumerate(model_names):
for u in range(bench_conf.repeats):
curr_out_dir = (
@@ -211,8 +412,32 @@ def test_models(
experiment_id=experiment_id,
bench_logger=bench_logger,
)

elif isinstance(bench_conf, VLMBenchmarkConfig):
vlm_tasks = vlm_benchmark.get_spatial_tasks()
vlm_benchmark.run_benchmark(
llm=llm,
out_dir=Path(curr_out_dir),
tasks=vlm_tasks,
bench_logger=bench_logger,
)

except Exception as e:
bench_logger.critical(f"BENCHMARK RUN FAILED: {e}")
bench_logger.critical(
f"{bench_conf.name} benchmark for {model_name}, vendor: {vendors[i]}, execution number: {u + 1}"
)
merge_results_logger = define_benchmark_logger(out_dir=Path(out_dir))
merge_results_logger.info(
f"Merging summaries for benchmark: {bench_conf.name}"
)

for model_name in model_names:
merge_model_repeats_summary(bench_conf.name, model_name, run_dir)
merge_tasks_summary(bench_conf.name, model_name, run_dir)

merge_benchmark_summary(bench_conf.name, run_dir, model_names)

merge_results_logger.info(
f"Summary merging completed for benchmark: {bench_conf.name}"
)
3 changes: 2 additions & 1 deletion src/rai_bench/rai_bench/vlm_benchmark/benchmark.py
@@ -133,14 +133,15 @@ def run_next(self, agent: CompiledStateGraph, experiment_id: uuid.UUID) -> None:
score = task.validate(output=structured_output)
else:
errors.append(f"Not valid structured output: {type(structured_output)}")
score = False
score = 0

te = time.perf_counter()
total_time = te - ts

self.logger.info(f"TASK SCORE: {score}, TOTAL TIME: {total_time:.3f}")

task_result = TaskResult(
task_id=task.task_id,
task_prompt=task.get_prompt(),
system_prompt=task.get_system_prompt(),
type=task.type,