Skip to content

Commit f88fb29

Browse files
committed
refactor(benchmark): move plot generation to analyze_combined, remove per-granularity plots
1 parent 200ff4f commit f88fb29

File tree

2 files changed

+183
-170
lines changed

2 files changed

+183
-170
lines changed

benchmark/bear/analyze_combined.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
from pathlib import Path
2+
3+
import matplotlib
4+
matplotlib.use("Agg")
5+
import matplotlib.pyplot as plt
6+
from matplotlib.axes import Axes
7+
from matplotlib.patches import Patch
8+
import numpy as np
9+
from rich.console import Console
10+
11+
from analyze_results import (
12+
DATA_DIR,
13+
_format_log_axis,
14+
_save_plot,
15+
load_measured_ostrich_results,
16+
load_ostrich_dm_by_version,
17+
load_ostrich_vm_by_version,
18+
load_ostrich_vq_median,
19+
load_results,
20+
load_tal_dm_by_version,
21+
load_tal_vm_by_version,
22+
load_tal_vq_median,
23+
)
24+
25+
console = Console()
26+
27+
# Line styles: system distinguished by color, granularity by dash pattern.
# No markers. Dashes for hourly use long on/off segments to stay readable.
# Each entry feeds Axes.plot via _plot_line: "linestyle" is either a solid
# line or a (offset, (on, off)) dash tuple; "label" is the legend text.
STYLES = {
    "tal_daily": {"color": "#0072B2", "linestyle": "-", "label": "TAL daily (89 ver.)"},
    "tal_hourly": {"color": "#0072B2", "linestyle": (0, (8, 4)), "label": "TAL hourly (1,299 ver.)"},
    "ost_daily": {"color": "#D55E00", "linestyle": "-", "label": "OSTRICH daily (89 ver.)"},
    "ost_hourly": {"color": "#D55E00", "linestyle": (0, (8, 4)), "label": "OSTRICH hourly (1,299 ver.)"},
}
35+
36+
37+
def _load_granularity_data(granularity: str) -> tuple[dict, list[Path]]:
    """Load TAL benchmark results and OSTRICH raw-output paths for one granularity.

    Args:
        granularity: Dataset granularity name (e.g. "daily" or "hourly"),
            used to build the input file names under DATA_DIR.

    Returns:
        A ``(data, ostrich_raw_files)`` tuple: ``data`` is the parsed TAL
        results from ``load_results``; ``ostrich_raw_files`` lists the raw
        OSTRICH output files for the "p" and "po" triple patterns.  The raw
        files may not exist on disk — callers check with ``.exists()``.
    """
    results_file = DATA_DIR / f"benchmark_results_{granularity}.json"
    ostrich_results_file = DATA_DIR / f"ostrich_benchmark_results_{granularity}.json"
    ostrich_raw_files = [DATA_DIR / f"ostrich_raw_{pt}_{granularity}.txt" for pt in ["p", "po"]]
    data = load_results(results_file)
    # Return value intentionally discarded — presumably called for its side
    # effects (registering measured OSTRICH results). NOTE(review): confirm
    # against load_measured_ostrich_results in analyze_results.
    load_measured_ostrich_results(ostrich_results_file)
    return data, ostrich_raw_files
44+
45+
46+
def _normalize_keys(data: dict[int, float]) -> tuple[list[float], list[float]]:
47+
versions = sorted(data.keys())
48+
max_v = max(versions)
49+
pct = [v / max_v * 100 for v in versions]
50+
vals = [data[v] for v in versions]
51+
return pct, vals
52+
53+
54+
def _plot_line(ax: Axes, pct: list[float], vals: list[float], style_key: str) -> None:
    """Draw one series on *ax* using the shared style registered in STYLES."""
    style = STYLES[style_key]
    ax.plot(
        pct,
        vals,
        color=style["color"],
        linestyle=style["linestyle"],
        linewidth=1.5,
        label=style["label"],
    )
58+
59+
60+
def _plot_line_chart(ax: Axes,
                     daily_data: dict[int, float], hourly_data: dict[int, float],
                     daily_ost_data: dict[int, float] | None,
                     hourly_ost_data: dict[int, float] | None) -> None:
    """Plot the TAL daily/hourly series plus optional OSTRICH series on *ax*.

    TAL series are always drawn; each OSTRICH series is drawn only when its
    dict is truthy (non-None and non-empty).
    """
    _plot_line(ax, *_normalize_keys(daily_data), "tal_daily")
    _plot_line(ax, *_normalize_keys(hourly_data), "tal_hourly")
    for ost_data, style_key in ((daily_ost_data, "ost_daily"),
                                (hourly_ost_data, "ost_hourly")):
        if ost_data:
            _plot_line(ax, *_normalize_keys(ost_data), style_key)
74+
75+
76+
def plot_vm_combined(daily_data: dict, daily_ost: list[Path],
                     hourly_data: dict, hourly_ost: list[Path],
                     plot_dir: Path) -> None:
    """Render the combined VM comparison plot and save it as ``vm_comparison``.

    Overlays TAL daily/hourly per-version medians with the OSTRICH ones when
    the corresponding raw OSTRICH files exist on disk.
    """
    def measured_or_none(raw_files: list[Path]) -> dict | None:
        # Only parse OSTRICH output when at least one raw file is present.
        if any(f.exists() for f in raw_files):
            return load_ostrich_vm_by_version(raw_files)
        return None

    fig, ax = plt.subplots(figsize=(8, 5))
    tal_daily = load_tal_vm_by_version(daily_data["results"]["vm"])
    tal_hourly = load_tal_vm_by_version(hourly_data["results"]["vm"])
    _plot_line_chart(ax, tal_daily, tal_hourly,
                     measured_or_none(daily_ost), measured_or_none(hourly_ost))
    _format_log_axis(ax)
    ax.set_xlabel("Version (% of total)")
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("VM: median across all triple patterns")
    ax.legend(fontsize=9, handlelength=3)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    _save_plot(fig, plot_dir, "vm_comparison")
94+
95+
96+
def plot_dm_combined(daily_data: dict, daily_ost: list[Path],
                     hourly_data: dict, hourly_ost: list[Path],
                     plot_dir: Path) -> None:
    """Render the combined DM comparison plot and save it as ``dm_comparison``.

    Overlays TAL daily/hourly per-target-version medians (deltas from V0)
    with the OSTRICH ones when the corresponding raw OSTRICH files exist.
    """
    def measured_or_none(raw_files: list[Path]) -> dict | None:
        # Only parse OSTRICH output when at least one raw file is present.
        if any(f.exists() for f in raw_files):
            return load_ostrich_dm_by_version(raw_files)
        return None

    fig, ax = plt.subplots(figsize=(8, 5))
    tal_daily = load_tal_dm_by_version(daily_data["results"]["dm"])
    tal_hourly = load_tal_dm_by_version(hourly_data["results"]["dm"])
    _plot_line_chart(ax, tal_daily, tal_hourly,
                     measured_or_none(daily_ost), measured_or_none(hourly_ost))
    _format_log_axis(ax)
    ax.set_xlabel("Delta target version (% of total)")
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("DM: median across all triple patterns from V0")
    ax.legend(fontsize=9, handlelength=3)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    _save_plot(fig, plot_dir, "dm_comparison")
114+
115+
116+
def plot_vq_combined(daily_data: dict, daily_ost: list[Path],
                     hourly_data: dict, hourly_ost: list[Path],
                     plot_dir: Path) -> None:
    """Render the combined VQ bar chart and save it as ``vq_comparison``.

    One bar group per system (TAL always; OSTRICH when any raw file exists),
    with a plain bar for daily and a hatched bar for hourly.

    Bug fix vs. the previous version: when OSTRICH raw files existed for
    only ONE granularity, ``groups``/``x`` had two entries but one of the
    value lists had one, so ``ax.bar`` raised a shape-mismatch error.
    Missing OSTRICH values are now padded with NaN, which draws no bar.
    """
    fig, ax = plt.subplots(figsize=(7, 5))

    has_ost_d = any(f.exists() for f in daily_ost)
    has_ost_h = any(f.exists() for f in hourly_ost)

    groups = ["TAL"]
    if has_ost_d or has_ost_h:
        groups.append("OSTRICH")
    x = np.arange(len(groups))
    width = 0.35

    daily_vals = [load_tal_vq_median(daily_data["results"]["vq"])]
    hourly_vals = [load_tal_vq_median(hourly_data["results"]["vq"])]
    if len(groups) == 2:
        # Pad with NaN when one granularity has no OSTRICH data so both
        # value lists always match len(x).
        daily_vals.append(load_ostrich_vq_median(daily_ost) if has_ost_d else float("nan"))
        hourly_vals.append(load_ostrich_vq_median(hourly_ost) if has_ost_h else float("nan"))

    # System color is carried by the bars (blue = TAL, orange = OSTRICH);
    # the legend only distinguishes granularity, so its patches are uncolored.
    colors = list(("#0072B2", "#D55E00")[:len(groups)])
    bars_d = ax.bar(x - width / 2, daily_vals, width,
                    color=colors, edgecolor="black")
    bars_h = ax.bar(x + width / 2, hourly_vals, width,
                    color=colors, edgecolor="black", hatch="//")

    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label="Daily (89 ver.)"),
        Patch(facecolor="white", edgecolor="black", hatch="//", label="Hourly (1,299 ver.)"),
    ]
    ax.legend(handles=legend_handles, fontsize=9)

    # Annotate each bar with its value; skip the NaN padding bars.
    for bars in [bars_d, bars_h]:
        for bar in bars:
            val = bar.get_height()
            if val == val:  # NaN != NaN: only label real measurements
                ax.text(bar.get_x() + bar.get_width() / 2, val, f"{val:.2f}",
                        ha="center", va="bottom", fontsize=8)

    ax.set_xticks(x)
    ax.set_xticklabels(groups)
    _format_log_axis(ax)
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("VQ: median across all triple patterns")
    ax.grid(True, alpha=0.3, axis="y")
    fig.tight_layout()
    _save_plot(fig, plot_dir, "vq_comparison")
166+
167+
168+
def main() -> None:
    """Entry point: load both granularities and emit the combined plots."""
    plot_dir = DATA_DIR / "analysis" / "combined" / "plots"
    console.rule("[bold]Loading data")

    daily_data, daily_ost = _load_granularity_data("daily")
    hourly_data, hourly_ost = _load_granularity_data("hourly")

    console.rule("[bold]Generating combined plots")
    plot_args = (daily_data, daily_ost, hourly_data, hourly_ost, plot_dir)
    for plot_fn in (plot_vm_combined, plot_dm_combined, plot_vq_combined):
        plot_fn(*plot_args)
    console.print("[bold green]Done.[/bold green]")
180+
181+
182+
# Script entry point: run the combined analysis when executed directly.
if __name__ == "__main__":
    main()

benchmark/bear/analyze_results.py

Lines changed: 0 additions & 170 deletions
Original file line numberDiff line numberDiff line change
@@ -318,171 +318,6 @@ def _save_plot(fig: Figure, plot_dir: Path, name: str) -> None:
318318
console.print(f" Saved: {plot_dir / name}.{{pdf,jpg}}")
319319

320320

321-
def plot_vm_comparison(tal_vm: List[dict], ostrich_raw_files: List[Path], plot_dir: Path) -> None:
    """Plot per-version VM medians for TAL, and OSTRICH when raw files exist.

    Args:
        tal_vm: TAL VM benchmark result records.
        ostrich_raw_files: Raw OSTRICH output files; the OSTRICH line is
            drawn only if at least one of them exists.
        plot_dir: Directory where ``vm_comparison`` is saved via _save_plot.
    """
    tal_data = load_tal_vm_by_version(tal_vm)
    fig, ax = plt.subplots(figsize=(8, 5))
    versions = sorted(tal_data.keys())
    ax.plot(versions, [tal_data[v] for v in versions], label="TAL", marker="", linewidth=1.5)
    if any(f.exists() for f in ostrich_raw_files):
        ost_data = load_ostrich_vm_by_version(ostrich_raw_files)
        ost_versions = sorted(ost_data.keys())
        ax.plot(ost_versions, [ost_data[v] for v in ost_versions], label="OSTRICH", marker="", linewidth=1.5)
    _format_log_axis(ax)
    ax.set_xlabel("Version")
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("VM: median across all triple patterns")
    ax.legend()
    ax.grid(True, alpha=0.3)
    _save_plot(fig, plot_dir, "vm_comparison")
337-
338-
339-
def plot_dm_comparison(tal_dm: List[dict], ostrich_raw_files: List[Path], plot_dir: Path) -> None:
    """Plot per-target-version DM medians (deltas from V0) for TAL and OSTRICH.

    The OSTRICH line is drawn only if at least one raw file exists.
    NOTE(review): the TAL line uses ``marker="o"`` here, unlike the VM plot's
    ``marker=""`` — possibly intentional, possibly a leftover; confirm.

    Args:
        tal_dm: TAL DM benchmark result records.
        ostrich_raw_files: Raw OSTRICH output files.
        plot_dir: Directory where ``dm_comparison`` is saved via _save_plot.
    """
    tal_data = load_tal_dm_by_version(tal_dm)
    fig, ax = plt.subplots(figsize=(8, 5))
    versions = sorted(tal_data.keys())
    ax.plot(versions, [tal_data[v] for v in versions], label="TAL", marker="o", linewidth=1.5, markersize=4)
    if any(f.exists() for f in ostrich_raw_files):
        ost_data = load_ostrich_dm_by_version(ostrich_raw_files)
        ost_versions = sorted(ost_data.keys())
        ax.plot(ost_versions, [ost_data[v] for v in ost_versions], label="OSTRICH", marker="", linewidth=1.5)
    _format_log_axis(ax)
    ax.set_xlabel("Version (delta from V0)")
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("DM: median across all triple patterns from V0")
    ax.legend()
    ax.grid(True, alpha=0.3)
    _save_plot(fig, plot_dir, "dm_comparison")
355-
356-
357-
def plot_vq_comparison(tal_vq: List[dict], ostrich_raw_files: List[Path], plot_dir: Path) -> None:
    """Plot VQ medians as one bar per system (TAL, plus OSTRICH when present).

    Args:
        tal_vq: TAL VQ benchmark result records.
        ostrich_raw_files: Raw OSTRICH output files; the OSTRICH bar is added
            only if at least one of them exists.
        plot_dir: Directory where ``vq_comparison`` is saved via _save_plot.
    """
    systems = ["TAL"]
    values = [load_tal_vq_median(tal_vq)]
    if any(f.exists() for f in ostrich_raw_files):
        systems.append("OSTRICH")
        values.append(load_ostrich_vq_median(ostrich_raw_files))
    fig, ax = plt.subplots(figsize=(6, 5))
    bars = ax.bar(systems, values)
    # Annotate each bar with its value just above the bar top.
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.2f}",
                ha="center", va="bottom", fontsize=9)
    _format_log_axis(ax)
    ax.set_ylabel("Lookup time (ms)")
    ax.set_title("VQ: median across all triple patterns")
    ax.grid(True, alpha=0.3, axis="y")
    _save_plot(fig, plot_dir, "vq_comparison")
373-
374-
375-
def plot_by_pattern(tal_results: List[dict], ostrich_raw_files: List[Path],
                    query_type: str, load_tal_fn, load_ost_fn, plot_dir: Path,
                    x_label: str, version_key: str) -> None:
    """Draw side-by-side subplots of medians split by triple-pattern type.

    The left subplot shows "?P?" ("p") patterns and the right "?PO" ("po")
    patterns, sharing the y axis.  ``load_tal_fn`` / ``load_ost_fn`` are
    loader callables that accept a ``pattern_filter`` keyword and return a
    version -> median mapping.  The OSTRICH line is drawn only when at
    least one raw file exists.  Saved as ``{query_type}_by_pattern``.

    NOTE(review): ``version_key`` is accepted but never used in this body —
    likely a leftover parameter; confirm before removing.
    """
    has_ostrich = any(f.exists() for f in ostrich_raw_files)
    fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)
    for i, pt in enumerate(["p", "po"]):
        ax = axes[i]
        tal_data = load_tal_fn(tal_results, pattern_filter=pt)
        versions = sorted(tal_data.keys())
        ax.plot(versions, [tal_data[v] for v in versions], label="TAL", linewidth=1.5)
        if has_ostrich:
            ost_data = load_ost_fn(ostrich_raw_files, pattern_filter=pt)
            ost_versions = sorted(ost_data.keys())
            ax.plot(ost_versions, [ost_data[v] for v in ost_versions], label="OSTRICH", linewidth=1.5)
        _format_log_axis(ax)
        ax.set_xlabel(x_label)
        ax.set_title(f"?{'P?' if pt == 'p' else 'PO'} patterns")
        ax.legend()
        ax.grid(True, alpha=0.3)
    axes[0].set_ylabel("Lookup time (ms)")
    fig.suptitle(f"{query_type.upper()}: median by pattern type", fontsize=13)
    fig.tight_layout()
    _save_plot(fig, plot_dir, f"{query_type}_by_pattern")
398-
399-
400-
def generate_plots(data: dict, ostrich_raw_files: List[Path], plot_dir: Path,
                   disk_usage: dict[str, int | None] | None = None) -> None:
    """Generate all comparison plots for one benchmark run.

    Args:
        data: Parsed benchmark results; plots are produced only for the
            query types ("vm", "dm", "vq") that have result records.
        ostrich_raw_files: Raw OSTRICH outputs, passed through to each plot.
        plot_dir: Output directory for the figures.
        disk_usage: Optional storage measurements; when truthy, a storage
            comparison plot is also produced.
    """
    results = data.get("results", {})
    vm_results = results.get("vm", [])
    dm_results = results.get("dm", [])
    vq_results = results.get("vq", [])

    if vm_results:
        plot_vm_comparison(vm_results, ostrich_raw_files, plot_dir)
        plot_by_pattern(vm_results, ostrich_raw_files, "vm",
                        load_tal_vm_by_version, load_ostrich_vm_by_version,
                        plot_dir, "Version", "version_index")
    if dm_results:
        plot_dm_comparison(dm_results, ostrich_raw_files, plot_dir)
        plot_by_pattern(dm_results, ostrich_raw_files, "dm",
                        load_tal_dm_by_version, load_ostrich_dm_by_version,
                        plot_dir, "Version (delta from V0)", "version_end")
    if vq_results:
        plot_vq_comparison(vq_results, ostrich_raw_files, plot_dir)

    # Storage and memory
    if disk_usage:
        plot_storage_comparison(disk_usage, plot_dir)
    # NOTE(review): indentation reconstructed — memory data does not depend
    # on disk_usage, so this call is taken to be unconditional; confirm.
    plot_memory_comparison(data, plot_dir)
424-
425-
426-
def plot_storage_comparison(disk_usage: dict[str, int | None], plot_dir: Path) -> None:
    """Plot total on-disk storage of TAL (OCDM + QLever) vs. OSTRICH.

    Args:
        disk_usage: Must contain the keys ``ocdm_dataset_bytes``,
            ``ocdm_provenance_bytes``, ``qlever_index_bytes`` and
            ``ostrich_store_bytes`` (values may be None; a missing key
            raises KeyError).
        plot_dir: Directory where ``storage_comparison`` is saved.

    Skips plotting entirely when no component reported a size.
    """
    ocdm_ds = disk_usage["ocdm_dataset_bytes"]
    ocdm_prov = disk_usage["ocdm_provenance_bytes"]
    qlever = disk_usage["qlever_index_bytes"]
    ostrich = disk_usage["ostrich_store_bytes"]

    if ocdm_ds is None and qlever is None and ostrich is None:
        console.print(" [dim]Skipping storage_comparison (no data)[/dim]")
        return

    fig, ax = plt.subplots(figsize=(6, 5))
    systems = []
    sizes_mb = []

    if ocdm_ds is not None or qlever is not None:
        # TAL storage = OCDM dataset + provenance + QLever index;
        # missing components count as 0 bytes.
        ocdm_total = (ocdm_ds or 0) + (ocdm_prov or 0)
        tal_total = ocdm_total + (qlever or 0)
        systems.append("TAL\n(OCDM + QLever)")
        sizes_mb.append(tal_total / 1048576)  # bytes -> MiB (2**20)

    if ostrich is not None:
        systems.append("OSTRICH")
        sizes_mb.append(ostrich / 1048576)

    bars = ax.bar(systems, sizes_mb, color=["#1f77b4", "#ff7f0e"][:len(systems)])
    # Annotate each bar with its size just above the bar top.
    for bar, val in zip(bars, sizes_mb):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.1f} MB",
                ha="center", va="bottom", fontsize=9)
    ax.set_ylabel("Storage (MB)")
    ax.set_title("Storage comparison")
    ax.grid(True, alpha=0.3, axis="y")
    _save_plot(fig, plot_dir, "storage_comparison")
458-
459-
460-
def plot_memory_comparison(data: dict, plot_dir: Path) -> None:
    """Plot TAL's median peak memory per query type (VM/DM/VQ) as bars.

    Records without a ``median_memory_bytes`` value are ignored; query types
    with no valid records are omitted, and the whole plot is skipped when
    none remain.

    Args:
        data: Parsed benchmark results.
        plot_dir: Directory where ``memory_comparison`` is saved.
    """
    results = data.get("results", {})
    query_types = []
    medians_kb = []
    for qt in ["vm", "dm", "vq"]:
        qt_results = results.get(qt, [])
        valid = [r["median_memory_bytes"] for r in qt_results if r.get("median_memory_bytes") is not None]
        if valid:
            query_types.append(qt.upper())
            medians_kb.append(statistics.median(valid) / 1024)  # bytes -> KiB

    if not query_types:
        console.print(" [dim]Skipping memory_comparison (no data)[/dim]")
        return

    fig, ax = plt.subplots(figsize=(6, 5))
    bars = ax.bar(query_types, medians_kb)
    # Annotate each bar with its median just above the bar top.
    for bar, val in zip(bars, medians_kb):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.0f} KB",
                ha="center", va="bottom", fontsize=9)
    ax.set_ylabel("Peak memory (KB)")
    ax.set_title("TAL median peak memory by query type")
    ax.grid(True, alpha=0.3, axis="y")
    _save_plot(fig, plot_dir, "memory_comparison")
484-
485-
486321
def generate_comparison_table(tal_results: dict, ocdm_timing_file: Path, qlever_timing_file: Path) -> List[dict]:
487322
rows = []
488323
for system_name, published in PUBLISHED_RESULTS.items():
@@ -782,11 +617,6 @@ def main():
782617
console.print()
783618
print_comparison_table(comparison)
784619

785-
console.rule("[bold]Generating plots")
786-
ostrich_raw_files = [DATA_DIR / f"ostrich_raw_{pt}_{args.granularity}.txt" for pt in ["p", "po"]]
787-
plot_dir = output_dir / "plots"
788-
generate_plots(data, ostrich_raw_files, plot_dir, disk_usage)
789-
790620

791621
if __name__ == "__main__":
792622
main()

0 commit comments

Comments
 (0)