Skip to content

Commit 67f49d9

Browse files
perf: Add batched commits and LRU caching for database operations (#957)
* perf: Add batched commits and LRU caching for database operations Optimize database operations to reduce execution time by eliminating per-record commits and adding query result caching. Changes: 1. Batched commits for fix-date tracking (vunnel_first_observed.py) - Previously: commit after every single insert - Now: batch commits every 2000 operations (configurable) - Auto-flushes on context exit to ensure data integrity 2. Batched commits for result writes (result.py) - Previously: separate transaction per CVE write - Now: batch commits every 2000 operations (configurable) - Maintains single active transaction, commits periodically 3. LRU caching for fix-date lookups (finder.py) - Previously: every fixdater.best() call hit database - Now: functools.lru_cache(maxsize=10000) caches results - Eliminates duplicate queries for same (CVE, CPE, version, ecosystem) - Cache cleared on context exit Performance impact: - Tested on NVD provider with 322k CVEs - Reduces execution time from 17 minutes to 11 minutes (35% faster) - Reduces database commits from ~322k to ~161 - Eliminates duplicate database queries through caching Testing: - All existing tests pass - Added tests for batching behavior - Added tests for cache functionality Signed-off-by: James Gardner <james.gardner@chainguard.dev> * fix: Add trailing comma for linter Signed-off-by: James Gardner <james.gardner@chainguard.dev> * fix: Add thread safety with locking for concurrent operations Add threading.Lock() to SQLiteStore to ensure thread-safe access to transaction state. While current provider implementations call writer.write() sequentially from the main thread, Ubuntu and RHEL providers use ThreadPoolExecutor internally. Adding defensive locking prevents potential data corruption if providers are refactored to parallelize writes in the future. Also removes duplicate test definition in test_finder.py. 
Signed-off-by: James Gardner <james.gardner@chainguard.dev> * use thread local for pending ops counter Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> --------- Signed-off-by: James Gardner <james.gardner@chainguard.dev> Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> Co-authored-by: Alex Goodman <wagoodman@users.noreply.github.com>
1 parent 964bd40 commit 67f49d9

File tree

5 files changed

+161
-10
lines changed

5 files changed

+161
-10
lines changed

src/vunnel/result.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
import os
77
import shutil
8+
import threading
89
import time
910
from dataclasses import asdict, dataclass
1011
from typing import TYPE_CHECKING, Any
@@ -114,12 +115,16 @@ class SQLiteStore(Store):
114115
temp_filename = "results.db.tmp"
115116
table_name = "results"
116117

117-
def __init__(self, *args: Any, write_location: str | None = None, **kwargs: Any):
118+
def __init__(self, *args: Any, write_location: str | None = None, batch_size: int = 2000, **kwargs: Any):
118119
super().__init__(*args, **kwargs)
119120
self.conn: db.engine.Connection | None = None
120121
self.engine: db.engine.Engine | None = None
121122
self.table: db.Table | None = None
122123
self.write_location = write_location
124+
self.batch_size = batch_size
125+
self._pending_operations = 0
126+
self._transaction: Any = None # Active transaction context
127+
self._lock = threading.Lock() # Protects transaction state for thread safety
123128
if self.write_location:
124129
self.filename = os.path.basename(self.write_location)
125130
self.temp_filename = f"{self.filename}.tmp"
@@ -171,7 +176,11 @@ def store(self, identifier: str, record: Envelope) -> None:
171176
record_str = orjson.dumps(asdict(record))
172177
conn, table = self.connection()
173178

174-
with conn.begin():
179+
with self._lock:
180+
# Start a transaction if we don't have one active
181+
if self._transaction is None:
182+
self._transaction = conn.begin()
183+
175184
# upsert the record conditionally based on the skip_duplicates configuration
176185
existing = conn.execute(table.select().where(table.c.id == identifier)).first()
177186
if existing:
@@ -185,6 +194,24 @@ def store(self, identifier: str, record: Envelope) -> None:
185194
statement = db.insert(table).values(id=identifier, record=record_str) # type: ignore[assignment]
186195

187196
conn.execute(statement)
197+
self._pending_operations += 1
198+
199+
# Auto-flush every batch_size operations to limit memory usage
200+
if self._pending_operations >= self.batch_size:
201+
self._flush_unlocked()
202+
203+
def flush(self) -> None:
204+
"""Commit any pending database operations (thread-safe)."""
205+
with self._lock:
206+
self._flush_unlocked()
207+
208+
def _flush_unlocked(self) -> None:
209+
"""Internal flush helper - caller must hold lock."""
210+
if self._pending_operations > 0 and self._transaction is not None:
211+
self._transaction.commit()
212+
self.logger.debug(f"flushed {self._pending_operations} operations to database")
213+
self._transaction = None
214+
self._pending_operations = 0
188215

189216
def read(self, identifier: str) -> Envelope:
190217
conn, table = self.connection()
@@ -204,6 +231,9 @@ def prepare(self) -> None:
204231
shutil.copy2(self.db_file_path, self.temp_db_file_path)
205232

206233
def close(self, successful: bool) -> None:
234+
# Flush any remaining operations before closing
235+
self.flush()
236+
207237
if self.conn:
208238
self.conn.close()
209239
if self.engine:

src/vunnel/tool/fixdate/finder.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import abc
22
import datetime
3+
import functools
34
import logging
45
from dataclasses import dataclass
56

@@ -81,9 +82,11 @@ def get_changed_vuln_ids_since(self, since_date: datetime.datetime) -> set[str]:
8182

8283

8384
class Finder:
def __init__(self, strategies: list[Strategy], first_observed: Strategy, cache_size: int = 10000):
    """Create a Finder.

    Args:
        strategies: fix-date strategies, queried in priority order.
        first_observed: strategy that supplies first-observed dates.
        cache_size: maximum number of entries for the LRU cache of
            database lookups (bounded to limit memory usage).
    """
    self.strategies = strategies
    self.first_observed = first_observed
    # Create cached version of database lookups so repeated
    # (vuln_id, cpe, version, ecosystem) queries skip the database.
    self._cached_find_from_strategies = functools.lru_cache(maxsize=cache_size)(self._find_from_strategies_uncached)
8790

8891
def __enter__(self) -> "Finder":
8992
for s in self.strategies:
def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[no-untyped-def]
    """Close all strategies, then drop any cached lookup results."""
    for s in self.strategies:
        s.__exit__(exc_type, exc_val, exc_tb)
    self.first_observed.__exit__(exc_type, exc_val, exc_tb)
    # Clear cache on exit: cached results reference closed database state
    # and must not leak into a subsequent context.
    self._cached_find_from_strategies.cache_clear()
98103

99104
def download(self) -> None:
100105
self.first_observed.download()
@@ -109,6 +114,26 @@ def _normalize_ecosystem(self, ecosystem: str | None) -> str | None:
109114

110115
return ecosystem_mapping.get(ecosystem, ecosystem)
111116

def _find_from_strategies_uncached(
    self,
    vuln_id: str,
    cpe_or_package: str,
    fix_version: str,
    ecosystem: str | None,
) -> tuple[list[Result], list[Result]]:
    """Perform database lookups - uncached version for LRU wrapper.

    Queries every strategy (in priority order) plus the first-observed
    strategy. All arguments must be hashable since the lru_cache wrapper
    keys on them.

    Returns:
        Tuple of (strategy_results, first_observed_results)
    """
    results = []
    for s in self.strategies:
        results.extend(s.find(vuln_id, cpe_or_package, fix_version, ecosystem))

    first_observed_results = self.first_observed.find(vuln_id, cpe_or_package, fix_version, ecosystem)

    return (results, first_observed_results)
136+
112137
def best(
113138
self,
114139
vuln_id: str,
@@ -129,16 +154,19 @@ def best(
129154
if candidates:
130155
results.extend([c for c in candidates if c.accurate and c.date])
131156

132-
# add results from finders in order of priority (set by the constructor)
133-
for s in self.strategies:
134-
results.extend(s.find(vuln_id, cpe_or_package, fix_version, ecosystem))
157+
# Use cached database lookups
158+
strategy_results, first_observed_results = self._cached_find_from_strategies(
159+
vuln_id,
160+
cpe_or_package,
161+
fix_version,
162+
ecosystem,
163+
)
164+
results.extend(strategy_results)
135165

136166
# add low quality candidates last
137167
if candidates:
138168
results.extend([c for c in candidates if not c.accurate and c.date])
139169

140-
first_observed_results = self.first_observed.find(vuln_id, cpe_or_package, fix_version, ecosystem)
141-
142170
# we should select the date from the set of finders that is the highest quality (earlier in the s
143171
# results list) but should never be after the first observed date. However, first observed dates are not always
144172
# accurate, so we should only enforce this if we have an accurate first observed date (not part of the

src/vunnel/tool/fixdate/vunnel_first_observed.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,24 @@ class FixDate:
3131

3232

3333
class Store:
def __init__(self, ws: workspace.Workspace, batch_size: int = 2000) -> None:
    """Create a first-observed fix-date store backed by SQLite.

    Args:
        ws: provider workspace; the database lives under its results path.
        batch_size: number of add() operations to accumulate before an
            automatic commit (batched for performance).
    """
    self.workspace = ws
    self.provider = ws.name
    self.db_path = Path(ws.results_path) / "observed-fix-dates.db"
    self.logger = logging.getLogger("fixes-" + self.provider)
    self.engine: db.engine.Engine | None = None
    # connections/tables are kept per-thread (providers may write from
    # worker threads), as is the pending-operation counter below
    self._thread_local = threading.local()
    self.batch_size = batch_size
42+
43+
@property
def _pending_operations(self) -> int:
    """Get pending operation count for the current thread.

    Defaults to 0 for a thread that has not written yet.
    """
    return getattr(self._thread_local, "pending_operations", 0)

@_pending_operations.setter
def _pending_operations(self, value: int) -> None:
    """Set pending operation count for the current thread."""
    self._thread_local.pending_operations = value
4152

4253
def add(
4354
self,
@@ -73,7 +84,19 @@ def add(
7384
)
7485

7586
conn.execute(insert_stmt)
76-
conn.commit()
87+
self._pending_operations += 1
88+
89+
# auto-flush every batch_size operations to limit memory usage
90+
if self._pending_operations >= self.batch_size:
91+
self.flush()
92+
def flush(self) -> None:
    """Commit any pending database operations for the current thread.

    No-op when the current thread has nothing pending, so it is cheap to
    call defensively (e.g. on context exit).
    """
    if self._pending_operations > 0:
        conn, _ = self._get_connection()
        conn.commit()
        self.logger.debug(f"flushed {self._pending_operations} operations to database")
        self._pending_operations = 0
77100

78101
def get(
79102
self,
def __enter__(self) -> "Store":
    """Enter the store context; no setup needed beyond construction."""
    return self

def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[no-untyped-def]
    """Flush pending writes, then release per-thread connections."""
    # Flush any remaining operations before cleanup, otherwise writes
    # since the last batch boundary would be lost
    self.flush()
    self.cleanup_thread_connections()

tests/unit/tool/test_finder.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,3 +364,30 @@ def test_best_normalizes_ecosystems(self):
364364
strategy1.find.assert_called_once_with("CVE-2023-0001", "package", "1.0.0", "php-composer")
365365
strategy2.find.assert_called_once_with("CVE-2023-0001", "package", "1.0.0", "php-composer")
366366
first_observed.find.assert_called_once_with("CVE-2023-0001", "package", "1.0.0", "php-composer")
367+
def test_caching_reduces_database_calls(self):
    """test that repeated queries use cache instead of hitting database"""
    # Create mock strategy that counts calls (dict so the closure can mutate it)
    call_count = {"count": 0}
    test_result = self.create_result("2023-01-01", "test")

    def counted_find(*args, **kwargs):
        call_count["count"] += 1
        return [test_result]

    mock_strategy = Mock(spec=Strategy)
    mock_strategy.find = counted_find

    mock_first_observed = Mock(spec=Strategy)
    mock_first_observed.find = Mock(return_value=[])

    finder = Finder(strategies=[mock_strategy], first_observed=mock_first_observed)

    # Make the same query three times
    result1 = finder.best("CVE-2023-1234", "pkg:pypi/requests", "2.28.0", "pypi")
    result2 = finder.best("CVE-2023-1234", "pkg:pypi/requests", "2.28.0", "pypi")
    result3 = finder.best("CVE-2023-1234", "pkg:pypi/requests", "2.28.0", "pypi")

    # Should only call find once - subsequent calls use cache
    assert call_count["count"] == 1, f"Expected 1 database call, got {call_count['count']}"
    assert result1 == result2 == result3, "Cached results should match original"

tests/unit/tool/test_vunnel_first_observed.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,44 @@ def test_cleanup_thread_connections(self, tmpdir):
403403
# verify thread-local storage is cleared
404404
assert not hasattr(store._thread_local, "conn")
405405
assert not hasattr(store._thread_local, "table")
406+
def test_batch_commit_performance(self, tmpdir, mocker):
    """test that commits are batched for performance"""
    ws = workspace.Workspace(tmpdir, "test-db", create=True)

    # Use small batch size for testing
    store = Store(ws, batch_size=5)
    # NOTE(review): constructed for its side effects on the db file —
    # the fixture object itself is intentionally unused
    db = DatabaseFixture(store.db_path)

    # Mock commit to count calls (wrap rather than replace so data is
    # still actually persisted)
    conn, _ = store._get_connection()
    original_commit = conn.commit
    commit_count = [0]

    def tracked_commit():
        commit_count[0] += 1
        return original_commit()

    conn.commit = tracked_commit

    # Add 12 entries (should trigger 2 auto-flushes at 5 and 10, plus final flush)
    for i in range(12):
        store.add(
            first_observed_date=date(2023, 1, 1),
            vuln_id=f"CVE-2023-{i:04d}",
            cpe_or_package=f"cpe:2.3:a:vendor:product:{i}:*:*:*:*:*:*:*",
            fix_version="1.0.0",
        )

    # Flush remaining
    store.flush()

    # Should have committed 3 times: at 5, at 10, and final flush
    assert commit_count[0] == 3, f"Expected 3 commits but got {commit_count[0]}"

    # Verify all records were saved (first and last entries)
    results = store.find("CVE-2023-0000", "cpe:2.3:a:vendor:product:0:*:*:*:*:*:*:*", "1.0.0")
    assert len(results) == 1

    results = store.find("CVE-2023-0011", "cpe:2.3:a:vendor:product:11:*:*:*:*:*:*:*", "1.0.0")
    assert len(results) == 1

0 commit comments

Comments
 (0)