|
2 | 2 |
|
3 | 3 | import json |
4 | 4 | import os |
| 5 | +import threading |
| 6 | +import time |
5 | 7 | import uuid |
6 | 8 |
|
7 | | -from datetime import datetime |
| 9 | +from datetime import datetime, timedelta |
8 | 10 | from typing import TYPE_CHECKING, Any |
9 | 11 |
|
10 | 12 |
|
@@ -35,11 +37,123 @@ def __init__(self, config: LanceGraphDBConfig): |
35 | 37 | self.user_name = config.user_name or "default" |
36 | 38 | self.dim = config.embedding_dimension |
37 | 39 |
|
| 40 | + # Compaction settings |
| 41 | + self.compaction_version_threshold = config.compaction_version_threshold |
| 42 | + self.compaction_interval_mins = config.compaction_interval_mins |
| 43 | + self.cleanup_older_than_days = config.cleanup_older_than_days |
| 44 | + |
38 | 45 | self.memories_uri = os.path.join(self.uri, "memories") |
39 | 46 | self.edges_uri = os.path.join(self.uri, "edges") |
40 | 47 |
|
41 | 48 | self._init_schema() |
42 | 49 |
|
| 50 | + # Start LanceDB background optimizer thread |
| 51 | + self._last_compact_versions = { |
| 52 | + "memories": self._get_memories_table().version, |
| 53 | + "edges": self._get_edges_table().version, |
| 54 | + } |
| 55 | + self._optimizer_thread = threading.Thread( |
| 56 | + target=self._db_optimizer_loop, |
| 57 | + daemon=True, |
| 58 | + name="lancedb-optimizer", |
| 59 | + ) |
| 60 | + self._optimizer_thread.start() |
| 61 | + |
| 62 | + def _db_optimizer_loop(self): |
| 63 | + """Background loop to periodically trigger table optimization.""" |
| 64 | + import schedule |
| 65 | + |
| 66 | + schedule.every(self.compaction_interval_mins).minutes.do(self._force_optimize) |
| 67 | + |
| 68 | + logger.info( |
| 69 | + f"Started LanceDB optimizer thread. Compaction interval: {self.compaction_interval_mins}m, " |
| 70 | + f"Version threshold: {self.compaction_version_threshold}" |
| 71 | + ) |
| 72 | + |
| 73 | + while True: |
| 74 | + try: |
| 75 | + # 1. Check version threshold |
| 76 | + self._check_and_trigger_compaction() |
| 77 | + |
| 78 | + # 2. Run scheduled fallback compaction |
| 79 | + schedule.run_pending() |
| 80 | + except Exception as e: |
| 81 | + logger.error(f"Error in LanceDB optimizer loop: {e}", stack_info=True) |
| 82 | + |
| 83 | + time.sleep(5) # Avoid busy waiting |
| 84 | + |
| 85 | + def _check_and_trigger_compaction(self): |
| 86 | + """Trigger compaction if any table's version diff exceeds the threshold.""" |
| 87 | + try: |
| 88 | + memories_ds = self._get_memories_table() |
| 89 | + if ( |
| 90 | + memories_ds.version - self._last_compact_versions["memories"] |
| 91 | + > self.compaction_version_threshold |
| 92 | + ): |
| 93 | + self._optimize_table("memories", memories_ds) |
| 94 | + |
| 95 | + edges_ds = self._get_edges_table() |
| 96 | + if ( |
| 97 | + edges_ds.version - self._last_compact_versions["edges"] |
| 98 | + > self.compaction_version_threshold |
| 99 | + ): |
| 100 | + self._optimize_table("edges", edges_ds) |
| 101 | + except Exception as e: |
| 102 | + logger.error(f"Failed to check compaction versions: {e}") |
| 103 | + |
| 104 | + def _optimize_table(self, table_name: str, ds): |
| 105 | + """Helper method to optimize a specific LanceDB table.""" |
| 106 | + try: |
| 107 | + current_version = ds.version |
| 108 | + last_version = self._last_compact_versions[table_name] |
| 109 | + |
| 110 | + if current_version > last_version: |
| 111 | + logger.info( |
| 112 | + f"Triggering LanceDB optimization for '{table_name}'. " |
| 113 | + f"Current version: {current_version}, Last compacted: {last_version}" |
| 114 | + ) |
| 115 | + |
| 116 | + stats = ds.optimize(cleanup_older_than=timedelta(days=self.cleanup_older_than_days)) |
| 117 | + |
| 118 | + stats_msg = "" |
| 119 | + if stats: |
| 120 | + compaction = getattr(stats, "compaction", None) |
| 121 | + if compaction: |
| 122 | + stats_msg += ( |
| 123 | + f" | Compaction: " |
| 124 | + f"-{getattr(compaction, 'fragments_removed', 0)}/" |
| 125 | + f"+{getattr(compaction, 'fragments_added', 0)} fragments, " |
| 126 | + f"-{getattr(compaction, 'files_removed', 0)}/" |
| 127 | + f"+{getattr(compaction, 'files_added', 0)} files" |
| 128 | + ) |
| 129 | + |
| 130 | + prune = getattr(stats, "prune", None) |
| 131 | + if prune: |
| 132 | + stats_msg += ( |
| 133 | + f" | Prune: -{getattr(prune, 'bytes_removed', 0)} bytes, " |
| 134 | + f"-{getattr(prune, 'old_versions_removed', 0)} versions" |
| 135 | + ) |
| 136 | + |
| 137 | + # Reload the table to get the updated version after optimization |
| 138 | + if table_name == "memories": |
| 139 | + ds = self._get_memories_table() |
| 140 | + elif table_name == "edges": |
| 141 | + ds = self._get_edges_table() |
| 142 | + |
| 143 | + self._last_compact_versions[table_name] = ds.version |
| 144 | + logger.info( |
| 145 | + f"LanceDB '{table_name}' optimization completed successfully. " |
| 146 | + f"New version: {self._last_compact_versions[table_name]}{stats_msg}" |
| 147 | + ) |
| 148 | + except Exception as e: |
| 149 | + logger.error(f"LanceDB '{table_name}' optimization failed: {e}") |
| 150 | + |
| 151 | + def _force_optimize(self): |
| 152 | + # Optimize Memories Table |
| 153 | + self._optimize_table("memories", self._get_memories_table()) |
| 154 | + # Optimize Edges Table |
| 155 | + self._optimize_table("edges", self._get_edges_table()) |
| 156 | + |
43 | 157 | def _init_schema(self): |
44 | 158 | import lancedb |
45 | 159 | import pyarrow as pa |
@@ -74,6 +188,34 @@ def _init_schema(self): |
74 | 188 | self.db.create_table("memories", data=empty_table) |
75 | 189 | logger.info("Created LanceDB table for memories.") |
76 | 190 |
|
| 191 | + try: |
| 192 | + ds = self.db.open_table("memories") |
| 193 | + |
| 194 | + # Create vector index (aligned with memory-lancedb TS implementation) |
| 195 | + import math |
| 196 | + |
| 197 | + row_count = ds.count_rows() |
| 198 | + if row_count > 256: # LanceDB requires at least 256 rows to train vector index |
| 199 | + num_partitions = max(1, math.floor(math.sqrt(row_count))) |
| 200 | + ds.create_index( |
| 201 | + metric="cosine", |
| 202 | + vector_column_name="embedding", |
| 203 | + num_partitions=num_partitions, |
| 204 | + ) |
| 205 | + logger.info( |
| 206 | + f"Created IVF_FLAT index for memories.embedding with metric=cosine, partitions={num_partitions}" |
| 207 | + ) |
| 208 | + else: |
| 209 | + logger.debug( |
| 210 | + f"Skipping vector index creation, not enough rows ({row_count} <= 256)" |
| 211 | + ) |
| 212 | + |
| 213 | + # Create full-text search index |
| 214 | + ds.create_fts_index("memory", replace=True) |
| 215 | + logger.info("Created FTS index for memories.memory") |
| 216 | + except Exception as e: |
| 217 | + logger.warning(f"Failed to create LanceDB indices: {e}") |
| 218 | + |
77 | 219 | if "edges" not in table_names: |
78 | 220 | edge_schema = pa.schema( |
79 | 221 | [ |
|
0 commit comments