
Commit 6e4703f

Authored by baasitsharief, mliu-cloudera, and actions-user
feat: Add support for Excel files (#323)
* feat: add support for reading XLSX files
* feat: add pandas excel dependencies
* fix: prevent lambda function from capturing loop variable in EmbeddingIndexer
* Use Executor.submit() args
* feat: renamed XlsxReader to ExcelReader for broader Excel file support
* refactor: renaming XlsxSplitter and fixing mypy errors
* refactor: rename config classes for consistency
* Update release version to dev-testing
* Cast TextNodes directly
* Simplify model_source if-else
* Remove implicit port conversion in config_to_env to stringify of None
* Improve Qdrant configuration and performance with environment variables and gRPC support
* Remove unused environment variables and hardcode embedding concurrency and boto3 max pool connections
* minor fixes for configuration
* Reduce EmbeddingIndexer batch size and add botocore config to BedrockModelProvider
* Adjust batch size in EmbeddingIndexer based on reader type to prevent Qdrant timeouts
* Add support for CSVReader in EmbeddingIndexer and adjust batch size accordingly
* Refactor batch sizes and sampling for EmbeddingIndexer and SummaryIndexer to improve performance with tabular documents
* Enhance ExcelReader to handle empty workbooks and ensure JSON serialization compatibility
* Enhance ExcelReader to handle null dataframes and improve JSON serialization
* fix: Use non-deprecated `map` function over `applymap` in ExcelReader
* Refactor batch sizes in EmbeddingIndexer and SummaryIndexer to use Qdrant-safe batches
* Adjust batch sizes in LlamaIndexQdrantVectorStore
* fix: mypy errors
* Update release version to dev-testing
* Refactor Qdrant configuration and ExcelReader for improved performance and compatibility
* fix: more mypy issues
* Update release version to dev-testing
* Enable Git LFS for prebuilt artifacts
* merge origin/main
* Update prebuilt artifacts with new versions
* Update batch sizes for Qdrant vector store and indexing
* fix: Increase memory for application to allow excel use cases
* Update llm-service/app/ai/indexing/readers/base_reader.py
  Co-authored-by: mliu-cloudera <[email protected]>
* Update llm-service/app/config.py
  Co-authored-by: mliu-cloudera <[email protected]>
* Update llm-service/app/ai/vector_stores/qdrant.py
  Co-authored-by: mliu-cloudera <[email protected]>
* Update llm-service/app/ai/indexing/embedding_indexer.py
  Co-authored-by: mliu-cloudera <[email protected]>
* fix: minor fixes and adjustments for consistency
* refactor: simplify batch size logic in embedding and summary indexers
* Update llm-service/app/ai/indexing/readers/base_reader.py
  Co-authored-by: mliu-cloudera <[email protected]>
* Update llm-service/app/ai/indexing/readers/base_reader.py
  Co-authored-by: mliu-cloudera <[email protected]>
* Update llm-service/app/ai/indexing/summary_indexer.py
  Co-authored-by: mliu-cloudera <[email protected]>
* refactor: reverting variable name batch_size to max_samples
* Update .DS_Store file in llm-service directory

---------

Co-authored-by: Michael Liu <[email protected]>
Co-authored-by: actions-user <[email protected]>
1 parent 64e1a14 commit 6e4703f
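
Several bullets in the commit message describe how the new ExcelReader behaves: it reads every sheet of a workbook, tolerates empty or null dataframes, keeps the extracted values JSON-serializable, and uses the non-deprecated `DataFrame.map` instead of `applymap`. The snippet below is only a minimal sketch of that approach with pandas; the function name `read_sheets` and the exact serialization rules are illustrative assumptions, not the repository's actual ExcelReader.

```python
# Minimal sketch (assumed, not the repository's ExcelReader): read every sheet
# of a workbook with pandas and make the cell values JSON-serializable.
from typing import Any, Dict, List

import pandas as pd


def read_sheets(path: str) -> Dict[str, List[Dict[str, Any]]]:
    # sheet_name=None returns a {sheet_name: DataFrame} dict for all sheets.
    sheets = pd.read_excel(path, sheet_name=None)
    result: Dict[str, List[Dict[str, Any]]] = {}
    for name, df in sheets.items():
        if df is None or df.empty:
            # Empty workbooks/sheets still produce a valid (empty) entry.
            result[name] = []
            continue
        # DataFrame.map replaces the deprecated applymap; NaN/NaT become None
        # so the rows can be serialized to JSON.
        cleaned = df.map(lambda v: None if pd.isna(v) else v)
        result[name] = cleaned.to_dict(orient="records")
    return result
```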

File tree

21 files changed: +2356 −1822 lines

.project-metadata.yaml

Lines changed: 5 additions & 5 deletions
@@ -1,16 +1,16 @@
 name: RAG Studio
-description: |
+description: |
   "Build a RAG application to ask questions about your documents. Configuration for access to models will be available inside the application itself once it has been deployed."
 author: "Cloudera"
 date: "2024-09-10"
 specification_version: 1.0
 prototype_version: 1.0

 environment_variables:
-  UV_HTTP_TIMEOUT:
-    description: "Timeout for UV processing in seconds."
-    default: "60000"
-    required: false
+  UV_HTTP_TIMEOUT:
+    description: "Timeout for UV processing in seconds."
+    default: "60000"
+    required: false

 runtimes:
   - editor: JupyterLab

(The removed and added lines differ only in whitespace.)

backend/src/main/resources/application.properties

Lines changed: 8 additions & 0 deletions
@@ -55,3 +55,11 @@ otel.traces.exporter=none

 server.address=${API_HOST:127.0.0.1}
 server.port=${METADATA_APP_PORT:8080}
+
+# HikariCP Database Connection Pool Configuration
+spring.datasource.hikari.maximum-pool-size=10
+spring.datasource.hikari.minimum-idle=5
+spring.datasource.hikari.connection-timeout=30000
+spring.datasource.hikari.idle-timeout=300000
+spring.datasource.hikari.max-lifetime=1800000
+spring.datasource.hikari.leak-detection-threshold=60000

docker-compose.yaml

Lines changed: 2 additions & 0 deletions
@@ -52,6 +52,7 @@ services:
       - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
       - S3_RAG_DOCUMENT_BUCKET=cloudera-ai-rag-dev-us-west-2
       - QDRANT_HOST=qdrant
+      - QDRANT_GRPC_PORT=6334
       - API_URL=http://api:8080
       - MLFLOW_RECONCILER_DATA_PATH=/tmp
     depends_on:
@@ -66,5 +67,6 @@ services:
     image: qdrant/qdrant
     ports:
       - "6333:6333"
+      - "6334:6334" # gRPC port for better performance
     environment:
       - RUST_LOG=info
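
The QDRANT_GRPC_PORT variable and the 6334 port mapping expose Qdrant's gRPC endpoint alongside the default 6333 HTTP port. As a hedged sketch (using the public qdrant-client package, not the project's own QdrantVectorStore wrapper), a client could pick these values up from the environment like this:

```python
# Sketch: connect to Qdrant over gRPC using the docker-compose environment
# variables above. This is illustrative wiring, not the repository's code.
import os

from qdrant_client import QdrantClient

client = QdrantClient(
    host=os.environ.get("QDRANT_HOST", "localhost"),
    grpc_port=int(os.environ.get("QDRANT_GRPC_PORT", "6334")),
    prefer_grpc=True,  # use the faster gRPC transport for uploads and searches
)

# Simple smoke test: list existing collections over the gRPC channel.
print(client.get_collections())
```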

llm-service/.DS_Store

0 Bytes
Binary file not shown.

llm-service/app/ai/indexing/base.py

Lines changed: 6 additions & 0 deletions
@@ -18,6 +18,7 @@
 from .readers.pdf import PDFReader
 from .readers.pptx import PptxReader
 from .readers.simple_file import SimpleFileReader
+from .readers.excel import ExcelReader
 from ...config import settings

 logger = logging.getLogger(__name__)
@@ -30,6 +31,11 @@
     ".pptx": PptxReader,
     ".pptm": PptxReader,
     ".csv": CSVReader,
+    ".xlsx": ExcelReader,
+    ".xlsb": ExcelReader,
+    ".xlsm": ExcelReader,
+    ".xls": ExcelReader,
+    ".ods": ExcelReader,
     ".json": JSONReader,
     ".jpg": ImagesReader,
     ".jpeg": ImagesReader,

llm-service/app/ai/indexing/embedding_indexer.py

Lines changed: 24 additions & 5 deletions
@@ -49,6 +49,9 @@

 from .base import BaseTextIndexer
 from .readers.base_reader import ReaderConfig, ChunksResult
+from .readers.excel import ExcelReader
+from .readers.csv import CSVReader
+from ...ai.vector_stores.qdrant import QdrantVectorStore
 from ...ai.vector_stores.vector_store import VectorStore
 from ...services.utils import batch_sequence, flatten_sequence

@@ -78,6 +81,8 @@ def index_file(self, file_path: Path, document_id: str) -> None:

         reader_cls = self._get_reader_class(file_path)

+        is_tabular_document = reader_cls in (ExcelReader, CSVReader)
+
         reader = reader_cls(
             splitter=self.splitter,
             document_id=document_id,
@@ -99,7 +104,14 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         chunks_with_embeddings = flatten_sequence(self._compute_embeddings(nodes))

         acc = 0
-        for chunk_batch in batch_sequence(chunks_with_embeddings, 1000):
+        use_qdrant_safe_batches = isinstance(
+            self.chunks_vector_store, QdrantVectorStore
+        )
+        if use_qdrant_safe_batches and is_tabular_document:
+            batch_size = 256
+        else:
+            batch_size = 1000
+        for chunk_batch in batch_sequence(chunks_with_embeddings, batch_size):
             acc += len(chunk_batch)
             logger.debug(f"Adding {acc}/{len(nodes)} chunks to vector store")

@@ -125,13 +137,20 @@ def _compute_embeddings(
         batched_chunks = list(batch_sequence(chunks, 100))
         batched_texts = [[chunk.text for chunk in batch] for batch in batched_chunks]

-        with ThreadPoolExecutor(max_workers=20) as executor:
+        max_workers = 15
+        logger.debug("Using %s workers for embedding generation", max_workers)
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
-                    lambda b: (i, self.embedding_model.get_text_embedding_batch(b)),
-                    batch,
+                    lambda batch_text, batch_index: (
+                        batch_index,
+                        self.embedding_model.get_text_embedding_batch(batch_text),
+                    ),
+                    b,
+                    i,
                )
-                for i, batch in enumerate(batched_texts)
+                for i, b in enumerate(batched_texts)
            ]
            logger.debug(f"Waiting for {len(futures)} futures")
            for future in as_completed(futures):
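
The rewritten executor.submit() call above passes the batch and its index as arguments instead of closing over the loop variables, because a lambda defined in a loop reads those variables when it runs, not when it is created. A minimal, standalone illustration of the pitfall and the fix (illustrative code, not taken from the repository):

```python
# Demonstrates late binding of loop variables in lambdas vs. submit() arguments.
from concurrent.futures import ThreadPoolExecutor

items = ["a", "b", "c"]

with ThreadPoolExecutor(max_workers=3) as executor:
    # Buggy pattern: each lambda looks up `i` and `item` when it executes,
    # so tasks that run after the loop advances can all see the last values.
    buggy = [executor.submit(lambda: (i, item)) for i, item in enumerate(items)]

    # Fixed pattern: pass the current values to submit(), which binds them to
    # the lambda's parameters at submission time.
    fixed = [
        executor.submit(lambda idx, it: (idx, it), i, item)
        for i, item in enumerate(items)
    ]

print([f.result() for f in buggy])          # non-deterministic; may repeat (2, 'c')
print(sorted(f.result() for f in fixed))    # always [(0, 'a'), (1, 'b'), (2, 'c')]
```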

llm-service/app/ai/indexing/readers/base_reader.py

Lines changed: 30 additions & 10 deletions
@@ -1,3 +1,4 @@
+import functools
 import os
 import tempfile
 from abc import ABC, abstractmethod
@@ -13,6 +14,25 @@
 from presidio_anonymizer import AnonymizerEngine


+@functools.cache
+def _get_analyzer() -> AnalyzerEngine:
+    """Cached analyzer engine to reuse compiled regex patterns."""
+    return AnalyzerEngine()
+
+
+@functools.cache
+def _get_anonymizer() -> AnonymizerEngine:
+    """Cached anonymizer engine to reuse compiled patterns."""
+    return AnonymizerEngine()  # type: ignore[no-untyped-call]
+
+
+@functools.cache
+def _get_secret_collection() -> SecretsCollection:
+    """Cached secrets collection to reuse compiled regex patterns."""
+    with default_settings():
+        return SecretsCollection()
+
+
 @dataclass
 class ReaderConfig:
     block_secrets: bool = False
@@ -70,19 +90,19 @@ def _block_secrets(self, chunks: List[str]) -> Optional[Set[str]]:
         if not self.config.block_secrets:
             return None

+        # Create a fresh collection each time since clear() doesn't exist
+        # but still benefit from cached settings/plugins via default_settings()
         with tempfile.TemporaryDirectory() as tmpdir:
+            paths = []
             for i, chunk in enumerate(chunks):
-                with open(os.path.join(tmpdir, f"chunk_{i}.txt"), "w") as f:
+                path = os.path.join(tmpdir, f"chunk_{i}.txt")
+                with open(path, "w") as f:
                     f.write(chunk)
+                paths.append(path)

-            secrets_collection = SecretsCollection()
+            secrets_collection = _get_secret_collection()
             with default_settings():
-                secrets_collection.scan_files(
-                    *[
-                        os.path.join(tmpdir, f"chunk_{i}.txt")
-                        for i in range(len(chunks))
-                    ]
-                )
+                secrets_collection.scan_files(*paths)

             secrets_json = secrets_collection.json()

@@ -97,12 +117,12 @@ def _anonymize_pii(self, text: str) -> Optional[str]:
         if not self.config.anonymize_pii:
             return None

-        analyzer = AnalyzerEngine()
+        analyzer = _get_analyzer()

         # TODO: support other languages
         results = analyzer.analyze(text=text, entities=None, language="en")

-        anonymizer = AnonymizerEngine()  # type: ignore[no-untyped-call]
+        anonymizer = _get_anonymizer()

         anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results)  # type: ignore[arg-type]
         if anonymized_text.text == text:
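
The helpers above wrap the Presidio AnalyzerEngine and AnonymizerEngine and the detect-secrets SecretsCollection in functools.cache, so their relatively expensive setup (loading models, compiling regexes) happens once per process instead of once per chunk. A tiny standalone sketch of the pattern, with a stand-in class rather than Presidio itself:

```python
# Sketch: functools.cache memoizes a zero-argument factory, so the expensive
# object is built on the first call and the same instance is reused afterwards.
import functools
import time


class ExpensiveEngine:
    def __init__(self) -> None:
        time.sleep(0.5)  # stands in for model loading / regex compilation


@functools.cache
def get_engine() -> ExpensiveEngine:
    return ExpensiveEngine()


start = time.perf_counter()
first = get_engine()   # pays the construction cost
second = get_engine()  # returns the cached instance immediately
assert first is second
print(f"two lookups took {time.perf_counter() - start:.2f}s")  # roughly 0.5s
```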

llm-service/app/ai/indexing/readers/csv.py

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,8 @@

 class _CsvSplitter(MetadataAwareTextSplitter):
     def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
+        # metadata_str is kept as an argument to satisfy the interface, but it is not used
+        # because metadata is added to the chunks later.
         return self.split_text(text)

     def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> List[str]:
