
Commit e1c8254

baasitsharief, mliu-cloudera, and actions-user authored
feat: Add ChromaDB support (#314)
Squashed commit message:

- WIP Add ChromaDB support and update dependencies
- WIP Add ChromaDB configuration support and update UI components
- WIP Enhance ChromaDB integration by adding SSL, token, tenant, and database configurations
- fix: mypy errors
- feat: Enhance ChromaDB support by adding database configuration and updating local development script
- feat: Disable input fields and switch for ChromaDB in case of enableModification as False
- Use ChromaDB constants for default tenant and database
- fix: Removed SSL configuration and infer SSL from host URL, add chromadb to localhost script
- fix: mypy errors
- feat: Add SSL configuration for ChromaDB client with conditional settings based on host URL
- refactor: Comment out SSL certificate configuration for ChromaDB client, update instructions for future implementation
- feat: Enhance ChromaDB configuration in .env and README, add SSL cert path support in settings
- Remove https check for setting port and ssl_verify
- Log port parsing errors
- More cleanups
- Fix my check oops
- Update publish_release.yml workflow to trigger on bs/chromadb branch, refine startup_app.sh script comments for clarity
- Update release version to dev-chromadb
- Implement support for ChromaDB as an alternative local vector DB provider
- Update release version to dev-chromadb
- Add .cursor to gitignore, update startup_app.sh to use uvx to start chroma
- Update release version to dev-chromadb
- Add support for controlling anonymized telemetry in ChromaDB client
- Update release version to dev-chromadb
- Remove ChromaDB anonymized telemetry configuration option from UI
- Update release version to dev-chromadb
- bug fix remove undefined arg from chromadb_config
- Fix: Refactor ChromaVectorStore visualize
- fix: ruff and mypy errors
- Flatten metadata in EmbeddingIndexer when vector store has flat_metadata enabled
- Move flat_metadata to VectorStore and flatten metadata for summary indexer

Co-authored-by: Michael Liu <[email protected]>
Co-authored-by: actions-user <[email protected]>
1 parent c35a20d commit e1c8254

File tree: 26 files changed, +1188 −28 lines

.env.example

Lines changed: 25 additions & 1 deletion
```diff
@@ -1,7 +1,19 @@
 AWS_DEFAULT_REGION=us-west-2
 
+# H2 or PostgreSQL (RDS) (H2 is default)
+DB_TYPE=H2
+
+# H2
 DB_URL=jdbc:h2:../databases/rag
 
+# RDS
+# DB_URL= "jdbc:postgresql://<host>:<port>/<database>"
+DB_USERNAME=
+DB_PASSWORD=
+
+# Model Provider
+MODEL_PROVIDER=Bedrock
+
 # CAII
 CAII_DOMAIN=
 
@@ -10,7 +22,7 @@ AZURE_OPENAI_API_KEY=
 AZURE_OPENAI_ENDPOINT=
 OPENAI_API_VERSION=
 
-# QDRANT or OPENSEARCH
+# QDRANT or OPENSEARCH or CHROMADB
 VECTOR_DB_PROVIDER=QDRANT
 
 # OpenSearch
@@ -19,6 +31,18 @@ OPENSEARCH_USERNAME=
 OPENSEARCH_PASSWORD=
 OPENSEARCH_NAMESPACE=
 
+# ChromaDB
+CHROMADB_HOST=http://localhost
+CHROMADB_PORT=8000
+CHROMADB_TOKEN=
+# Tenant and database defaults to the Chroma default values
+CHROMADB_TENANT=
+CHROMADB_DATABASE=
+# If CHROMADB_HOST starts with "https://" and your server uses a private CA,
+# set it to the path of your PEM bundle so Python can verify TLS connections to ChromaDB:
+CHROMADB_SERVER_SSL_CERT_PATH=/absolute/path/to/ca-bundle.pem
+CHROMADB_ENABLE_ANONYMIZED_TELEMETRY=false
+
 # AWS
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
```
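The commit notes above mention inferring SSL from the host URL and logging port-parsing errors rather than failing. The snippet below is an illustrative sketch of how these `CHROMADB_*` variables could be resolved into connection settings — it is not the commit's actual code, and `chroma_connection_settings` is a hypothetical name:

```python
from urllib.parse import urlparse


def chroma_connection_settings(env: dict) -> tuple:
    """Derive (host, port, ssl) from CHROMADB_* variables.

    SSL is inferred from the scheme of CHROMADB_HOST; an unparseable
    port falls back to the Chroma default of 8000 (and would be logged).
    """
    raw_host = env.get("CHROMADB_HOST", "http://localhost")
    # Accept both bare hostnames ("localhost") and full URLs
    parsed = urlparse(raw_host if "://" in raw_host else "http://" + raw_host)
    try:
        port = int(env.get("CHROMADB_PORT") or 8000)
    except ValueError:
        port = 8000  # log the parsing error, then fall back
    return parsed.hostname or "localhost", port, parsed.scheme == "https"
```

With `CHROMADB_HOST=https://chroma.example.com`, for example, SSL comes out enabled without any separate flag, which is why the earlier explicit SSL option could be removed.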

.github/workflows/publish_release.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -18,6 +18,7 @@ on:
       - mob/main
       - release/1
       - customer-hotfix
+      - bs/chromadb
 jobs:
   build:
     runs-on: ubuntu-latest
```

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,6 +1,7 @@
 .env
 .idea/*
 .vscode/*
+.cursor/*
 !.idea/copyright/
 !.idea/prettier.xml
 !.idea/google-java-format.xml
```

README.md

Lines changed: 47 additions & 9 deletions
````diff
@@ -52,6 +52,32 @@ RAG Studio can utilize the local file system or an S3 bucket for storing documents
 
 S3 will also require providing the AWS credentials for the bucket.
 
+### Vector Database Options
+
+RAG Studio supports Qdrant (default), OpenSearch (Cloudera Semantic Search), and ChromaDB.
+
+- To choose the vector DB, set `VECTOR_DB_PROVIDER` to one of `QDRANT`, `OPENSEARCH`, or `CHROMADB` in your `.env`.
+
+#### ChromaDB Setup
+
+If you select ChromaDB, configure the following environment variables in `.env`:
+
+- `CHROMADB_HOST` - Hostname or URL for ChromaDB. Use `localhost` for local Docker.
+- `CHROMADB_PORT` - Port for ChromaDB (default `8000`). Not required if `CHROMADB_HOST` starts with `https://` and the server infers the port.
+- `CHROMADB_TENANT` - Optional. Defaults to the Chroma default tenant.
+- `CHROMADB_DATABASE` - Optional. Defaults to the Chroma default database.
+- `CHROMADB_TOKEN` - Optional. Include if your Chroma server requires an auth token.
+- `CHROMADB_SERVER_SSL_CERT_PATH` - Optional. Path to a PEM bundle for TLS verification when using HTTPS with a private CA.
+- `CHROMADB_ENABLE_ANONYMIZED_TELEMETRY` - Optional. Enables anonymized telemetry in the ChromaDB client; defaults to `false`.
+
+Notes:
+
+- When `VECTOR_DB_PROVIDER=CHROMADB` and `CHROMADB_HOST=localhost`, the local-dev script will automatically start a ChromaDB Docker container on `CHROMADB_PORT=8000`.
+- ChromaDB collections are automatically namespaced using the tenant and database values to avoid conflicts between different RAG Studio instances.
+- For production deployments, consider using a dedicated ChromaDB server with authentication enabled via `CHROMADB_TOKEN`.
+- When using HTTPS endpoints, ensure your certificate chain is properly configured or provide the CA bundle path via `CHROMADB_SERVER_SSL_CERT_PATH`.
+- Anonymized telemetry is disabled by default; enable it by setting `CHROMADB_ENABLE_ANONYMIZED_TELEMETRY=true`.
+
 ### Enhanced Parsing Options:
 
 RAG Studio can optionally enable enhanced parsing by providing the `USE_ENHANCED_PDF_PROCESSING` environment variable. Enabling this will allow RAG Studio to parse images and tables from PDFs. When enabling this feature, we strongly recommend using this with a GPU and at least 16GB of memory.
@@ -82,7 +108,7 @@ This variable can be set from the project settings for the AMP in CML.
 ## Air-gapped Environments
 
 If you are using an air-gapped environment, you will need to whitelist at the minimum the following domains in order to use the AMP.
-There may be other domains that need to be whitelisted depending on your environment and the model service provider you select.
+There may be other domains that need to be whitelisted depending on your environment and the model service provider you select.
 
 - `https://github.com`
 - `https://raw.githubusercontent.com`
@@ -150,17 +176,29 @@ the Node service locally, you can do so by following these steps:
 docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/databases/qdrant_storage:/qdrant/storage:z qdrant/qdrant
 ```
 
+#### To run ChromaDB locally
+
+```
+docker run --name chromadb_dev --rm -d -p 8000:8000 -v $(pwd)/databases/chromadb_storage:/data chromadb/chroma
+```
+
+#### Use ChromaDB with local-dev.sh
+
+- Copy `.env.example` to `.env`.
+- Set `VECTOR_DB_PROVIDER=CHROMADB` in `.env` (defaults assume `CHROMADB_HOST=localhost` and `CHROMADB_PORT=8000`).
+- Run `./local-dev.sh` from the repo root. When `CHROMADB_HOST=localhost`, the script will auto-start a ChromaDB Docker container.
+
 #### Modifying UI in CML
 
-* This is an unsupported workflow, but it is possible to modify the UI code in CML.
+- This is an unsupported workflow, but it is possible to modify the UI code in CML.
 
-- Start a CML Session from a CML Project that has the RAG Studio AMP installed.
-- Open the terminal in the CML Session and navigate to the `ui` directory.
-- Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
-- Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
-- Run `pnpm install` to install the dependencies.
-- Make your changes to the UI code in the `ui` directory.
-- Run `pnpm build` to build the new UI bundle.
+* Start a CML Session from a CML Project that has the RAG Studio AMP installed.
+* Open the terminal in the CML Session and navigate to the `ui` directory.
+* Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
+* Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
+* Run `pnpm install` to install the dependencies.
+* Make your changes to the UI code in the `ui` directory.
+* Run `pnpm build` to build the new UI bundle.
 
 ## The Fine Print
 
````
llm-service/app/ai/indexing/base.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -1,9 +1,12 @@
+import json
 import logging
 import os
 from abc import abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, Type, Optional
+from typing import Dict, Type, Optional, TypeVar
+
+from llama_index.core.schema import BaseNode
 
 from .readers.base_reader import BaseReader, ReaderConfig
 from .readers.csv import CSVReader
@@ -26,7 +29,6 @@
     ".docx": DocxReader,
     ".pptx": PptxReader,
     ".pptm": PptxReader,
-    ".ppt": PptxReader,
     ".csv": CSVReader,
     ".json": JSONReader,
     ".jpg": ImagesReader,
@@ -40,6 +42,9 @@
 }
 
 
+TNode = TypeVar("TNode", bound=BaseNode)
+
+
 @dataclass
 class NotSupportedFileExtensionError(Exception):
     file_extension: str
@@ -54,6 +59,13 @@ def __init__(
         self.data_source_id = data_source_id
         self.reader_config = reader_config
 
+    @staticmethod
+    def _flatten_metadata(chunk: TNode) -> TNode:
+        for key, value in chunk.metadata.items():
+            if isinstance(value, list) or isinstance(value, dict):
+                chunk.metadata[key] = json.dumps(value)
+        return chunk
+
     @abstractmethod
     def index_file(self, file_path: Path, doc_id: str) -> None:
         pass
```
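The `_flatten_metadata` helper added above JSON-encodes any list or dict metadata values so that vector stores which only accept scalar ("flat") metadata can ingest them. A standalone sketch of the same transformation on a plain dict, for illustration only:

```python
import json


def flatten_metadata(metadata: dict) -> dict:
    """JSON-encode list/dict values; leave scalar values untouched."""
    return {
        key: json.dumps(value) if isinstance(value, (list, dict)) else value
        for key, value in metadata.items()
    }
```

Scalars pass through unchanged, and the encoded values can be recovered later with `json.loads` when the node is read back.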

llm-service/app/ai/indexing/embedding_indexer.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -108,6 +108,12 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         # we're capturing "text".
         converted_chunks: List[BaseNode] = [chunk for chunk in chunk_batch]
 
+        # flatten metadata if the vector store has flat_metadata enabled
+        if self.chunks_vector_store.flat_metadata:
+            converted_chunks = [
+                self._flatten_metadata(chunk) for chunk in converted_chunks
+            ]
+
         chunks_vector_store = self.chunks_vector_store.llama_vector_store()
         chunks_vector_store.add(converted_chunks)
```

llm-service/app/ai/indexing/summary_indexer.py

Lines changed: 11 additions & 5 deletions
```diff
@@ -70,6 +70,7 @@
 from qdrant_client.http.exceptions import UnexpectedResponse
 
 from app.services import models
+from app.ai.vector_stores.vector_store import VectorStore
 from .base import BaseTextIndexer
 from .readers.base_reader import ReaderConfig, ChunksResult
 from ..vector_stores.vector_store_factory import VectorStoreFactory
@@ -101,6 +102,7 @@ def __init__(
         self.splitter = splitter
         self.llm = llm
         self.embedding_model = embedding_model
+        self.summary_vector_store = VectorStoreFactory.for_summaries(data_source_id)
 
     @staticmethod
     def __database_dir(data_source_id: int) -> str:
@@ -177,19 +179,20 @@ def __summary_indexer(
             return SummaryIndexer.__summary_indexer_with_config(
                 persist_dir=persist_dir,
                 index_configuration=self.__index_kwargs(embed_summaries),
+                summary_vector_store=self.summary_vector_store,
             )
         except (ValueError, FileNotFoundError):
             doc_summary_index = self.__init_summary_store(persist_dir)
         return doc_summary_index
 
     @staticmethod
     def __summary_indexer_with_config(
-        persist_dir: str, index_configuration: Dict[str, Any]
+        persist_dir: str, index_configuration: Dict[str, Any],
+        summary_vector_store: VectorStore,
     ) -> DocumentSummaryIndex:
-        data_source_id: int = index_configuration.get("data_source_id")
         storage_context = SummaryIndexer.create_storage_context(
             persist_dir,
-            VectorStoreFactory.for_summaries(data_source_id).llama_vector_store(),
+            summary_vector_store.llama_vector_store(),
         )
         doc_summary_index: DocumentSummaryIndex = cast(
             DocumentSummaryIndex,
@@ -293,6 +296,8 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         with _write_lock:
             persist_dir = self.__persist_dir()
             summary_store: DocumentSummaryIndex = self.__summary_indexer(persist_dir)
+            if self.summary_vector_store.flat_metadata:
+                nodes = [self._flatten_metadata(node) for node in nodes]
             summary_store.insert_nodes(nodes)
             summary_store.storage_context.persist(persist_dir=persist_dir)
@@ -311,7 +316,7 @@
         # and re-index it with the addition/removal.
         global_persist_dir = self.__persist_root_dir()
         global_summary_store = self.__summary_indexer(
-            global_persist_dir, embed_summaries=False
+            global_persist_dir, embed_summaries=False,
         )
         data_source_node = Document(doc_id=str(self.data_source_id))
@@ -493,7 +498,8 @@ def delete_data_source_by_id(data_source_id: int) -> None:
                 embed_summaries=False,
             )
             global_summary_store = SummaryIndexer.__summary_indexer_with_config(
-                global_persist_dir, configuration
+                global_persist_dir, configuration,
+                summary_vector_store=vector_store,
             )
         except FileNotFoundError:
             ## global summary store doesn't exist, nothing to do
```
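The `summary_indexer.py` changes above move vector-store creation out of the static helper (which previously re-derived it from the `index_configuration` dict via `VectorStoreFactory`) into `__init__`, so the store is resolved once and passed down explicitly. A minimal sketch of that dependency-passing pattern, using invented stand-in classes rather than the commit's real ones:

```python
class StubVectorStore:
    """Stand-in for the real VectorStore; not the commit's code."""

    def __init__(self, data_source_id: int) -> None:
        self.data_source_id = data_source_id
        self.flat_metadata = True  # e.g. a store that only accepts flat metadata


class StubSummaryIndexer:
    def __init__(self, data_source_id: int) -> None:
        # resolved once here, instead of inside every helper call
        self.summary_vector_store = StubVectorStore(data_source_id)

    def build(self) -> int:
        # thread the already-constructed store through to the helper
        return self._with_config(self.summary_vector_store)

    @staticmethod
    def _with_config(summary_vector_store: StubVectorStore) -> int:
        # the helper now receives the store directly rather than a raw id
        return summary_vector_store.data_source_id
```

Passing the store as an argument also removes the fragile `index_configuration.get("data_source_id")` lookup that the old static method relied on.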
