Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ publish: build

test:
@uv run pytest -v tests

sync:
@uv sync --all-extras
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,14 @@ timeline
: Filter
Rerank: ColBERT
```

## Development

```bash
docker run --rm -d -e POSTGRES_PASSWORD=postgres -p 5432:5432 tensorchord/vchord-postgres:pg17-v0.2.0
envd up
# inside the envd env, sync all the dependencies
make sync
# format the code
make format
```
8 changes: 4 additions & 4 deletions test.py → examples/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
LocalLoader,
Pipeline,
SimpleExtractor,
SpacyEmbedding,
SpacySegmenter,
SpacyChunker,
SpacyDenseEmbedding,
VectorChordClient,
)

Expand All @@ -16,8 +16,8 @@
),
loader=LocalLoader("data", include=[".pdf"]),
extractor=SimpleExtractor(),
segmenter=SpacySegmenter(),
emb=SpacyEmbedding(),
chunker=SpacyChunker(),
emb=SpacyDenseEmbedding(),
)
pipe.run()

Expand Down
26 changes: 26 additions & 0 deletions examples/gemini.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from rich import print

from vechord import (
    GeminiAugmenter,
    GeminiDenseEmbedding,
    GeminiExtractor,
    LocalLoader,
    Pipeline,
    VectorChordClient,
    WordLlamaChunker,
)

if __name__ == "__main__":
    # End-to-end example: index local PDFs with Gemini-based extraction,
    # embedding and augmentation, then run a query.
    # Requires GEMINI_API_KEY in the environment and a reachable Postgres
    # with the VectorChord extension (172.17.0.1 is the default Docker
    # bridge gateway — adjust for your setup).
    pipe = Pipeline(
        client=VectorChordClient(
            "local_pdf", "postgresql://postgres:postgres@172.17.0.1:5432/"
        ),
        loader=LocalLoader("data", include=[".pdf"]),  # only pick up PDFs
        extractor=GeminiExtractor(),
        chunker=WordLlamaChunker(),
        emb=GeminiDenseEmbedding(),
        augmenter=GeminiAugmenter(),
    )
    pipe.run()

    print(pipe.query("vector search"))
17 changes: 12 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,15 @@ description = "VectorChord Python SDK"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"en-core-web-sm",
"falcon>=4.0.2",
"httpx>=0.28.1",
"msgspec>=0.19.0",
"numpy>=2.0.2",
"openai>=1.59.7",
"pgvector>=0.3.6",
"pillow>=11.1.0",
"psycopg[binary]>=3.2.3",
"pypdfium2>=4.30.1",
"rich>=13.9.4",
"spacy>=3.8.4",
"trio>=0.28.0",
]

[project.scripts]
Expand All @@ -26,6 +23,16 @@ vechord = "vechord.main:main"
gemini = [
"google-generativeai>=0.8.4",
]
openai = [
"openai>=1.60.2",
]
spacy = [
"en-core-web-sm",
"spacy>=3.8.4",
]
wordllama = [
"wordllama>=0.3.8.post20",
]

[build-system]
requires = ["pdm-backend"]
Expand All @@ -45,7 +52,7 @@ ignore = ["E501"]
[tool.ruff.lint.isort]
known-first-party = ["vechord"]
[tool.ruff.lint.pylint]
max-args = 7
max-args = 5

[tool.pdm]
distribution = true
Expand Down
539 changes: 365 additions & 174 deletions uv.lock

Large diffs are not rendered by default.

21 changes: 14 additions & 7 deletions vechord/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
from vechord.augment import GeminiAugmenter
from vechord.chunk import RegexChunker, SpacyChunker, WordLlamaChunker
from vechord.client import VectorChordClient
from vechord.embedding import GeminiEmbedding, OpenAIEmbedding, SpacyEmbedding
from vechord.embedding import (
GeminiDenseEmbedding,
OpenAIDenseEmbedding,
SpacyDenseEmbedding,
)
from vechord.extract import GeminiExtractor, SimpleExtractor
from vechord.load import LocalLoader
from vechord.model import Chunk, Document
from vechord.pipeline import Pipeline
from vechord.segment import RegexSegmenter, SpacySegmenter

__all__ = [
"Chunk",
"Document",
"GeminiEmbedding",
"GeminiAugmenter",
"GeminiDenseEmbedding",
"GeminiExtractor",
"LocalLoader",
"OpenAIEmbedding",
"OpenAIDenseEmbedding",
"Pipeline",
"RegexSegmenter",
"RegexChunker",
"SimpleExtractor",
"SpacyEmbedding",
"SpacySegmenter",
"SpacyChunker",
"SpacyDenseEmbedding",
"VectorChordClient",
"WordLlamaChunker",
]
115 changes: 115 additions & 0 deletions vechord/augment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
from abc import ABC, abstractmethod
from datetime import timedelta

from vechord.log import logger


class BaseAugmenter(ABC):
    """Interface for chunk/query augmentation backed by a full document."""

    @abstractmethod
    def reset(self, doc: str):
        """Cache the document for augmentation."""
        raise NotImplementedError

    @abstractmethod
    def name(self) -> str:
        """Stable identifier for this augmenter configuration."""
        raise NotImplementedError

    @abstractmethod
    def augment_context(self, chunks: list[str]) -> list[str]:
        """Produce situating context for each chunk within the cached doc."""
        raise NotImplementedError

    @abstractmethod
    def augment_query(self, chunks: list[str]) -> list[str]:
        """Produce retrieval questions for each chunk within the cached doc."""
        raise NotImplementedError

    @abstractmethod
    def summarize_doc(self) -> str:
        """Summarize the cached document."""
        raise NotImplementedError


class GeminiAugmenter(BaseAugmenter):
    def __init__(self, model: str = "models/gemini-1.5-flash-001", ttl_sec: int = 600):
        """Gemini Augmenter with content cache.

        Minimal cache token is 32768.

        Args:
            model: Gemini model name used for generation.
            ttl_sec: time-to-live (seconds) for the Gemini content cache.

        Raises:
            ValueError: if the ``GEMINI_API_KEY`` environment variable is unset.
        """
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        # fix: the key was previously validated but never used — keep it so
        # `reset()` can configure the SDK explicitly instead of hoping the
        # library picks it up from the environment on its own
        self.api_key = key
        self.model_name = model
        self.ttl_sec = ttl_sec
        self.min_token = 32768  # Gemini's minimum cacheable content size

    def name(self) -> str:
        return f"gemini_augment_{self.model_name}"

    def reset(self, doc: str):
        """Cache the document for the subsequent augment/summarize calls.

        Documents at or under the cache token minimum are kept in memory and
        prepended to every prompt; larger ones go into the Gemini content
        cache and the client is rebuilt on top of that cached content.
        """
        import google.generativeai as genai

        genai.configure(api_key=self.api_key)
        self.client = genai.GenerativeModel(model_name=self.model_name)
        tokens = self.client.count_tokens(doc).total_tokens
        self.doc = ""  # empty means doc is in the cache
        if tokens <= self.min_token:
            # cannot use cache due to the Gemini token limit
            self.doc = doc
        else:
            logger.debug("use cache since the doc has %d tokens", tokens)
            cache = genai.caching.CachedContent.create(
                model=self.model_name,
                system_instruction=(
                    "You are an expert on the natural language understanding. "
                    "Answer the questions based on the whole document you have access to."
                ),
                contents=doc,
                ttl=timedelta(seconds=self.ttl_sec),
            )
            self.client = genai.GenerativeModel.from_cached_content(
                cached_content=cache
            )

    def augment(self, chunks: list[str], prompt: str) -> list[str]:
        """Run `prompt` (with a ``{chunk}`` placeholder) over each chunk.

        Best-effort: on the first API error, log it and return whatever
        results were collected so far rather than raising.
        """
        res = []
        try:
            for chunk in chunks:
                context = prompt.format(chunk=chunk)
                if self.doc:
                    context = f"<document>{self.doc}</document>\n" + context
                response = self.client.generate_content([context])
                res.append(response.text)
        except Exception as e:
            # fix: a leftover `breakpoint()` here would suspend the process
            # in production on any API error; log and return partial results
            logger.error("GeminiAugmenter error: %s", e)
        return res

    def augment_context(self, chunks: list[str]) -> list[str]:
        """Generate a short situating context for each chunk."""
        prompt = (
            "Here is the chunk we want to situate within the whole document "
            "<chunk>{chunk}</chunk>"
            "Please give a short succinct context to situate this chunk within "
            "the overall document for the purposes of improving search retrieval "
            "of the chunk. Answer only with the succinct context and nothing else."
        )
        return self.augment(chunks, prompt)

    def augment_query(self, chunks: list[str]) -> list[str]:
        """Generate a retrieval-oriented question for each chunk."""
        prompt = (
            "Here is the chunk we want to ask questions about "
            "<chunk>{chunk}</chunk>"
            "Please ask questions about this chunk based on the overall document "
            "for the purposes of improving search retrieval of the chunk. "
            "Answer only with the question and nothing else."
        )
        return self.augment(chunks, prompt)

    def summarize_doc(self) -> str:
        """Summarize the document cached by the last `reset()` call."""
        prompt = (
            "Summarize the provided document concisely while preserving its key "
            "ideas, main arguments, and essential details. Ensure clarity and "
            "coherence, avoiding unnecessary repetition."
        )
        if self.doc:
            prompt = f"<document>{self.doc}</document>\n" + prompt
        response = self.client.generate_content([prompt])
        return response.text
39 changes: 33 additions & 6 deletions vechord/segment.py → vechord/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,20 @@
from abc import ABC, abstractmethod


class BaseSegmenter(ABC):
class BaseChunker(ABC):
@abstractmethod
def segment(self, text: str) -> list[str]:
raise NotImplementedError

@abstractmethod
def name(self) -> str:
raise NotImplementedError


class RegexSegmenter(BaseSegmenter):
class RegexChunker(BaseChunker):
def __init__(
self,
size: int = 1000,
size: int = 1536,
overlap: int = 200,
separator: str = r"\s{2,}",
concat: str = ". ",
Expand All @@ -21,6 +25,9 @@ def __init__(
self.separator = re.compile(separator)
self.concatenator = concat

def name(self) -> str:
return f"regex_chunk_{self.size}_{self.overlap}"

def keep_overlap(self, pieces: list[str]) -> list[str]:
length = 0
i = len(pieces) - 1
Expand Down Expand Up @@ -69,11 +76,31 @@ def segment(self, text: str) -> list[str]:
return [*chunks, remaining] if remaining else chunks


class SpacySegmenter(BaseSegmenter):
def __init__(self):
class SpacyChunker(BaseChunker):
def __init__(self, model: str = "en_core_web_sm"):
"""A semantic sentence Chunker based on SpaCy."""
import spacy

self.nlp = spacy.load("en_core_web_sm", enable=["parser", "tok2vec"])
self.model = model
self.nlp = spacy.load(model, enable=["parser", "tok2vec"])

def name(self) -> str:
return f"spacy_chunk_{self.model}"

def segment(self, text: str) -> list[str]:
return [sent.text for sent in self.nlp(text).sents]


class WordLlamaChunker(BaseChunker):
    def __init__(self, size: int = 1536):
        """A semantic chunker based on WordLlama.

        Args:
            size: target chunk size passed to WordLlama's ``split`` as
                ``target_size`` (presumably characters — TODO confirm
                against the wordllama docs).
        """
        # lazy import so the optional `wordllama` extra is only required
        # when this chunker is actually instantiated
        from wordllama import WordLlama

        self.model = WordLlama.load()
        self.size = size

    def name(self) -> str:
        # identifier embeds the target size so different configurations
        # are distinguishable (e.g. as a storage namespace)
        return f"wordllama_chunk_{self.size}"

    def segment(self, text: str) -> list[str]:
        """Split `text` into semantic chunks of roughly `size` each."""
        return self.model.split(text, target_size=self.size)
Loading