Skip to content

Commit e024e6e

Browse files
authored
refact: use pipeline (#2)
* refact: use pipeline Signed-off-by: Keming <kemingyang@tensorchord.ai> * check if file exists Signed-off-by: Keming <kemingyang@tensorchord.ai> * rename file to document Signed-off-by: Keming <kemingyang@tensorchord.ai> * address comments Signed-off-by: Keming <kemingyang@tensorchord.ai> * address comments Signed-off-by: Keming <kemingyang@tensorchord.ai> --------- Signed-off-by: Keming <kemingyang@tensorchord.ai>
1 parent 2bb5232 commit e024e6e

File tree

16 files changed

+888
-163
lines changed

16 files changed

+888
-163
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ cython_debug/
173173

174174
# test data
175175
data/
176+
*.txt
177+
*.jpeg
176178

177179
# generated version file
178180
vechord/__version__.py
@@ -183,3 +185,6 @@ vechord/__version__.py
183185
# editor
184186
.vscode/
185187
.idea/
188+
189+
# ini file
190+
*.ini

README.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,29 @@
11
# vechord
22

3-
[VectorChord](https://github.com/tensorchord/VectorChord/) Python SDK.
3+
Python RAG framework built on top of PostgreSQL and [VectorChord](https://github.com/tensorchord/VectorChord/).
4+
5+
## Diagram
6+
7+
```mermaid
8+
timeline
9+
title RAG
10+
section Ingestion
11+
Source: Local
12+
: Google Drive
13+
: Dropbox
14+
: Notion
15+
File: Document
16+
: Image
17+
: Audio
18+
Chunk: Text
19+
: Entities
20+
: Embedding
21+
section Query
22+
Analysis: Expansion
23+
: Keyword
24+
: Embedding
25+
Search: Vector Search
26+
: Full Text Search
27+
: Filter
28+
Rerank: ColBERT
29+
```

build.envd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ def build():
66
install.conda(use_mamba=True)
77
install.python()
88
install.python_packages(name=["uv"])
9+
shell("fish")

pyproject.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,22 @@ dependencies = [
1111
"numpy>=2.0.2",
1212
"openai>=1.59.7",
1313
"pgvector>=0.3.6",
14+
"pillow>=11.1.0",
1415
"psycopg[binary]>=3.2.3",
15-
"pypdf>=5.1.0",
16+
"pypdfium2>=4.30.1",
1617
"rich>=13.9.4",
1718
"spacy>=3.8.4",
19+
"trio>=0.28.0",
1820
]
1921

2022
[project.scripts]
2123
vechord = "vechord.main:main"
2224

25+
[project.optional-dependencies]
26+
gemini = [
27+
"google-generativeai>=0.8.4",
28+
]
29+
2330
[build-system]
2431
requires = ["pdm-backend"]
2532
build-backend = "pdm.backend"

test.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
1-
from vechord import DataLoader, TextFile, VectorChordClient
1+
from rich import print
2+
3+
from vechord import (
4+
LocalLoader,
5+
Pipeline,
6+
SimpleExtractor,
7+
SpacyEmbedding,
8+
SpacySegmenter,
9+
VectorChordClient,
10+
)
211

312
if __name__ == "__main__":
4-
namespace = "local_pdf"
5-
client = VectorChordClient("postgresql://postgres:postgres@172.17.0.1:5432/")
6-
client.create_namespace(namespace)
7-
for file in DataLoader().local_files("data"):
8-
text_file = TextFile.from_filepath(file)
9-
client.insert_text(namespace, text_file)
13+
pipe = Pipeline(
14+
client=VectorChordClient(
15+
"local_pdf", "postgresql://postgres:postgres@172.17.0.1:5432/"
16+
),
17+
loader=LocalLoader("data", include=[".pdf"]),
18+
extractor=SimpleExtractor(),
19+
segmenter=SpacySegmenter(),
20+
emb=SpacyEmbedding(),
21+
)
22+
pipe.run()
1023

11-
res = client.query(namespace=namespace, query="vector search", topk=5)
12-
print(res)
24+
print(pipe.query("vector search"))

uv.lock

Lines changed: 466 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vechord/__init__.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
11
from vechord.client import VectorChordClient
2-
from vechord.loader import DataLoader
3-
from vechord.model import Sentence, TextFile
2+
from vechord.embedding import GeminiEmbedding, OpenAIEmbedding, SpacyEmbedding
3+
from vechord.extract import GeminiExtractor, SimpleExtractor
4+
from vechord.load import LocalLoader
5+
from vechord.model import Chunk, Document
6+
from vechord.pipeline import Pipeline
7+
from vechord.segment import RegexSegmenter, SpacySegmenter
48

59
__all__ = [
6-
"DataLoader",
7-
"Sentence",
8-
"TextFile",
10+
"Chunk",
11+
"Document",
12+
"GeminiEmbedding",
13+
"GeminiExtractor",
14+
"LocalLoader",
15+
"OpenAIEmbedding",
16+
"Pipeline",
17+
"RegexSegmenter",
18+
"SimpleExtractor",
19+
"SpacyEmbedding",
20+
"SpacySegmenter",
921
"VectorChordClient",
1022
]

vechord/client.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@
44
from pgvector.psycopg import register_vector
55

66
from vechord.log import logger
7-
from vechord.model import TextFile
8-
from vechord.text import EN_TEXT_PROCESSOR
7+
from vechord.model import Chunk, Document
98

109

1110
class VectorChordClient:
12-
def __init__(self, url: str, autocommit: bool = True):
11+
def __init__(self, namespace: str, url: str, autocommit: bool = True):
12+
self.ns = namespace
1313
self.url = url
1414
self.conn = psycopg.connect(url, autocommit=autocommit)
1515
self.conn.execute("CREATE EXTENSION IF NOT EXISTS vchord CASCADE")
1616
register_vector(self.conn)
1717

18-
def create_namespace(self, namespace: str, dim: int = 96):
18+
def create(self, dim):
1919
config = """
2020
residual_quantization = true
2121
[build.internal]
@@ -24,17 +24,17 @@ def create_namespace(self, namespace: str, dim: int = 96):
2424
"""
2525
try:
2626
self.conn.execute(
27-
f"CREATE TABLE IF NOT EXISTS {namespace}_meta "
27+
f"CREATE TABLE IF NOT EXISTS {self.ns}_meta "
2828
"(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
29-
"name TEXT, digest TEXT)"
29+
"name TEXT, digest TEXT NOT NULL UNIQUE, updated_at TIMESTAMP)"
3030
)
3131
self.conn.execute(
32-
f"CREATE TABLE IF NOT EXISTS {namespace} "
32+
f"CREATE TABLE IF NOT EXISTS {self.ns} "
3333
"(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
3434
f"doc_id INT, content TEXT, embedding vector({dim}))"
3535
)
3636
self.conn.execute(
37-
f"CREATE INDEX IF NOT EXISTS {namespace}_vector_idx ON {namespace} "
37+
f"CREATE INDEX IF NOT EXISTS {self.ns}_vector_idx ON {self.ns} "
3838
"USING vchordrq (embedding vector_l2_ops) WITH "
3939
f"(options = $${config}$$)"
4040
)
@@ -44,35 +44,36 @@ def create_namespace(self, namespace: str, dim: int = 96):
4444
self.conn.rollback()
4545
raise err
4646

47-
def insert_text(self, namespace: str, textfile: TextFile):
47+
def is_file_exists(self, doc: Document) -> bool:
48+
cursor = self.conn.execute(
49+
f"SELECT id FROM {self.ns}_meta WHERE digest = %s", (doc.digest,)
50+
)
51+
return cursor.fetchone() is not None
52+
53+
def insert_text(self, doc: Document, chunks: list[Chunk]):
4854
try:
4955
cursor = self.conn.execute(
50-
f"INSERT INTO {namespace}_meta (name, digest) VALUES (%s, %s) RETURNING id",
51-
(textfile.filename, textfile.digest),
56+
f"INSERT INTO {self.ns}_meta (name, digest, updated_at) VALUES (%s, %s, %s) RETURNING id",
57+
(doc.path, doc.digest, doc.updated_at),
5258
)
5359
doc_id = cursor.fetchone()[0]
54-
for sentence in textfile.sentences:
60+
for chunk in chunks:
5561
self.conn.execute(
56-
f"INSERT INTO {namespace} (doc_id, content, embedding) VALUES (%s, %s, %s)",
57-
(doc_id, sentence.text, sentence.vector),
62+
f"INSERT INTO {self.ns} (doc_id, content, embedding) VALUES (%s, %s, %s)",
63+
(doc_id, chunk.text, chunk.vector),
5864
)
59-
logger.debug(
60-
"inserted %s sentences from file %s",
61-
len(textfile.sentences),
62-
textfile.filename,
63-
)
65+
logger.debug("inserted %s sentences from file %s", len(chunks), doc.path)
6466
except psycopg.errors.DatabaseError as err:
6567
logger.error(err)
6668
logger.info("rollback from the previous error")
6769
self.conn.rollback()
6870
raise err
6971

70-
def query(self, namespace: str, query: str, topk: int = 10):
72+
def query(self, query: Chunk, topk: int = 10) -> list[str]:
7173
start = perf_counter()
72-
query = EN_TEXT_PROCESSOR.process(query)
7374
try:
7475
cursor = self.conn.execute(
75-
f"SELECT content FROM {namespace} ORDER BY embedding <-> %s LIMIT %s",
76+
f"SELECT content FROM {self.ns} ORDER BY embedding <-> %s LIMIT %s",
7677
(query.vector, topk),
7778
)
7879
res = cursor.fetchall()

vechord/embedding.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
"""Embedding backends that turn text into fixed-size vectors."""

import os
from abc import ABC, abstractmethod

import numpy as np


class BaseEmbedding(ABC):
    """Interface for text-embedding providers."""

    @abstractmethod
    def vectorize(self, text: str) -> np.ndarray:
        """Return the embedding vector for *text*."""
        raise NotImplementedError

    @abstractmethod
    def get_dim(self) -> int:
        """Return the dimensionality of the vectors produced by this backend."""
        raise NotImplementedError


class SpacyEmbedding(BaseEmbedding):
    """Local embedding backed by a spaCy model's ``tok2vec`` vectors."""

    def __init__(self, model: str = "en_core_web_sm", dim: int = 96):
        # lazy import: spaCy is only required when this backend is selected
        import spacy

        self.nlp = spacy.load(model, enable=["tok2vec"])
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        doc = self.nlp(text)
        return doc.vector


class GeminiEmbedding(BaseEmbedding):
    """Embedding via the Google Gemini API.

    Requires the env var ``GEMINI_API_KEY``.

    Raises:
        ValueError: if ``GEMINI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "models/text-embedding-004", dim: int = 768):
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        import google.generativeai as genai

        # configure the SDK with the key we just validated; without this the
        # SDK falls back to GOOGLE_API_KEY and the check above is meaningless
        genai.configure(api_key=key)
        self.client = genai.embed_content
        self.model = model
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        res = self.client(
            content=text, model=self.model, output_dimensionality=self.dim
        )
        return np.array(res["embedding"])


class OpenAIEmbedding(BaseEmbedding):
    """Embedding via the OpenAI API.

    Requires the env var ``OPENAI_API_KEY``.

    Raises:
        ValueError: if ``OPENAI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "text-embedding-3-large", dim: int = 3072):
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("env OPENAI_API_KEY not set")

        from openai import OpenAI

        # pass the key explicitly so the validation above is authoritative
        self.client = OpenAI(api_key=key)
        self.model = model
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        return np.array(
            self.client.embeddings.create(
                model=self.model, input=text, dimensions=self.dim
            )
            .data[0]
            .embedding
        )

vechord/extract.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
"""Text extractors that turn a raw ``Document`` into plain text."""

import base64
import os
import unicodedata
from abc import ABC, abstractmethod
from io import BytesIO

import pypdfium2 as pdfium

from vechord.log import logger
from vechord.model import Document


class BaseExtractor(ABC):
    """Interface for document-to-text extraction."""

    @abstractmethod
    def extract_pdf(self, doc: Document) -> str:
        """Return the text content of a PDF document."""
        raise NotImplementedError

    def extract(self, doc: Document) -> str:
        """Dispatch on the file extension and return NFKC-normalized text.

        Unsupported extensions are logged as a warning and yield an empty
        string, so the caller can keep going rather than crash.
        """
        if doc.ext == ".txt":
            text = doc.data.decode("utf-8")
        elif doc.ext == ".pdf":
            text = self.extract_pdf(doc)
        else:
            logger.warning("unsupported file type '%s' for %s", doc.ext, doc.path)
            text = ""
        # normalize so visually-identical unicode characters compare equal
        return unicodedata.normalize("NFKC", text)


class SimpleExtractor(BaseExtractor):
    """Extract the embedded text layer directly from the PDF with pypdfium2."""

    def extract_pdf(self, doc: Document) -> str:
        pdf = pdfium.PdfDocument(doc.data)
        pages = [page.get_textpage().get_text_bounded() for page in pdf]
        return "\n".join(pages)


class GeminiExtractor(BaseExtractor):
    """OCR-style extraction: render each PDF page to an image and ask Gemini.

    Requires the env var ``GEMINI_API_KEY``.

    Raises:
        ValueError: if ``GEMINI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "gemini-2.0-flash-exp"):
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        import google.generativeai as genai

        # configure the SDK with the key we just validated; without this the
        # SDK falls back to GOOGLE_API_KEY and the check above is meaningless
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(model)
        self.prompt = (
            "Extract all the text from the following document and return it exactly as "
            "it appears, without any modifications, summarization, or interpretation"
        )

    def extract_pdf(self, doc: Document) -> str:
        pdf = pdfium.PdfDocument(doc.data)
        text = []
        for page in pdf:
            img = page.render(scale=2).to_pil()  # upscale to make the text clearer
            img_bytes = BytesIO()
            img.save(img_bytes, format="JPEG")
            response = self.model.generate_content(
                [
                    {
                        "mime_type": "image/jpeg",
                        "data": base64.b64encode(img_bytes.getvalue()).decode("utf-8"),
                    },
                    self.prompt,
                ]
            )
            text.append(response.text)

        return "\n".join(text)

0 commit comments

Comments
 (0)