Skip to content

Commit e024e6e

Browse files
authored
refact: use pipeline (#2)
* refact: use pipeline Signed-off-by: Keming <kemingyang@tensorchord.ai> * check if file exists Signed-off-by: Keming <kemingyang@tensorchord.ai> * rename file to document Signed-off-by: Keming <kemingyang@tensorchord.ai> * address comments Signed-off-by: Keming <kemingyang@tensorchord.ai> * address comments Signed-off-by: Keming <kemingyang@tensorchord.ai> --------- Signed-off-by: Keming <kemingyang@tensorchord.ai>
1 parent 2bb5232 commit e024e6e

File tree

16 files changed

+888
-163
lines changed

16 files changed

+888
-163
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ cython_debug/
173173

174174
# test data
175175
data/
176+
*.txt
177+
*.jpeg
176178

177179
# generated version file
178180
vechord/__version__.py
@@ -183,3 +185,6 @@ vechord/__version__.py
183185
# editor
184186
.vscode/
185187
.idea/
188+
189+
# ini file
190+
*.ini

README.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,29 @@
11
# vechord
22

3-
[VectorChord](https://github.com/tensorchord/VectorChord/) Python SDK.
3+
Python RAG framework built on top of PostgreSQL and [VectorChord](https://github.com/tensorchord/VectorChord/).
4+
5+
## Diagram
6+
7+
```mermaid
8+
timeline
9+
title RAG
10+
section Ingestion
11+
Source: Local
12+
: Google Drive
13+
: Dropbox
14+
: Notion
15+
File: Document
16+
: Image
17+
: Audio
18+
Chunk: Text
19+
: Entities
20+
: Embedding
21+
section Query
22+
Analysis: Expansion
23+
: Keyword
24+
: Embedding
25+
Search: Vector Search
26+
: Full Text Search
27+
: Filter
28+
Rerank: ColBERT
29+
```

build.envd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ def build():
66
install.conda(use_mamba=True)
77
install.python()
88
install.python_packages(name=["uv"])
9+
shell("fish")

pyproject.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,22 @@ dependencies = [
1111
"numpy>=2.0.2",
1212
"openai>=1.59.7",
1313
"pgvector>=0.3.6",
14+
"pillow>=11.1.0",
1415
"psycopg[binary]>=3.2.3",
15-
"pypdf>=5.1.0",
16+
"pypdfium2>=4.30.1",
1617
"rich>=13.9.4",
1718
"spacy>=3.8.4",
19+
"trio>=0.28.0",
1820
]
1921

2022
[project.scripts]
2123
vechord = "vechord.main:main"
2224

25+
[project.optional-dependencies]
26+
gemini = [
27+
"google-generativeai>=0.8.4",
28+
]
29+
2330
[build-system]
2431
requires = ["pdm-backend"]
2532
build-backend = "pdm.backend"

test.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
1-
from vechord import DataLoader, TextFile, VectorChordClient
1+
from rich import print
2+
3+
from vechord import (
4+
LocalLoader,
5+
Pipeline,
6+
SimpleExtractor,
7+
SpacyEmbedding,
8+
SpacySegmenter,
9+
VectorChordClient,
10+
)
211

312
if __name__ == "__main__":
4-
namespace = "local_pdf"
5-
client = VectorChordClient("postgresql://postgres:postgres@172.17.0.1:5432/")
6-
client.create_namespace(namespace)
7-
for file in DataLoader().local_files("data"):
8-
text_file = TextFile.from_filepath(file)
9-
client.insert_text(namespace, text_file)
13+
pipe = Pipeline(
14+
client=VectorChordClient(
15+
"local_pdf", "postgresql://postgres:postgres@172.17.0.1:5432/"
16+
),
17+
loader=LocalLoader("data", include=[".pdf"]),
18+
extractor=SimpleExtractor(),
19+
segmenter=SpacySegmenter(),
20+
emb=SpacyEmbedding(),
21+
)
22+
pipe.run()
1023

11-
res = client.query(namespace=namespace, query="vector search", topk=5)
12-
print(res)
24+
print(pipe.query("vector search"))

uv.lock

Lines changed: 466 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vechord/__init__.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
11
from vechord.client import VectorChordClient
2-
from vechord.loader import DataLoader
3-
from vechord.model import Sentence, TextFile
2+
from vechord.embedding import GeminiEmbedding, OpenAIEmbedding, SpacyEmbedding
3+
from vechord.extract import GeminiExtractor, SimpleExtractor
4+
from vechord.load import LocalLoader
5+
from vechord.model import Chunk, Document
6+
from vechord.pipeline import Pipeline
7+
from vechord.segment import RegexSegmenter, SpacySegmenter
48

59
__all__ = [
6-
"DataLoader",
7-
"Sentence",
8-
"TextFile",
10+
"Chunk",
11+
"Document",
12+
"GeminiEmbedding",
13+
"GeminiExtractor",
14+
"LocalLoader",
15+
"OpenAIEmbedding",
16+
"Pipeline",
17+
"RegexSegmenter",
18+
"SimpleExtractor",
19+
"SpacyEmbedding",
20+
"SpacySegmenter",
921
"VectorChordClient",
1022
]

vechord/client.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@
44
from pgvector.psycopg import register_vector
55

66
from vechord.log import logger
7-
from vechord.model import TextFile
8-
from vechord.text import EN_TEXT_PROCESSOR
7+
from vechord.model import Chunk, Document
98

109

1110
class VectorChordClient:
12-
def __init__(self, url: str, autocommit: bool = True):
11+
def __init__(self, namespace: str, url: str, autocommit: bool = True):
12+
self.ns = namespace
1313
self.url = url
1414
self.conn = psycopg.connect(url, autocommit=autocommit)
1515
self.conn.execute("CREATE EXTENSION IF NOT EXISTS vchord CASCADE")
1616
register_vector(self.conn)
1717

18-
def create_namespace(self, namespace: str, dim: int = 96):
18+
def create(self, dim):
1919
config = """
2020
residual_quantization = true
2121
[build.internal]
@@ -24,17 +24,17 @@ def create_namespace(self, namespace: str, dim: int = 96):
2424
"""
2525
try:
2626
self.conn.execute(
27-
f"CREATE TABLE IF NOT EXISTS {namespace}_meta "
27+
f"CREATE TABLE IF NOT EXISTS {self.ns}_meta "
2828
"(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
29-
"name TEXT, digest TEXT)"
29+
"name TEXT, digest TEXT NOT NULL UNIQUE, updated_at TIMESTAMP)"
3030
)
3131
self.conn.execute(
32-
f"CREATE TABLE IF NOT EXISTS {namespace} "
32+
f"CREATE TABLE IF NOT EXISTS {self.ns} "
3333
"(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
3434
f"doc_id INT, content TEXT, embedding vector({dim}))"
3535
)
3636
self.conn.execute(
37-
f"CREATE INDEX IF NOT EXISTS {namespace}_vector_idx ON {namespace} "
37+
f"CREATE INDEX IF NOT EXISTS {self.ns}_vector_idx ON {self.ns} "
3838
"USING vchordrq (embedding vector_l2_ops) WITH "
3939
f"(options = $${config}$$)"
4040
)
@@ -44,35 +44,36 @@ def create_namespace(self, namespace: str, dim: int = 96):
4444
self.conn.rollback()
4545
raise err
4646

47-
def insert_text(self, namespace: str, textfile: TextFile):
47+
def is_file_exists(self, doc: Document) -> bool:
48+
cursor = self.conn.execute(
49+
f"SELECT id FROM {self.ns}_meta WHERE digest = %s", (doc.digest,)
50+
)
51+
return cursor.fetchone() is not None
52+
53+
def insert_text(self, doc: Document, chunks: list[Chunk]):
4854
try:
4955
cursor = self.conn.execute(
50-
f"INSERT INTO {namespace}_meta (name, digest) VALUES (%s, %s) RETURNING id",
51-
(textfile.filename, textfile.digest),
56+
f"INSERT INTO {self.ns}_meta (name, digest, updated_at) VALUES (%s, %s, %s) RETURNING id",
57+
(doc.path, doc.digest, doc.updated_at),
5258
)
5359
doc_id = cursor.fetchone()[0]
54-
for sentence in textfile.sentences:
60+
for chunk in chunks:
5561
self.conn.execute(
56-
f"INSERT INTO {namespace} (doc_id, content, embedding) VALUES (%s, %s, %s)",
57-
(doc_id, sentence.text, sentence.vector),
62+
f"INSERT INTO {self.ns} (doc_id, content, embedding) VALUES (%s, %s, %s)",
63+
(doc_id, chunk.text, chunk.vector),
5864
)
59-
logger.debug(
60-
"inserted %s sentences from file %s",
61-
len(textfile.sentences),
62-
textfile.filename,
63-
)
65+
logger.debug("inserted %s sentences from file %s", len(chunks), doc.path)
6466
except psycopg.errors.DatabaseError as err:
6567
logger.error(err)
6668
logger.info("rollback from the previous error")
6769
self.conn.rollback()
6870
raise err
6971

70-
def query(self, namespace: str, query: str, topk: int = 10):
72+
def query(self, query: Chunk, topk: int = 10) -> list[str]:
7173
start = perf_counter()
72-
query = EN_TEXT_PROCESSOR.process(query)
7374
try:
7475
cursor = self.conn.execute(
75-
f"SELECT content FROM {namespace} ORDER BY embedding <-> %s LIMIT %s",
76+
f"SELECT content FROM {self.ns} ORDER BY embedding <-> %s LIMIT %s",
7677
(query.vector, topk),
7778
)
7879
res = cursor.fetchall()

vechord/embedding.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
"""Embedding backends that turn text into fixed-size vectors."""

import os
from abc import ABC, abstractmethod

import numpy as np


class BaseEmbedding(ABC):
    """Interface for text-embedding providers."""

    @abstractmethod
    def vectorize(self, text: str) -> np.ndarray:
        """Return the embedding vector for *text*."""
        raise NotImplementedError

    @abstractmethod
    def get_dim(self) -> int:
        """Return the dimensionality of the vectors produced by this backend."""
        raise NotImplementedError


class SpacyEmbedding(BaseEmbedding):
    """Local embedding backed by a spaCy model's ``tok2vec`` vectors."""

    def __init__(self, model: str = "en_core_web_sm", dim: int = 96):
        # lazy import: spaCy is only required when this backend is selected
        import spacy

        self.nlp = spacy.load(model, enable=["tok2vec"])
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        doc = self.nlp(text)
        return doc.vector


class GeminiEmbedding(BaseEmbedding):
    """Embedding via the Google Gemini API.

    Requires the env var ``GEMINI_API_KEY``.

    Raises:
        ValueError: if ``GEMINI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "models/text-embedding-004", dim: int = 768):
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        import google.generativeai as genai

        # configure the SDK with the key we just validated; without this the
        # SDK falls back to GOOGLE_API_KEY and the check above is meaningless
        genai.configure(api_key=key)
        self.client = genai.embed_content
        self.model = model
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        res = self.client(
            content=text, model=self.model, output_dimensionality=self.dim
        )
        return np.array(res["embedding"])


class OpenAIEmbedding(BaseEmbedding):
    """Embedding via the OpenAI API.

    Requires the env var ``OPENAI_API_KEY``.

    Raises:
        ValueError: if ``OPENAI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "text-embedding-3-large", dim: int = 3072):
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("env OPENAI_API_KEY not set")

        from openai import OpenAI

        # pass the key explicitly so the validation above is authoritative
        self.client = OpenAI(api_key=key)
        self.model = model
        self.dim = dim

    def get_dim(self) -> int:
        return self.dim

    def vectorize(self, text: str) -> np.ndarray:
        return np.array(
            self.client.embeddings.create(
                model=self.model, input=text, dimensions=self.dim
            )
            .data[0]
            .embedding
        )

vechord/extract.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
"""Text extractors that turn a raw ``Document`` into plain text."""

import base64
import os
import unicodedata
from abc import ABC, abstractmethod
from io import BytesIO

import pypdfium2 as pdfium

from vechord.log import logger
from vechord.model import Document


class BaseExtractor(ABC):
    """Interface for document-to-text extraction."""

    @abstractmethod
    def extract_pdf(self, doc: Document) -> str:
        """Return the text content of a PDF document."""
        raise NotImplementedError

    def extract(self, doc: Document) -> str:
        """Dispatch on the file extension and return NFKC-normalized text.

        Unsupported extensions are logged as a warning and yield an empty
        string, so the caller can keep going rather than crash.
        """
        if doc.ext == ".txt":
            text = doc.data.decode("utf-8")
        elif doc.ext == ".pdf":
            text = self.extract_pdf(doc)
        else:
            logger.warning("unsupported file type '%s' for %s", doc.ext, doc.path)
            text = ""
        # normalize so visually-identical unicode characters compare equal
        return unicodedata.normalize("NFKC", text)


class SimpleExtractor(BaseExtractor):
    """Extract the embedded text layer directly from the PDF with pypdfium2."""

    def extract_pdf(self, doc: Document) -> str:
        pdf = pdfium.PdfDocument(doc.data)
        pages = [page.get_textpage().get_text_bounded() for page in pdf]
        return "\n".join(pages)


class GeminiExtractor(BaseExtractor):
    """OCR-style extraction: render each PDF page to an image and ask Gemini.

    Requires the env var ``GEMINI_API_KEY``.

    Raises:
        ValueError: if ``GEMINI_API_KEY`` is not set.
    """

    def __init__(self, model: str = "gemini-2.0-flash-exp"):
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        import google.generativeai as genai

        # configure the SDK with the key we just validated; without this the
        # SDK falls back to GOOGLE_API_KEY and the check above is meaningless
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(model)
        self.prompt = (
            "Extract all the text from the following document and return it exactly as "
            "it appears, without any modifications, summarization, or interpretation"
        )

    def extract_pdf(self, doc: Document) -> str:
        pdf = pdfium.PdfDocument(doc.data)
        text = []
        for page in pdf:
            img = page.render(scale=2).to_pil()  # upscale to make the text clearer
            img_bytes = BytesIO()
            img.save(img_bytes, format="JPEG")
            response = self.model.generate_content(
                [
                    {
                        "mime_type": "image/jpeg",
                        "data": base64.b64encode(img_bytes.getvalue()).decode("utf-8"),
                    },
                    self.prompt,
                ]
            )
            text.append(response.text)

        return "\n".join(text)

0 commit comments

Comments
 (0)