Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ publish: build

test:
@uv run pytest -v tests

sync:
@uv sync --all-extras
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,14 @@ timeline
: Filter
Rerank: ColBERT
```

## Development

```bash
docker run --rm -d -e POSTGRES_PASSWORD=postgres -p 5432:5432 tensorchord/vchord-postgres:pg17-v0.2.0
envd up
# inside the envd env, sync all the dependencies
make sync
# format the code
make format
```
8 changes: 4 additions & 4 deletions test.py → examples/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
LocalLoader,
Pipeline,
SimpleExtractor,
SpacyEmbedding,
SpacySegmenter,
SpacyChunker,
SpacyDenseEmbedding,
VectorChordClient,
)

Expand All @@ -16,8 +16,8 @@
),
loader=LocalLoader("data", include=[".pdf"]),
extractor=SimpleExtractor(),
segmenter=SpacySegmenter(),
emb=SpacyEmbedding(),
chunker=SpacyChunker(),
emb=SpacyDenseEmbedding(),
)
pipe.run()

Expand Down
26 changes: 26 additions & 0 deletions examples/gemini.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from rich import print

from vechord import (
    GeminiAugmenter,
    GeminiDenseEmbedding,
    GeminiExtractor,
    LocalLoader,
    Pipeline,
    VectorChordClient,
    WordLlamaChunker,
)

if __name__ == "__main__":
    # End-to-end example: index local PDFs with Gemini-based extraction,
    # embedding and augmentation, then run a query.
    # Requires GEMINI_API_KEY in the environment and a reachable Postgres
    # with the VectorChord extension (172.17.0.1 is the default Docker
    # bridge gateway — adjust for your setup).
    pipe = Pipeline(
        client=VectorChordClient(
            "local_pdf", "postgresql://postgres:postgres@172.17.0.1:5432/"
        ),
        loader=LocalLoader("data", include=[".pdf"]),  # only pick up PDFs
        extractor=GeminiExtractor(),
        chunker=WordLlamaChunker(),
        emb=GeminiDenseEmbedding(),
        augmenter=GeminiAugmenter(),
    )
    pipe.run()

    print(pipe.query("vector search"))
17 changes: 12 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,15 @@ description = "VectorChord Python SDK"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"en-core-web-sm",
"falcon>=4.0.2",
"httpx>=0.28.1",
"msgspec>=0.19.0",
"numpy>=2.0.2",
"openai>=1.59.7",
"pgvector>=0.3.6",
"pillow>=11.1.0",
"psycopg[binary]>=3.2.3",
"pypdfium2>=4.30.1",
"rich>=13.9.4",
"spacy>=3.8.4",
"trio>=0.28.0",
]

[project.scripts]
Expand All @@ -26,6 +23,16 @@ vechord = "vechord.main:main"
gemini = [
"google-generativeai>=0.8.4",
]
openai = [
"openai>=1.60.2",
]
spacy = [
"en-core-web-sm",
"spacy>=3.8.4",
]
wordllama = [
"wordllama>=0.3.8.post20",
]

[build-system]
requires = ["pdm-backend"]
Expand All @@ -45,7 +52,7 @@ ignore = ["E501"]
[tool.ruff.lint.isort]
known-first-party = ["vechord"]
[tool.ruff.lint.pylint]
max-args = 7
max-args = 5

[tool.pdm]
distribution = true
Expand Down
539 changes: 365 additions & 174 deletions uv.lock

Large diffs are not rendered by default.

21 changes: 14 additions & 7 deletions vechord/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
from vechord.augment import GeminiAugmenter
from vechord.chunk import RegexChunker, SpacyChunker, WordLlamaChunker
from vechord.client import VectorChordClient
from vechord.embedding import GeminiEmbedding, OpenAIEmbedding, SpacyEmbedding
from vechord.embedding import (
GeminiDenseEmbedding,
OpenAIDenseEmbedding,
SpacyDenseEmbedding,
)
from vechord.extract import GeminiExtractor, SimpleExtractor
from vechord.load import LocalLoader
from vechord.model import Chunk, Document
from vechord.pipeline import Pipeline
from vechord.segment import RegexSegmenter, SpacySegmenter

__all__ = [
"Chunk",
"Document",
"GeminiEmbedding",
"GeminiAugmenter",
"GeminiDenseEmbedding",
"GeminiExtractor",
"LocalLoader",
"OpenAIEmbedding",
"OpenAIDenseEmbedding",
"Pipeline",
"RegexSegmenter",
"RegexChunker",
"SimpleExtractor",
"SpacyEmbedding",
"SpacySegmenter",
"SpacyChunker",
"SpacyDenseEmbedding",
"VectorChordClient",
"WordLlamaChunker",
]
115 changes: 115 additions & 0 deletions vechord/augment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
from abc import ABC, abstractmethod
from datetime import timedelta

from vechord.log import logger


class BaseAugmenter(ABC):
    """Interface for chunk/query augmentation backed by a full document."""

    @abstractmethod
    def reset(self, doc: str):
        """Cache the document for augmentation."""
        raise NotImplementedError

    @abstractmethod
    def name(self) -> str:
        """Stable identifier for this augmenter configuration."""
        raise NotImplementedError

    @abstractmethod
    def augment_context(self, chunks: list[str]) -> list[str]:
        """Produce situating context for each chunk within the cached doc."""
        raise NotImplementedError

    @abstractmethod
    def augment_query(self, chunks: list[str]) -> list[str]:
        """Produce retrieval questions for each chunk within the cached doc."""
        raise NotImplementedError

    @abstractmethod
    def summarize_doc(self) -> str:
        """Summarize the cached document."""
        raise NotImplementedError


class GeminiAugmenter(BaseAugmenter):
    def __init__(self, model: str = "models/gemini-1.5-flash-001", ttl_sec: int = 600):
        """Gemini Augmenter with content cache.

        Minimal cache token is 32768.

        Args:
            model: Gemini model name used for generation.
            ttl_sec: time-to-live (seconds) for the Gemini content cache.

        Raises:
            ValueError: if the ``GEMINI_API_KEY`` environment variable is unset.
        """
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("env GEMINI_API_KEY not set")

        # fix: the key was previously validated but never used — keep it so
        # `reset()` can configure the SDK explicitly instead of hoping the
        # library picks it up from the environment on its own
        self.api_key = key
        self.model_name = model
        self.ttl_sec = ttl_sec
        self.min_token = 32768  # Gemini's minimum cacheable content size

    def name(self) -> str:
        return f"gemini_augment_{self.model_name}"

    def reset(self, doc: str):
        """Cache the document for the subsequent augment/summarize calls.

        Documents at or under the cache token minimum are kept in memory and
        prepended to every prompt; larger ones go into the Gemini content
        cache and the client is rebuilt on top of that cached content.
        """
        import google.generativeai as genai

        genai.configure(api_key=self.api_key)
        self.client = genai.GenerativeModel(model_name=self.model_name)
        tokens = self.client.count_tokens(doc).total_tokens
        self.doc = ""  # empty means doc is in the cache
        if tokens <= self.min_token:
            # cannot use cache due to the Gemini token limit
            self.doc = doc
        else:
            logger.debug("use cache since the doc has %d tokens", tokens)
            cache = genai.caching.CachedContent.create(
                model=self.model_name,
                system_instruction=(
                    "You are an expert on the natural language understanding. "
                    "Answer the questions based on the whole document you have access to."
                ),
                contents=doc,
                ttl=timedelta(seconds=self.ttl_sec),
            )
            self.client = genai.GenerativeModel.from_cached_content(
                cached_content=cache
            )

    def augment(self, chunks: list[str], prompt: str) -> list[str]:
        """Run `prompt` (with a ``{chunk}`` placeholder) over each chunk.

        Best-effort: on the first API error, log it and return whatever
        results were collected so far rather than raising.
        """
        res = []
        try:
            for chunk in chunks:
                context = prompt.format(chunk=chunk)
                if self.doc:
                    context = f"<document>{self.doc}</document>\n" + context
                response = self.client.generate_content([context])
                res.append(response.text)
        except Exception as e:
            # fix: a leftover `breakpoint()` here would suspend the process
            # in production on any API error; log and return partial results
            logger.error("GeminiAugmenter error: %s", e)
        return res

    def augment_context(self, chunks: list[str]) -> list[str]:
        """Generate a short situating context for each chunk."""
        prompt = (
            "Here is the chunk we want to situate within the whole document "
            "<chunk>{chunk}</chunk>"
            "Please give a short succinct context to situate this chunk within "
            "the overall document for the purposes of improving search retrieval "
            "of the chunk. Answer only with the succinct context and nothing else."
        )
        return self.augment(chunks, prompt)

    def augment_query(self, chunks: list[str]) -> list[str]:
        """Generate a retrieval-oriented question for each chunk."""
        prompt = (
            "Here is the chunk we want to ask questions about "
            "<chunk>{chunk}</chunk>"
            "Please ask questions about this chunk based on the overall document "
            "for the purposes of improving search retrieval of the chunk. "
            "Answer only with the question and nothing else."
        )
        return self.augment(chunks, prompt)

    def summarize_doc(self) -> str:
        """Summarize the document cached by the last `reset()` call."""
        prompt = (
            "Summarize the provided document concisely while preserving its key "
            "ideas, main arguments, and essential details. Ensure clarity and "
            "coherence, avoiding unnecessary repetition."
        )
        if self.doc:
            prompt = f"<document>{self.doc}</document>\n" + prompt
        response = self.client.generate_content([prompt])
        return response.text
39 changes: 33 additions & 6 deletions vechord/segment.py → vechord/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,20 @@
from abc import ABC, abstractmethod


class BaseSegmenter(ABC):
class BaseChunker(ABC):
@abstractmethod
def segment(self, text: str) -> list[str]:
raise NotImplementedError

@abstractmethod
def name(self) -> str:
raise NotImplementedError


class RegexSegmenter(BaseSegmenter):
class RegexChunker(BaseChunker):
def __init__(
self,
size: int = 1000,
size: int = 1536,
overlap: int = 200,
separator: str = r"\s{2,}",
concat: str = ". ",
Expand All @@ -21,6 +25,9 @@ def __init__(
self.separator = re.compile(separator)
self.concatenator = concat

def name(self) -> str:
return f"regex_chunk_{self.size}_{self.overlap}"

def keep_overlap(self, pieces: list[str]) -> list[str]:
length = 0
i = len(pieces) - 1
Expand Down Expand Up @@ -69,11 +76,31 @@ def segment(self, text: str) -> list[str]:
return [*chunks, remaining] if remaining else chunks


class SpacySegmenter(BaseSegmenter):
def __init__(self):
class SpacyChunker(BaseChunker):
def __init__(self, model: str = "en_core_web_sm"):
"""A semantic sentence Chunker based on SpaCy."""
import spacy

self.nlp = spacy.load("en_core_web_sm", enable=["parser", "tok2vec"])
self.model = model
self.nlp = spacy.load(model, enable=["parser", "tok2vec"])

def name(self) -> str:
return f"spacy_chunk_{self.model}"

def segment(self, text: str) -> list[str]:
return [sent.text for sent in self.nlp(text).sents]


class WordLlamaChunker(BaseChunker):
    def __init__(self, size: int = 1536):
        """A semantic chunker based on WordLlama.

        Args:
            size: target chunk size passed to WordLlama's ``split`` as
                ``target_size`` (presumably characters — TODO confirm
                against the wordllama docs).
        """
        # lazy import so the optional `wordllama` extra is only required
        # when this chunker is actually instantiated
        from wordllama import WordLlama

        self.model = WordLlama.load()
        self.size = size

    def name(self) -> str:
        # identifier embeds the target size so different configurations
        # are distinguishable (e.g. as a storage namespace)
        return f"wordllama_chunk_{self.size}"

    def segment(self, text: str) -> list[str]:
        """Split `text` into semantic chunks of roughly `size` each."""
        return self.model.split(text, target_size=self.size)
Loading