Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,21 @@ name: Python Check
on:
push:
branches: [ "main" ]
paths:
- 'vechord/**'
- 'examples/**'
- '.github/workflows/check.yml'
- 'pyproject.toml'
- 'Makefile'
pull_request:
branches: [ "main" ]
paths:
- 'vechord/**'
- 'examples/**'
- '.github/workflows/check.yml'
- 'pyproject.toml'
- 'Makefile'
workflow_dispatch:

permissions:
contents: read
Expand All @@ -26,8 +39,9 @@ jobs:
- name: Test
env:
PYTEST_ADDOPTS: -s
IMAGE: kemingy/vechord:latest
run: |
docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 ghcr.io/tensorchord/vchord_bm25-postgres:pg17-v0.1.1
docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 ${IMAGE}

# Wait for the container to be healthy
for i in {1..10}; do
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/pages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ on:
- 'docs/**'
- '.github/workflows/pages.yml'
- 'examples/**'
- '**.md'
push:
branches: [ main ]
paths:
- 'vechord/**'
- 'docs/**'
- '.github/workflows/pages.yml'
- 'examples/**'
- '**.md'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

Expand All @@ -35,8 +33,6 @@ jobs:
with:
enable-cache: true
python-version: "3.12"
- name: Set up Rust
uses: dtolnay/rust-toolchain@stable
- name: Install dependencies
run: |
make sync
Expand Down
42 changes: 30 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
<div align="center">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="128" fill="none" viewBox="0 0 200 206">
<defs><path id="a" stroke="#EAB711" d="M0-8h40"/></defs>
<path stroke="#EAB711" stroke-width="16" d="M8 6v200M0 8h40M0 198h40M192 6v200"/>
<use xlink:href="#a" stroke-width="16" transform="matrix(-1 0 0 1 200 16)"/>
<use xlink:href="#a" stroke-width="16" transform="matrix(-1 0 0 1 200 206)"/>
<path fill="#3776AB" d="m75.91 67.91 22.5 70.726h.863l22.545-70.727h21.818L111.545 161H86.182L54.045 67.91z"/>
</svg>
<img src="https://github.com/user-attachments/assets/7b2819bb-1a7d-4b84-9ff9-d0c4d5340da9">

<p>

[![Python Check](https://github.com/tensorchord/vechord/actions/workflows/check.yml/badge.svg)](https://github.com/tensorchord/vechord/actions/workflows/check.yml)
[![Pages](https://github.com/tensorchord/vechord/actions/workflows/pages.yml/badge.svg)]( tensorchord.github.io/vechord/)
![GitHub License](https://img.shields.io/github/license/tensorchord/vechord)
![PyPI - Version](https://img.shields.io/pypi/v/vechord)
[![Discord](https://img.shields.io/discord/974584200327991326?&logoColor=white&color=5865F2&style=flat&logo=discord&cacheSeconds=60)](https://discord.gg/KqswhpVgdU)
[![Python Check][ci-check-badge]][ci-check-file]
[![Pages][ci-page-badge]][document-link]
![GitHub License][license-badge]
![PyPI - Version][pypi-badge]
[![Discord][discord-badge]][discord-link]

</p>
<p><em>Turn PostgreSQL into your search engine in a Pythonic way.</em></p>
Expand All @@ -25,12 +19,23 @@
pip install vechord
```

## Features

- [x] vector search with [RaBitQ][rabitq] (powered by [VectorChord][vectorchord])
- [x] multivec search with [WARP][xtr-warp] (powered by [VectorChord][vectorchord])
- [x] keyword search with BM25 score (powered by [VectorChord-bm25][vectorchord-bm25])
- [x] guarantee data consistency with transactions (use `VechordRegistry.run`)
- [x] provide decorator to inject the data from/to the database
- [x] auto-generate the web service

## Examples

- [simple.py](examples/simple.py): for people that are familiar with specialized vector database APIs
- [beir.py](examples/beir.py): the most flexible way to use the library (loading, indexing, querying and evaluation)
- [web.py](examples/web.py): build a web application from the defined tables and pipeline
- [essay.py](examples/essay.py): extract the content from Paul Graham's essays and evaluate the search results from LLM generated queries
- [contextual.py](examples/contextual.py): contextual retrieval example
- [hybrid.py](examples/hybrid.py): hybrid search that reranks the results from vector search with keyword search

## Development

Expand All @@ -42,3 +47,16 @@ make sync
# format the code
make format
```

[vectorchord]: https://github.com/tensorchord/VectorChord/
[vectorchord-bm25]: https://github.com/tensorchord/VectorChord-bm25
[rabitq]: https://github.com/gaoj0017/RaBitQ
[xtr-warp]: https://github.com/jlscheerer/xtr-warp
[ci-check-badge]: https://github.com/tensorchord/vechord/actions/workflows/check.yml/badge.svg
[ci-check-file]: https://github.com/tensorchord/vechord/actions/workflows/check.yml
[ci-page-badge]: https://github.com/tensorchord/vechord/actions/workflows/pages.yml/badge.svg
[document-link]: https://tensorchord.github.io/vechord/
[license-badge]: https://img.shields.io/github/license/tensorchord/vechord
[pypi-badge]: https://img.shields.io/pypi/v/vechord
[discord-badge]: https://img.shields.io/discord/974584200327991326?&logoColor=white&color=5865F2&style=flat&logo=discord&cacheSeconds=60
[discord-link]: https://discord.gg/KqswhpVgdU
2 changes: 1 addition & 1 deletion docs/source/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

```{eval-rst}
.. automodule:: vechord.spec
:members: Vector,ForeignKey,PrimaryKeyAutoIncrease,Table
:members: Vector,ForeignKey,PrimaryKeyAutoIncrease,Table,Keyword
```

## Augment
Expand Down
6 changes: 3 additions & 3 deletions examples/hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import httpx

from vechord.chunk import RegexChunker
from vechord.embedding import SpacyDenseEmbedding
from vechord.embedding import GeminiDenseEmbedding
from vechord.registry import VechordRegistry
from vechord.rerank import CohereReranker
from vechord.spec import ForeignKey, Keyword, PrimaryKeyAutoIncrease, Table, Vector

URL = "https://paulgraham.com/{}.html"
DenseVector = Vector[96]
emb = SpacyDenseEmbedding()
DenseVector = Vector[768]
emb = GeminiDenseEmbedding()
chunker = RegexChunker(size=1024, overlap=0)
reranker = CohereReranker()

Expand Down
31 changes: 31 additions & 0 deletions examples/simple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from vechord.embedding import GeminiDenseEmbedding
from vechord.registry import VechordRegistry
from vechord.spec import PrimaryKeyAutoIncrease, Table, Vector

DenseVector = Vector[768]


class Document(Table, kw_only=True):
    """Table schema for the simple example: a titled text note plus its vector."""

    # Primary key; left as None when building a new row
    # (presumably filled in by the database on insert -- confirm).
    uid: PrimaryKeyAutoIncrease | None = None
    title: str = ""
    text: str
    # `DenseVector` is the module-level alias `Vector[768]`.
    vec: DenseVector


if __name__ == "__main__":
    # create the registry, register the table, then build the embedding model
    registry = VechordRegistry(
        "simple", "postgresql://postgres:postgres@172.17.0.1:5432/"
    )
    registry.register([Document])
    embedder = GeminiDenseEmbedding()

    # embed a note and store it
    note_text = "my personal long note"
    note_vec = DenseVector(embedder.vectorize_chunk(note_text))
    registry.insert(Document(title="note", text=note_text, vec=note_vec))

    # read back at most one stored document
    stored = registry.select_by(Document.partial_init(), limit=1)
    print(stored)

    # top-1 vector search using the embedded query text
    query_vec = embedder.vectorize_query("note")
    hits = registry.search_by_vector(Document, query_vec, topk=1)
    print(hits)
1 change: 1 addition & 0 deletions examples/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def chunk_document(uid: int, text: str) -> list[Chunk]:


if __name__ == "__main__":
# this pipeline will be used in the web app, or you can run it with `vr.run()`
vr.set_pipeline([load_document, chunk_document])
app = create_web_app(vr)

Expand Down
16 changes: 14 additions & 2 deletions tests/test_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pytest

from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector
from vechord.spec import ForeignKey, Keyword, PrimaryKeyAutoIncrease, Table, Vector


class Document(Table, kw_only=True):
Expand All @@ -20,9 +20,16 @@ class Chunk(Table, kw_only=True):
doc_id: Annotated[int, ForeignKey[Document.uid]]
text: str
vec: Vector[128]
multivec: list[Vector[128]]
keyword: Keyword


@pytest.mark.parametrize("table", [Document, Chunk])
class Simple(Table):
    """Minimal table with plain `int`/`str` fields.

    Included in the `test_storage_cls_methods` parametrization alongside
    `Document` and `Chunk`.
    """

    uid: int
    text: str


@pytest.mark.parametrize("table", [Document, Chunk, Simple])
def test_storage_cls_methods(table: type[Table]):
assert table.name() == table.__name__.lower()
assert "uid" in table.fields()
Expand All @@ -31,13 +38,18 @@ def test_storage_cls_methods(table: type[Table]):
for field in t.fields():
assert getattr(t, field) is msgspec.UNSET

# UNSET won't appear in the `todict` result
assert t.todict() == {}


def test_table_cls_methods():
assert Document.primary_key() == "uid", Document
assert Chunk.primary_key() == "uid", Chunk

assert Document.vector_column() is None
assert Chunk.vector_column() == "vec"
assert Chunk.multivec_column() == "multivec"
assert Chunk.keyword_column() == "keyword"

def find_schema_by_name(schema, name):
for n, t in schema:
Expand Down
39 changes: 38 additions & 1 deletion tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ class Chunk(Table, kw_only=True):
keyword: Keyword


class Sentence(Table, kw_only=True):
    """Table with a multi-vector column, exercised by `test_multi_vec_maxsim`."""

    uid: PrimaryKeyAutoIncrease | None = None
    text: str
    # A list of vectors per row (the test stores one per whitespace-separated word).
    vector: list[DenseVector]


@pytest.fixture(name="registry")
def fixture_registry(request):
registry = VechordRegistry(request.node.name, TEST_POSTGRES)
Expand All @@ -60,6 +66,10 @@ def test_insert_select_remove(registry):
assert inserted[0].text == "hello world"
assert inserted[1].text == "hello there"

# select with limit
one = registry.select_by(Document.partial_init(), limit=1)
assert len(one) == 1

# select by id
first = registry.select_by(Document.partial_init(uid=1))
assert len(first) == 1
Expand Down Expand Up @@ -113,15 +123,42 @@ def create_chunk(uid: int, text: str) -> list[Chunk]:
chunks = registry.select_by(Chunk.partial_init())
assert len(chunks) == len(text.split())

# test search
topk = 3
# vector search
vec_res = registry.search_by_vector(Chunk, gen_vector(), topk=topk)
assert len(vec_res) == topk
assert all(chunk.text in text for chunk in vec_res)
# keyword search
text_res = registry.search_by_keyword(Chunk, "vector", topk=topk)
assert len(text_res) == 1


@pytest.mark.db
def test_multi_vec_maxsim(registry):
    """Insert multi-vector rows, read them back, then run multi-vector search."""
    registry.register([Sentence])

    @registry.inject(output=Sentence)
    def create_sentence(text: str) -> Sentence:
        # one generated vector per whitespace-separated word
        word_vectors = [gen_vector() for _ in text.split()]
        return Sentence(text=text, vector=word_vectors)

    phrase = "the quick brown fox jumps over the lazy dog"
    word_count = len(phrase.split())
    row_count = 32
    for _ in range(row_count):
        create_sentence(phrase)

    stored = registry.select_by(Sentence.partial_init())
    assert len(stored) == row_count
    assert len(stored[0].vector) == word_count

    # query with 1..9 vectors; every query must return exactly `limit` rows
    limit = 3
    for query_size in range(1, 10):
        query = [gen_vector() for _ in range(query_size)]
        hits = registry.search_by_multivec(Sentence, query, topk=limit)
        assert len(hits) == limit


@pytest.mark.db
def test_pipeline(registry):
@registry.inject(output=Document)
Expand Down
58 changes: 56 additions & 2 deletions vechord/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ def create_vector_index(self, name: str, column: str):
)
)

def create_multivec_index(self, name: str, column: str):
    """Create the multi-vector index for `column` if it does not exist yet.

    The index uses the `vchordrq` access method with the `vector_maxsim_ops`
    operator class and is named `{ns}_{name}_{column}_multivec_idx` on the
    namespaced table `{ns}_{name}`.

    Args:
        name: table name without the namespace prefix.
        column: name of the column to index.
    """
    table_name = f"{self.ns}_{name}"
    index_name = f"{table_name}_{column}_multivec_idx"
    statement = sql.SQL(
        "CREATE INDEX IF NOT EXISTS {index} ON "
        "{table} USING vchordrq ({column} vector_maxsim_ops) WITH "
        "(options = $${config}$$);"
    ).format(
        index=sql.Identifier(index_name),
        table=sql.Identifier(table_name),
        column=sql.Identifier(column),
        # index build options, embedded verbatim between the `$$` quotes
        config=sql.SQL("build.internal.lists = []"),
    )
    with self.transaction():
        self.get_cursor().execute(statement)

def _keyword_index_name(self, name: str, column: str):
    """Return the name of the `bm25` index for `column` of table `name`."""
    prefix = f"{self.ns}_{name}_{column}"
    return f"{prefix}_bm25_idx"

Expand All @@ -114,6 +131,7 @@ def select(
raw_columns: Sequence[str],
kvs: Optional[dict[str, Any]] = None,
from_buffer: bool = False,
limit: Optional[int] = None,
):
"""Select from db table with optional key-value condition or from un-committed
transaction buffer.
Expand All @@ -129,12 +147,18 @@ def select(
)
if kvs:
condition = sql.SQL(" AND ").join(
sql.SQL("{} = {}").format(sql.Identifier(col), sql.Placeholder(col))
for col in kvs
sql.SQL("{} IS NULL").format(sql.Identifier(col))
if val is None
else sql.SQL("{} = {}").format(
sql.Identifier(col), sql.Placeholder(col)
)
for col, val in kvs.items()
)
query += sql.SQL(" WHERE {condition}").format(condition=condition)
elif from_buffer:
query += sql.SQL(" WHERE xmin = pg_current_xact_id()::xid;")
if limit:
query += sql.SQL(" LIMIT {}").format(sql.Literal(limit))
cursor.execute(query, kvs)
return [row for row in cursor.fetchall()]

Expand Down Expand Up @@ -199,6 +223,36 @@ def query_vec(
)
return [row for row in cursor.fetchall()]

def query_multivec(  # noqa: PLR0913
    self,
    name: str,
    multivec_col: str,
    vec: np.ndarray,
    max_maxsim_tuples: int,
    return_fields: list[str],
    topk: int = 10,
):
    """Return the top rows of a table ordered by the `@#` operator on a multi-vector column.

    Args:
        name: table name without the namespace prefix.
        multivec_col: name of the multi-vector column to order by.
        vec: query value bound as the right-hand side of `@#`.
        max_maxsim_tuples: value applied to `vchordrq.max_maxsim_tuples`.
        return_fields: column names to select, in order.
        topk: maximum number of rows to return.

    Returns:
        A list of the fetched rows.
    """
    selected = sql.SQL(", ").join(sql.Identifier(field) for field in return_fields)
    set_max_tuples = sql.SQL("SET vchordrq.max_maxsim_tuples = {};").format(
        sql.Literal(max_maxsim_tuples)
    )
    search = sql.SQL(
        "SELECT {columns} FROM {table} ORDER BY {multivec_col} @# %s LIMIT %s;"
    ).format(
        columns=selected,
        table=sql.Identifier(f"{self.ns}_{name}"),
        multivec_col=sql.Identifier(multivec_col),
    )
    with self.transaction():
        cur = self.get_cursor()
        # the search settings are applied before the query in the same transaction
        cur.execute("SET vchordrq.probes = '';")
        cur.execute(set_max_tuples)
        cur.execute(search, (vec, topk))
        return list(cur.fetchall())

def query_keyword( # noqa: PLR0913
self,
name: str,
Expand Down
Loading