Skip to content

Commit 49940f6

Browse files
kemingyCopilot
andauthored
docs: add more guides (#25)
* docs: add more guides Signed-off-by: Keming <kemingyang@tensorchord.ai> * Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Signed-off-by: Keming <kemingyang@tensorchord.ai> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 54f5619 commit 49940f6

File tree

9 files changed

+270
-7
lines changed

9 files changed

+270
-7
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ pip install vechord
4747

4848
## User Guide
4949

50-
For the API references, check our [documentation][document-link].
51-
50+
For more details, check our [API reference][document-api] and [User Guide][document-guide].
5251
### Define the table
5352

5453
```python
@@ -161,7 +160,7 @@ with make_server("", 8000, app) as server:
161160
## Development
162161

163162
```bash
164-
docker run --rm -d --name vdb -e POSTGRES_PASSWORD=postgres -p 5432:5432 ghcr.io/tensorchord/vchord_bm25-postgres:pg17-v0.1.1
163+
docker run --rm -d --name vdb -e POSTGRES_PASSWORD=postgres -p 5432:5432 ghcr.io/tensorchord/vchord-suite:pg17-latest
165164
envd up
166165
# inside the envd env, sync all the dependencies
167166
make sync
@@ -177,6 +176,8 @@ make format
177176
[ci-check-file]: https://github.com/tensorchord/vechord/actions/workflows/check.yml
178177
[ci-page-badge]: https://github.com/tensorchord/vechord/actions/workflows/pages.yml/badge.svg
179178
[document-link]: https://tensorchord.github.io/vechord/
179+
[document-api]: https://tensorchord.github.io/vechord/api.html
180+
[document-guide]: https://tensorchord.github.io/vechord/guide.html
180181
[license-badge]: https://img.shields.io/github/license/tensorchord/vechord
181182
[license-link]: https://github.com/tensorchord/vechord/blob/main/LICENSE
182183
[pypi-badge]: https://img.shields.io/pypi/v/vechord

docs/source/conf.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
extensions = [
1818
"sphinx.ext.autodoc",
19+
"sphinx.ext.intersphinx",
1920
"sphinx.ext.napoleon",
2021
"sphinx.ext.viewcode",
2122
"sphinx.ext.githubpages",
@@ -45,7 +46,20 @@
4546
html_baseurl = "https://tensorchord.github.io/vechord/"
4647
html_extra_path = ["robots.txt"]
4748
# myst
48-
myst_enable_extensions = ["tasklist", "fieldlist", "colon_fence"]
49+
myst_enable_extensions = [
50+
"tasklist",
51+
"fieldlist",
52+
"colon_fence",
53+
"replacements",
54+
"substitution",
55+
"smartquotes",
56+
"html_admonition",
57+
"deflist",
58+
]
59+
myst_ref_domains = ["std", "py"]
60+
intersphinx_mapping = {
61+
"python": ("https://docs.python.org/3", None),
62+
}
4963

5064
# -- Options for HTML output -------------------------------------------------
5165
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

docs/source/guide.md

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# Guide
2+
3+
## Define the table
4+
5+
Inherit from the {py:class}`~vechord.spec.Table` class and define the columns as attributes with the
6+
type hints. Some advanced configuration can be done by using the {py:class}`typing.Annotated`.
7+
8+
### Choose a primary key
9+
10+
- {py:class}`~vechord.spec.PrimaryKeyAutoIncrease`: generate an auto-incrementing integer as the primary key
11+
- {py:class}`~vechord.spec.PrimaryKeyUUID`: use `uuid7` as the primary key, suitable for distributed systems or general purposes
12+
- `int` or `str`: insert the key manually
13+
14+
### Vector and Keyword search
15+
16+
- {py:class}`~vechord.spec.Vector`: define a vector column with dimensions, it's recommended to define something like `DenseVector = Vector[768]` and use it in all tables. This accepts `list[float]` or `numpy.ndarray` as the input. For now, it only supports `f32` type.
17+
- for multivector, use `list[DenseVector]` as the type hint
18+
- {py:class}`~vechord.spec.Keyword`: define a keyword column whose `str` value will be tokenized and stored as the `bm25vector` type. This accepts `str` as the input.
19+
20+
### Configure the Index
21+
22+
The default index is suitable for small datasets (less than 100k). For larger datasets, you can
23+
customize the index configuration by using the {py:class}`typing.Annotated` with:
24+
25+
- {py:class}`~vechord.spec.VectorIndex`: configure the `lists` and `distance` operators.
26+
- {py:class}`~vechord.spec.MultiVectorIndex`: configure the `lists`.
27+
28+
```python
29+
DenseVector = Vector[768]
30+
31+
class MyTable(Table, kw_only=True):
32+
uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
33+
vec: Annotated[DenseVector, VectorIndex(lists=128)]
34+
text: str
35+
```
36+
37+
:::{tip}
38+
If you need to use a customized tokenizer, please refer to the [VectorChord-bm25 document](https://github.com/tensorchord/VectorChord-bm25/?tab=readme-ov-file#more-examples).
39+
:::
40+
41+
### Use the foreign key to link tables
42+
43+
By default, the foreign key will add `REFERENCES ON DELETE CASCADE`.
44+
45+
```python
46+
class SubTable(Table, kw_only=True):
47+
uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
48+
text: str
49+
mytable_uid: Annotated[UUID, ForeignKey[MyTable.uid]]
50+
```
51+
52+
### JSONB
53+
54+
If you want to store a JSONB column, you can define it like this:
55+
56+
```python
57+
from psycopg.types.json import Jsonb
58+
59+
class MyJsonTable(Table, kw_only=True):
60+
uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
61+
json: Jsonb
62+
63+
item = MyJsonTable(json=Jsonb({"key": "value"}))
64+
```
65+
66+
## Inject with decorator
67+
68+
The decorator {py:meth}`~vechord.registry.VechordRegistry.inject` can be used to load the
69+
function arguments from the database and dump the return values to the database.
70+
71+
To use this decorator, you need to specify at least one of the `input` or `output` with
72+
the table class you have defined.
73+
74+
- `input=Type[Table]`: will load the specified columns from the database and inject the data to the decorated function arguments
75+
- if `input=None`, the function will need to pass the arguments manually
76+
- `output=Type[Table]`: will dump the return values to the database (will also need to annotate the return type with the provided table class or a list of the table class)
77+
- if `output=None`, you can get the return value from the function call
78+
79+
The following example uses the pre-defined tables:
80+
81+
- {py:class}`~vechord.spec.DefaultDocument`
82+
- {py:func}`~vechord.spec.create_chunk_with_dim`
83+
84+
```python
85+
from uuid import UUID
86+
import httpx
87+
from vechord.registry import VechordRegistry
88+
from vechord.extract import SimpleExtractor
89+
from vechord.embedding import GeminiDenseEmbedding
90+
from vechord.spec import DefaultDocument, create_chunk_with_dim
91+
92+
DefaultChunk = create_chunk_with_dim(768)
93+
vr = VechordRegistry(namespace="test", url="postgresql://postgres:postgres@127.0.0.1:5432/")
94+
vr.register([DefaultDocument, DefaultChunk])
95+
extractor = SimpleExtractor()
96+
emb = GeminiDenseEmbedding()
97+
98+
99+
@vr.inject(output=DefaultDocument)
100+
def add_document(url: str) -> DefaultDocument:
101+
with httpx.Client() as client:
102+
resp = client.get(url)
103+
text = extractor.extract_html(resp.text)
104+
return DefaultDocument(title=url, text=text)
105+
106+
107+
@vr.inject(input=DefaultDocument, output=DefaultChunk)
108+
def add_chunk(uid: UUID, text: str) -> list[DefaultChunk]:
109+
chunks = text.split("\n")
110+
return [DefaultChunk(doc_id=uid, vec=emb.vectorize_chunk(t), text=t) for t in chunks]
111+
112+
113+
for url in ["https://paulgraham.com/best.html", "https://paulgraham.com/read.html"]:
114+
add_document(url)
115+
add_chunk()
116+
```
117+
118+
### Select/Insert/Delete
119+
120+
We also provide some functions to select, insert and delete the data from the database.
121+
122+
- {py:meth}`~vechord.registry.VechordRegistry.select_by`
123+
- {py:meth}`~vechord.registry.VechordRegistry.insert`
124+
- {py:meth}`~vechord.registry.VechordRegistry.copy_bulk`
125+
- {py:meth}`~vechord.registry.VechordRegistry.remove_by`
126+
127+
```python
128+
docs = vr.select_by(DefaultDocument.partial_init())
129+
vr.insert(DefaultDocument(text="hello world"))
130+
vr.copy_bulk([DefaultDocument(text="hello world"), DefaultDocument(text="hello vector")])
131+
vr.remove_by(DefaultDocument.partial_init())
132+
```
133+
134+
## Transaction
135+
136+
Use the {py:class}`~vechord.registry.VechordPipeline` to run multiple functions in a transaction.
137+
138+
This also guarantees that the decorated functions will only load the data from the current
139+
transaction instead of the whole table. So users can focus on the data processing part.
140+
141+
```python
142+
pipeline = vr.create_pipeline([add_document, add_chunk])
143+
pipeline.run("https://paulgraham.com/best.html")
144+
```
145+
146+
## Search
147+
148+
We provide search interfaces for different types of queries:
149+
150+
- {py:meth}`~vechord.registry.VechordRegistry.search_by_vector`
151+
- {py:meth}`~vechord.registry.VechordRegistry.search_by_keyword`
152+
- {py:meth}`~vechord.registry.VechordRegistry.search_by_multivec`
153+
154+
```python
155+
vr.search_by_vector(DefaultChunk, emb.vectorize_query("hey"), topk=10)
156+
```
157+
158+
## Access the cursor
159+
160+
If you need to change some settings or use the cursor directly:
161+
162+
```python
163+
vr.client.get_cursor().execute("SET vchordrq.probes = 100;")
164+
```

docs/source/index.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ hidden:
2323
caption: User Guide
2424
---
2525
26+
guide
27+
utils
2628
api
2729
example
2830
```

docs/source/utils.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Toolkit
2+
3+
We provide some basic tools to help you build the RAG pipeline, but you are not limited to these
4+
internal tools. You can use whatever you like.
5+
6+
You may need to install with extras:
7+
8+
```bash
9+
pip install vechord[gemini,openai,spacy,cohere]
10+
```
11+
12+
- Augment
13+
- {py:class}`~vechord.augment.GeminiAugmenter`: for contextual retrieval
14+
- Chunk
15+
- {py:class}`~vechord.chunk.RegexChunker`: Regex based chunker
16+
- {py:class}`~vechord.chunk.SpacyChunker`: Spacy based chunker
17+
- {py:class}`~vechord.chunk.GeminiChunker`: Gemini based chunker
18+
- Embedding
19+
- {py:class}`~vechord.embedding.GeminiDenseEmbedding`: Gemini embedding
20+
- {py:class}`~vechord.embedding.OpenAIDenseEmbedding`: OpenAI embedding
21+
- {py:class}`~vechord.embedding.SpacyDenseEmbedding`: Spacy embedding
22+
- Evaluate
23+
- {py:class}`~vechord.evaluate.GeminiEvaluator`: Gemini based evaluator
24+
- Extract
25+
- {py:class}`~vechord.extract.SimpleExtractor`: Simple extractor
26+
- {py:class}`~vechord.extract.GeminiExtractor`: Gemini extractor
27+
- Rerank
28+
- {py:class}`~vechord.rerank.CohereReranker`: Cohere based reranker
29+
- {py:class}`~vechord.rerank.ReciprocalRankFusion`: fuse function for hybrid retrieval

tests/test_table.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import msgspec
66
import numpy as np
77
import pytest
8+
from psycopg.types.json import Jsonb
89

910
from vechord.log import logger
1011
from vechord.registry import VechordRegistry
@@ -108,6 +109,45 @@ def test_annotated_index(registry):
108109
assert len(res) == topk
109110

110111

112+
@pytest.mark.db
113+
def test_keyword_tokenizer(registry):
114+
Tockenizer = Keyword.with_model("wiki_tocken")
115+
116+
class OtherTokenizer(Table, kw_only=True):
117+
uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
118+
text: str
119+
keyword: Tockenizer
120+
121+
registry.register([OtherTokenizer])
122+
num = 20
123+
topk = 5
124+
for text in (f"hello {i}" for i in range(num)):
125+
registry.insert(OtherTokenizer(text=text, keyword=Tockenizer(text)))
126+
127+
inserted = registry.select_by(OtherTokenizer.partial_init(), fields=["text"])
128+
assert len(inserted) == num
129+
130+
res = registry.search_by_keyword(OtherTokenizer, "hello", topk=topk)
131+
assert len(res) == topk
132+
assert all("hello" in record.text for record in res)
133+
134+
135+
@pytest.mark.db
136+
def test_jsonb(registry):
137+
class JsonTable(Table, kw_only=True):
138+
uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
139+
text: str
140+
data: Jsonb
141+
142+
registry.register([JsonTable])
143+
num = 10
144+
for i in range(num):
145+
registry.insert(JsonTable(text=f"hello {i}", data=Jsonb({"key": i})))
146+
147+
inserted = registry.select_by(JsonTable.partial_init(), fields=["text"])
148+
assert len(inserted) == num
149+
150+
111151
@pytest.mark.db
112152
def test_foreign_key(registry):
113153
docs = [

vechord/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,23 @@
1414
from vechord.rerank import CohereReranker
1515
from vechord.service import create_web_app
1616
from vechord.spec import (
17+
DefaultDocument,
1718
ForeignKey,
1819
IndexColumn,
1920
Keyword,
2021
KeywordIndex,
2122
MultiVectorIndex,
2223
PrimaryKeyAutoIncrease,
24+
PrimaryKeyUUID,
2325
Table,
2426
Vector,
2527
VectorIndex,
28+
create_chunk_with_dim,
2629
)
2730

2831
__all__ = [
2932
"CohereReranker",
33+
"DefaultDocument",
3034
"Document",
3135
"ForeignKey",
3236
"GeminiAugmenter",
@@ -41,6 +45,7 @@
4145
"MultiVectorIndex",
4246
"OpenAIDenseEmbedding",
4347
"PrimaryKeyAutoIncrease",
48+
"PrimaryKeyUUID",
4449
"RegexChunker",
4550
"SimpleExtractor",
4651
"SpacyChunker",
@@ -52,5 +57,6 @@
5257
"Vector",
5358
"VectorIndex",
5459
"WordLlamaChunker",
60+
"create_chunk_with_dim",
5561
"create_web_app",
5662
]

vechord/registry.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,11 @@ def register(self, tables: list[type[Table]], create_index: bool = True):
121121
)
122122

123123
def create_pipeline(self, steps: list[Callable]) -> VechordPipeline:
124-
"""Create the :class:`VechordPipeline` to run multiple functions in a transaction."""
124+
"""Create the :class:`VechordPipeline` to run multiple functions in a transaction.
125+
126+
Args:
127+
steps: a list of functions to be run in the pipeline.
128+
"""
125129
return VechordPipeline(client=self.client, steps=steps)
126130

127131
def select_by(
@@ -283,7 +287,11 @@ def remove_by(self, obj: Table):
283287
self.client.delete(obj.__class__.name(), kvs)
284288

285289
def insert(self, obj: Table):
286-
"""Insert the given object to the DB."""
290+
"""Insert the given object to the DB.
291+
292+
Args:
293+
obj: the object to be inserted
294+
"""
287295
if not isinstance(obj, Table):
288296
raise ValueError(f"unsupported class {type(obj)}")
289297
self.client.insert(obj.name(), obj.todict())

vechord/spec.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ def schema(cls) -> str:
194194

195195
@classmethod
196196
def with_model(cls, model: Literal["bert_base_uncased", "wiki_tocken"]) -> Type:
197-
"""TODO: test this"""
198197
cls._model = model
199198
return cls
200199

0 commit comments

Comments
 (0)