docs: add more guides (#25)

kemingy · Copilot · web-flow · commit 49940f6fcd2a · 2025-04-09T09:15:35.000+08:00
* docs: add more guides

Signed-off-by: Keming <kemingyang@tensorchord.ai>

* Update README.md

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Signed-off-by: Keming <kemingyang@tensorchord.ai>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -47,8 +47,7 @@ pip install vechord
 
 ## User Guide
 
-For the API references, check our [documentation][document-link].
-
+For more details, check our [API reference][document-api] and [User Guide][document-guide].
 ### Define the table
 
 ```python
@@ -161,7 +160,7 @@ with make_server("", 8000, app) as server:
 ## Development
 
 ```bash
-docker run --rm -d --name vdb -e POSTGRES_PASSWORD=postgres -p 5432:5432 ghcr.io/tensorchord/vchord_bm25-postgres:pg17-v0.1.1
+docker run --rm -d --name vdb -e POSTGRES_PASSWORD=postgres -p 5432:5432 ghcr.io/tensorchord/vchord-suite:pg17-latest
 envd up
 # inside the envd env, sync all the dependencies
 make sync
@@ -177,6 +176,8 @@ make format
 [ci-check-file]: https://github.com/tensorchord/vechord/actions/workflows/check.yml
 [ci-page-badge]: https://github.com/tensorchord/vechord/actions/workflows/pages.yml/badge.svg
 [document-link]: https://tensorchord.github.io/vechord/
+[document-api]: https://tensorchord.github.io/vechord/api.html
+[document-guide]: https://tensorchord.github.io/vechord/guide.html
 [license-badge]: https://img.shields.io/github/license/tensorchord/vechord
 [license-link]: https://github.com/tensorchord/vechord/blob/main/LICENSE
 [pypi-badge]: https://img.shields.io/pypi/v/vechord
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -16,6 +16,7 @@
 
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.napoleon",
     "sphinx.ext.viewcode",
     "sphinx.ext.githubpages",
@@ -45,7 +46,20 @@
 html_baseurl = "https://tensorchord.github.io/vechord/"
 html_extra_path = ["robots.txt"]
 # myst
-myst_enable_extensions = ["tasklist", "fieldlist", "colon_fence"]
+myst_enable_extensions = [
+    "tasklist",
+    "fieldlist",
+    "colon_fence",
+    "replacements",
+    "substitution",
+    "smartquotes",
+    "html_admonition",
+    "deflist",
+]
+myst_ref_domains = ["std", "py"]
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+}
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
diff --git a/docs/source/guide.md b/docs/source/guide.md
@@ -0,0 +1,164 @@
+# Guide
+
+## Define the table
+
+Inherit from the {py:class}`~vechord.spec.Table` class and define the columns as attributes with
+type hints. Advanced configuration can be done by using {py:class}`typing.Annotated`.
+
+### Choose a primary key
+
+- {py:class}`~vechord.spec.PrimaryKeyAutoIncrease`: generate an auto-incrementing integer as the primary key
+- {py:class}`~vechord.spec.PrimaryKeyUUID`: use `uuid7` as the primary key, suitable for distributed systems or general purposes
+- `int` or `str`: insert the key manually
+
+### Vector and Keyword search
+
+- {py:class}`~vechord.spec.Vector`: define a vector column with dimensions, it's recommended to define something like `DenseVector = Vector[768]` and use it in all tables. This accepts `list[float]` or `numpy.ndarray` as the input. For now, it only supports `f32` type.
+  - for multivector, use `list[DenseVector]` as the type hint
+- {py:class}`~vechord.spec.Keyword`: define a keyword column whose `str` value will be tokenized and stored as the `bm25vector` type. This accepts `str` as the input.
+
+### Configure the Index
+
+The default index is suitable for small datasets (less than 100k). For larger datasets, you can
+customize the index configuration by using the {py:class}`typing.Annotated` with:
+
+- {py:class}`~vechord.spec.VectorIndex`: configure the `lists` and `distance` operators.
+- {py:class}`~vechord.spec.MultiVectorIndex`: configure the `lists`.
+
+```python
+DenseVector = Vector[768]
+
+class MyTable(Table, kw_only=True):
+    uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+    vec: Annotated[DenseVector, VectorIndex(lists=128)]
+    text: str
+```
+
+:::{tip}
+If you need to use a customized tokenizer, please refer to the [VectorChord-bm25 document](https://github.com/tensorchord/VectorChord-bm25/?tab=readme-ov-file#more-examples).
+:::
+
+### Use the foreign key to link tables
+
+By default, the foreign key will add `REFERENCES ON DELETE CASCADE`.
+
+```python
+class SubTable(Table, kw_only=True):
+    uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+    text: str
+    mytable_uid: Annotated[UUID, ForeignKey[MyTable.uid]]
+```
+
+### JSONB
+
+If you want to store a JSONB column, you can define it like this:
+
+```python
+from psycopg.types.json import Jsonb
+
+class MyJsonTable(Table, kw_only=True):
+    uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+    json: Jsonb
+
+item = MyJsonTable(json=Jsonb({"key": "value"}))
+```
+
+## Inject with decorator
+
+The decorator {py:meth}`~vechord.registry.VechordRegistry.inject` can be used to load the
+function arguments from the database and dump the return values to the database.
+
+To use this decorator, you need to specify at least one of the `input` or `output` with
+the table class you have defined.
+
+- `input=Type[Table]`: will load the specified columns from the database and inject the data into the decorated function arguments
+  - if `input=None`, the function will need to pass the arguments manually
+- `output=Type[Table]`: will dump the return values to the database (will also need to annotate the return type with the provided table class or a list of the table class)
+  - if `output=None`, you can get the return value from the function call
+
+The following example uses the pre-defined tables:
+
+- {py:class}`~vechord.spec.DefaultDocument`
+- {py:func}`~vechord.spec.create_chunk_with_dim`
+
+```python
+from uuid import UUID
+import httpx
+from vechord.registry import VechordRegistry
+from vechord.extract import SimpleExtractor
+from vechord.embedding import GeminiDenseEmbedding
+from vechord.spec import DefaultDocument, create_chunk_with_dim
+
+DefaultChunk = create_chunk_with_dim(768)
+vr = VechordRegistry(namespace="test", url="postgresql://postgres:postgres@127.0.0.1:5432/")
+vr.register([DefaultDocument, DefaultChunk])
+extractor = SimpleExtractor()
+emb = GeminiDenseEmbedding()
+
+
+@vr.inject(output=DefaultDocument)
+def add_document(url: str) -> DefaultDocument:
+    with httpx.Client() as client:
+        resp = client.get(url)
+        text = extractor.extract_html(resp.text)
+        return DefaultDocument(title=url, text=text)
+
+
+@vr.inject(input=DefaultDocument, output=DefaultChunk)
+def add_chunk(uid: UUID, text: str) -> list[DefaultChunk]:
+    chunks = text.split("\n")
+    return [DefaultChunk(doc_id=uid, vec=emb.vectorize_chunk(t), text=t) for t in chunks]
+
+
+for url in ["https://paulgraham.com/best.html", "https://paulgraham.com/read.html"]:
+    add_document(url)
+add_chunk()
+```
+
+### Select/Insert/Delete
+
+We also provide some functions to select, insert and delete the data from the database.
+
+- {py:meth}`~vechord.registry.VechordRegistry.select_by`
+- {py:meth}`~vechord.registry.VechordRegistry.insert`
+- {py:meth}`~vechord.registry.VechordRegistry.copy_bulk`
+- {py:meth}`~vechord.registry.VechordRegistry.remove_by`
+
+```python
+docs = vr.select_by(DefaultDocument.partial_init())
+vr.insert(DefaultDocument(text="hello world"))
+vr.copy_bulk([DefaultDocument(text="hello world"), DefaultDocument(text="hello vector")])
+vr.remove_by(DefaultDocument.partial_init())
+```
+
+## Transaction
+
+Use the {py:class}`~vechord.registry.VechordPipeline` to run multiple functions in a transaction.
+
+This also guarantees that the decorated functions will only load the data from the current
+transaction instead of the whole table. So users can focus on the data processing part.
+
+```python
+pipeline = vr.create_pipeline([add_document, add_chunk])
+pipeline.run("https://paulgraham.com/best.html")
+```
+
+## Search
+
+We provide search interfaces for different types of queries:
+
+- {py:meth}`~vechord.registry.VechordRegistry.search_by_vector`
+- {py:meth}`~vechord.registry.VechordRegistry.search_by_keyword`
+- {py:meth}`~vechord.registry.VechordRegistry.search_by_multivec`
+
+```python
+vr.search_by_vector(DefaultChunk, emb.vectorize_query("hey"), topk=10)
+```
+
+## Access the cursor
+
+If you need to change some settings or use the cursor directly:
+
+```python
+vr.client.get_cursor().execute("SET vchordrq.probes = 100;")
+```
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -23,6 +23,8 @@ hidden:
 caption: User Guide
 ---
 
+guide
+utils
 api
 example
 ```
diff --git a/docs/source/utils.md b/docs/source/utils.md
@@ -0,0 +1,29 @@
+# Toolkit
+
+We provide some basic tools to help you build the RAG pipeline, but you are not limited to these
+internal tools. You can use whatever you like.
+
+You may need to install with extras:
+
+```bash
+pip install vechord[gemini,openai,spacy,cohere]
+```
+
+- Augment
+    - {py:class}`~vechord.augment.GeminiAugmenter`: for contextual retrieval
+- Chunk
+    - {py:class}`~vechord.chunk.RegexChunker`: Regex based chunker
+    - {py:class}`~vechord.chunk.SpacyChunker`: Spacy based chunker
+    - {py:class}`~vechord.chunk.GeminiChunker`: Gemini based chunker
+- Embedding
+    - {py:class}`~vechord.embedding.GeminiDenseEmbedding`: Gemini embedding
+    - {py:class}`~vechord.embedding.OpenAIDenseEmbedding`: OpenAI embedding
+    - {py:class}`~vechord.embedding.SpacyDenseEmbedding`: Spacy embedding
+- Evaluate
+    - {py:class}`~vechord.evaluate.GeminiEvaluator`: Gemini based evaluator
+- Extract
+    - {py:class}`~vechord.extract.SimpleExtractor`: Simple extractor
+    - {py:class}`~vechord.extract.GeminiExtractor`: Gemini extractor
+- Rerank
+    - {py:class}`~vechord.rerank.CohereReranker`: Cohere based reranker
+    - {py:class}`~vechord.rerank.ReciprocalRankFusion`: fuse function for hybrid retrieval
diff --git a/tests/test_table.py b/tests/test_table.py
@@ -5,6 +5,7 @@
 import msgspec
 import numpy as np
 import pytest
+from psycopg.types.json import Jsonb
 
 from vechord.log import logger
 from vechord.registry import VechordRegistry
@@ -108,6 +109,45 @@ def test_annotated_index(registry):
     assert len(res) == topk
 
 
+@pytest.mark.db
+def test_keyword_tokenizer(registry):
+    Tockenizer = Keyword.with_model("wiki_tocken")
+
+    class OtherTokenizer(Table, kw_only=True):
+        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+        text: str
+        keyword: Tockenizer
+
+    registry.register([OtherTokenizer])
+    num = 20
+    topk = 5
+    for text in (f"hello {i}" for i in range(num)):
+        registry.insert(OtherTokenizer(text=text, keyword=Tockenizer(text)))
+
+    inserted = registry.select_by(OtherTokenizer.partial_init(), fields=["text"])
+    assert len(inserted) == num
+
+    res = registry.search_by_keyword(OtherTokenizer, "hello", topk=topk)
+    assert len(res) == topk
+    assert all("hello" in record.text for record in res)
+
+
+@pytest.mark.db
+def test_jsonb(registry):
+    class JsonTable(Table, kw_only=True):
+        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+        text: str
+        data: Jsonb
+
+    registry.register([JsonTable])
+    num = 10
+    for i in range(num):
+        registry.insert(JsonTable(text=f"hello {i}", data=Jsonb({"key": i})))
+
+    inserted = registry.select_by(JsonTable.partial_init(), fields=["text"])
+    assert len(inserted) == num
+
+
 @pytest.mark.db
 def test_foreign_key(registry):
     docs = [
diff --git a/vechord/__init__.py b/vechord/__init__.py
@@ -14,19 +14,23 @@
 from vechord.rerank import CohereReranker
 from vechord.service import create_web_app
 from vechord.spec import (
+    DefaultDocument,
     ForeignKey,
     IndexColumn,
     Keyword,
     KeywordIndex,
     MultiVectorIndex,
     PrimaryKeyAutoIncrease,
+    PrimaryKeyUUID,
     Table,
     Vector,
     VectorIndex,
+    create_chunk_with_dim,
 )
 
 __all__ = [
     "CohereReranker",
+    "DefaultDocument",
     "Document",
     "ForeignKey",
     "GeminiAugmenter",
@@ -41,6 +45,7 @@
     "MultiVectorIndex",
     "OpenAIDenseEmbedding",
     "PrimaryKeyAutoIncrease",
+    "PrimaryKeyUUID",
     "RegexChunker",
     "SimpleExtractor",
     "SpacyChunker",
@@ -52,5 +57,6 @@
     "Vector",
     "VectorIndex",
     "WordLlamaChunker",
+    "create_chunk_with_dim",
     "create_web_app",
 ]
diff --git a/vechord/registry.py b/vechord/registry.py
@@ -121,7 +121,11 @@ def register(self, tables: list[type[Table]], create_index: bool = True):
                 )
 
     def create_pipeline(self, steps: list[Callable]) -> VechordPipeline:
-        """Create the :class:`VechordPipeline` to run multiple functions in a transaction."""
+        """Create the :class:`VechordPipeline` to run multiple functions in a transaction.
+
+        Args:
+            steps: a list of functions to be run in the pipeline.
+        """
         return VechordPipeline(client=self.client, steps=steps)
 
     def select_by(
@@ -283,7 +287,11 @@ def remove_by(self, obj: Table):
         self.client.delete(obj.__class__.name(), kvs)
 
     def insert(self, obj: Table):
-        """Insert the given object to the DB."""
+        """Insert the given object to the DB.
+
+        Args:
+            obj: the object to be inserted
+        """
         if not isinstance(obj, Table):
             raise ValueError(f"unsupported class {type(obj)}")
         self.client.insert(obj.name(), obj.todict())
diff --git a/vechord/spec.py b/vechord/spec.py
@@ -194,7 +194,6 @@ def schema(cls) -> str:
 
     @classmethod
     def with_model(cls, model: Literal["bert_base_uncased", "wiki_tocken"]) -> Type:
-        """TODO: test this"""
         cls._model = model
         return cls
 

-Original file line number
+Diff line change
 caption: User Guide
 ---
 +guide
 +utils
 api
 example
 ```