From 957aeea9d4f2acc4d028a3003d780cc3cdb77926 Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 17 Mar 2025 21:09:32 +0800 Subject: [PATCH 1/7] feat: add tests and docs Signed-off-by: Keming --- .github/workflows/check.yml | 5 +- .github/workflows/pages.yml | 66 +++++++ Makefile | 6 + README.md | 40 +--- design.md | 29 +++ docs/Makefile | 20 ++ docs/make.bat | 35 ++++ docs/source/api.md | 70 +++++++ docs/source/conf.py | 57 ++++++ docs/source/example.md | 25 +++ docs/source/index.md | 38 ++++ examples/beir.py | 10 +- examples/contextual.py | 24 +-- examples/essay.py | 12 +- examples/web.py | 9 +- pyproject.toml | 17 ++ tests/test_table.py | 140 +++++++++++++ uv.lock | 378 ++++++++++++++++++++++++++++++++++++ vechord/__init__.py | 2 + vechord/augment.py | 15 +- vechord/chunk.py | 16 +- vechord/client.py | 6 - vechord/embedding.py | 6 + vechord/evaluate.py | 4 + vechord/extract.py | 6 + vechord/load.py | 2 + vechord/registry.py | 101 +++++++--- vechord/service.py | 61 +++++- vechord/spec.py | 18 ++ 29 files changed, 1120 insertions(+), 98 deletions(-) create mode 100644 .github/workflows/pages.yml create mode 100644 design.md create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/api.md create mode 100644 docs/source/conf.py create mode 100644 docs/source/example.md create mode 100644 docs/source/index.md create mode 100644 tests/test_table.py diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index f30c655..204a3eb 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -24,4 +24,7 @@ jobs: - name: Lint run: make lint - name: Test - run: make test + run: | + docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres tensorchord/vchord-postgres:pg17-v0.2.1 + make test + docker stop vdb diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000..2a4e09f --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,66 @@ +name: Pages + +on: + pull_request: + paths: + - 'vechord/**' + - 'docs/**' + - '.github/workflows/pages.yml' + - 'examples/**' + - '**.md' + push: + branches: [ main ] + paths: + - 'vechord/**' + - 'docs/**' + - '.github/workflows/pages.yml' + - 'examples/**' + - '**.md' + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +concurrency: + group: ${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + python-version: "3.12" + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + - name: Install dependencies + run: | + make sync + - name: Generate docs + run: | + cd docs && make html + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + # Upload entire repository + path: 'docs/build/html' + + deploy: + runs-on: ubuntu-latest + needs: build + if: ${{ github.event_name == 'push' }} + # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages + permissions: + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/Makefile b/Makefile index 616a1b1..982a495 100644 --- a/Makefile +++ b/Makefile @@ -24,3 +24,9 @@ test: sync: @uv sync --all-extras --all-groups + +doc: + @cd docs && make html && cd .. + @uv run -m http.server -d docs/build/html -b 127.0.0.1 8000 + +.PHONY: lint format test doc diff --git a/README.md b/README.md index 6670ab7..09055fb 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,17 @@ -# vechord - Python RAG framework built on top of PostgreSQL and [VectorChord](https://github.com/tensorchord/VectorChord/). -## Diagram +## Installation -```mermaid -timeline - title RAG - section Ingestion - Source: Local - : Google Drive - : Dropbox - : Notion - File: Document - : Image - : Audio - Chunk: Text - : Entities - : Embedding - section Query - Analysis: Expansion - : Keyword - : Embedding - Search: Vector Search - : Full Text Search - : Filter - Rerank: ColBERT - section Evaluation - Metric: MAP - : Recall - : NDCG +```sh +pip install vechord ``` ## Examples -- [beir.py](./examples/beir.py): the most flexible way to use the library (loading, indexing, querying and evaluation) -- [web.py](./examples/web.py): build a web application with from the defined tables and pipeline -- [essay.py](./examples/essay.py): extract the content from Paul Graham's essays and evaluate the search results from LLM generated queries -- [contextual.py](./examples/contextual.py): contextual retrieval example +- [beir.py](examples/beir.py): the most flexible way to use the library (loading, indexing, querying and evaluation) +- [web.py](examples/web.py): build a web application with from the defined tables and pipeline +- [essay.py](examples/essay.py): extract the content from Paul Graham's essays and evaluate the search results from LLM generated queries +- [contextual.py](examples/contextual.py): contextual retrieval example ## Development diff --git a/design.md b/design.md new file mode 100644 index 0000000..878f2cf --- /dev/null +++ b/design.md @@ -0,0 +1,29 @@ +## Diagram + +```mermaid +timeline + title RAG + section Ingestion + Source: Local + : Google Drive + : Dropbox + : Notion + File: Document + : Image + : Audio + Chunk: Text + : Entities + : Embedding + section Query + Analysis: Expansion + : Keyword + : Embedding + Search: Vector Search + : Full Text Search + : Filter + Rerank: ColBERT + section Evaluation + Metric: MAP + : Recall + : NDCG +``` diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..faf5089 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= uv run sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/api.md b/docs/source/api.md new file mode 100644 index 0000000..783140f --- /dev/null +++ b/docs/source/api.md @@ -0,0 +1,70 @@ +# Interface + +## VechordRegistry + +```{eval-rst} +.. automodule:: vechord.registry + :members: VechordRegistry +``` + +## Types + +```{eval-rst} +.. automodule:: vechord.spec + :members: Vector,ForeignKey,PrimaryKeyAutoIncrease,Table +``` + +## Augment + +```{eval-rst} +.. automodule:: vechord.augment + :members: + :show-inheritance: +``` + +## Chunk + +```{eval-rst} +.. automodule:: vechord.chunk + :members: + :show-inheritance: +``` + +## Embedding + +```{eval-rst} +.. automodule:: vechord.embedding + :members: + :show-inheritance: +``` + +## Evaluate + +```{eval-rst} +.. automodule:: vechord.evaluate + :members: + :show-inheritance: +``` + +## Extract + +```{eval-rst} +.. automodule:: vechord.extract + :members: + :show-inheritance: +``` + +## Load + +```{eval-rst} +.. automodule:: vechord.load + :members: + :show-inheritance: +``` + +## Service + +```{eval-rst} +.. automodule:: vechord.service + :members: +``` diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5f9688c --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,57 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "vechord" +copyright = "2025, TensorChord" +author = "TensorChord" +release = "latest" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "myst_parser", + "sphinx_autodoc_typehints", + "sphinxext.opengraph", + "sphinx_sitemap", +] + +templates_path = ["_templates"] +exclude_patterns = [] +source_suffix = [".rst", ".md"] + +# Extensions +myst_heading_anchors = 3 +autodoc_member_order = "bysource" +# napoleon +napoleon_attr_annotations = True +napoleon_include_init_with_doc = True +napoleon_use_admonition_for_references = True +# opengraph +ogp_site_url = "https://github.com/tensorchord/vechord" +ogp_image = "https://github.com/tensorchord/vechord" +# sitemap +html_baseurl = "https://tensorchord.github.io/vechord/" +html_extra_path = ["robots.txt"] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "furo" +html_static_path = ["_static"] +html_theme_options = { + "sidebar_hide_name": True, + "navigation_with_keys": True, + "source_repository": "https://github.com/tensorchord/vechord", + "source_branch": "main", + "source_directory": "docs/source", +} diff --git a/docs/source/example.md b/docs/source/example.md new file mode 100644 index 0000000..9385453 --- /dev/null +++ b/docs/source/example.md @@ -0,0 +1,25 @@ +# Examples + +## BEIR evaluation + +```{include} ../../examples/beir.py +:code: python +``` + +## HTTP web service + +```{include} ../../examples/web.py +:code: python +``` + +## Contextual chunk augmentation + +```{include} ../../examples/contextual.py +:code: python +``` + +## Evaluate with generated queries + +```{include} ../../examples/essay.py +:code: python +``` diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..f304b6a --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,38 @@ +# Vechord + +```{eval-rst} +.. meta:: + :description lang=en: + Python RAG framework built on top of PostgreSQL and VectorChord. +``` + +Vechord is a Python RAG framework built on top of PostgreSQL and [VectorChord](https://github.com/tensorchord/VectorChord/). + +## Installation + +```sh +pip install vechord +``` + +```{toctree} +--- +hidden: +caption: User Guide +--- + +api +example +``` + +```{toctree} +--- +hidden: +caption: Project Links +--- + +GitHub +``` + +## Indices + +- {ref}`genindex` diff --git a/examples/beir.py b/examples/beir.py index 6a1b331..8e55e78 100644 --- a/examples/beir.py +++ b/examples/beir.py @@ -16,6 +16,9 @@ DEFAULT_DATASET = "scifact" TOP_K = 10 +emb = GeminiDenseEmbedding() +DenseVector = Vector[768] + def download_dataset(dataset: str, output: Path): output.mkdir(parents=True, exist_ok=True) @@ -49,14 +52,14 @@ class Corpus(Table): uid: str text: str title: str - vector: Vector[768] + vector: DenseVector class Query(Table): uid: str cid: str text: str - vector: Vector[768] + vector: DenseVector class Evaluation(msgspec.Struct): @@ -67,7 +70,6 @@ class Evaluation(msgspec.Struct): vr = VechordRegistry(DEFAULT_DATASET, "postgresql://postgres:postgres@172.17.0.1:5432/") vr.register([Corpus, Query]) -emb = GeminiDenseEmbedding() @vr.inject(output=Corpus) @@ -118,7 +120,7 @@ def load_query(dataset: str, output: Path) -> Iterator[Query]: @vr.inject(input=Query) -def evaluate(cid: str, vector: Vector[768]) -> Evaluation: +def evaluate(cid: str, vector: DenseVector) -> Evaluation: docs: list[Corpus] = vr.search(Corpus, vector, topk=TOP_K) score = BaseEvaluator.evaluate_one(cid, [doc.uid for doc in docs]) return Evaluation( diff --git a/examples/contextual.py b/examples/contextual.py index 6b5e502..668acd9 100644 --- a/examples/contextual.py +++ b/examples/contextual.py @@ -3,11 +3,11 @@ from vechord import ( GeminiAugmenter, + GeminiDenseEmbedding, GeminiEvaluator, LocalLoader, RegexChunker, SimpleExtractor, - SpacyDenseEmbedding, ) from vechord.registry import VechordRegistry from vechord.spec import ( @@ -17,6 +17,9 @@ Vector, ) +emb = GeminiDenseEmbedding() +DenseVector = Vector[768] + class Document(Table, kw_only=True): uid: Optional[PrimaryKeyAutoIncrease] = None @@ -31,13 +34,13 @@ class Chunk(Table, kw_only=True): doc_uid: Annotated[int, ForeignKey[Document.uid]] seq_id: int text: str - vector: Vector[96] + vector: DenseVector class ContextChunk(Table, kw_only=True): chunk_uid: Annotated[int, ForeignKey[Chunk.uid]] text: str - vector: Vector[96] + vector: DenseVector vr = VechordRegistry("decorator", "postgresql://postgres:postgres@172.17.0.1:5432/") @@ -59,15 +62,12 @@ def load_from_dir(dirpath: str) -> list[Document]: ] -dense = SpacyDenseEmbedding() - - @vr.inject(input=Document, output=Chunk) def split_document(uid: int, text: str) -> list[Chunk]: chunker = RegexChunker(overlap=0) chunks = chunker.segment(text) return [ - Chunk(doc_uid=uid, seq_id=i, text=chunk, vector=dense.vectorize_chunk(chunk)) + Chunk(doc_uid=uid, seq_id=i, text=chunk, vector=emb.vectorize_chunk(chunk)) for i, chunk in enumerate(chunks) ] @@ -75,7 +75,7 @@ def split_document(uid: int, text: str) -> list[Chunk]: @vr.inject(input=Document, output=ContextChunk) def context_embedding(uid: int, text: str) -> list[ContextChunk]: chunks: list[Chunk] = vr.select_by( - Chunk, Chunk.partial_init(doc_uid=uid), fields=["uid", "text"] + Chunk.partial_init(doc_uid=uid), fields=["uid", "text"] ) augmentor = GeminiAugmenter() augmentor.reset(text) @@ -89,7 +89,7 @@ def context_embedding(uid: int, text: str) -> list[ContextChunk]: ] return [ ContextChunk( - chunk_uid=chunk_uid, text=augmented, vector=dense.vectorize_chunk(augmented) + chunk_uid=chunk_uid, text=augmented, vector=emb.vectorize_chunk(augmented) ) for (chunk_uid, augmented) in zip( [c.uid for c in chunks], context_chunks, strict=False @@ -98,7 +98,7 @@ def context_embedding(uid: int, text: str) -> list[ContextChunk]: def query_chunk(query: str) -> list[Chunk]: - vector = dense.vectorize_query(query) + vector = emb.vectorize_query(query) res: list[Chunk] = vr.search( Chunk, vector, @@ -109,7 +109,7 @@ def query_chunk(query: str) -> list[Chunk]: def query_context_chunk(query: str) -> list[ContextChunk]: - vector = dense.vectorize_query(query) + vector = emb.vectorize_query(query) res: list[ContextChunk] = vr.search( ContextChunk, vector, @@ -122,7 +122,7 @@ def query_context_chunk(query: str) -> list[ContextChunk]: @vr.inject(input=Chunk) def evaluate(uid: int, doc_uid: int, text: str): evaluator = GeminiEvaluator() - doc: Document = vr.select_by(Document, Document.partial_init(uid=doc_uid))[0] + doc: Document = vr.select_by(Document.partial_init(uid=doc_uid))[0] query = evaluator.produce_query(doc.text, text) retrieved = query_chunk(query) score = evaluator.evaluate_one(uid, [r.uid for r in retrieved]) diff --git a/examples/essay.py b/examples/essay.py index bf60805..a4e2d92 100644 --- a/examples/essay.py +++ b/examples/essay.py @@ -18,6 +18,10 @@ ARTICLE = "best" TOP_K = 10 +DenseVector = Vector[768] +emb = GeminiDenseEmbedding() +evaluator = GeminiEvaluator() + class EssayParser(HTMLParser): def __init__(self, *, convert_charrefs: bool = ...) -> None: @@ -41,14 +45,14 @@ def handle_data(self, data: str) -> None: class Chunk(Table, kw_only=True): uid: PrimaryKeyAutoIncrease | None = None text: str - vector: Vector[768] + vector: DenseVector class Query(Table, kw_only=True): uid: PrimaryKeyAutoIncrease | None = None cid: Annotated[int, ForeignKey[Chunk.uid]] text: str - vector: Vector[768] + vector: DenseVector class Evaluation: @@ -59,8 +63,6 @@ class Evaluation: vr = VechordRegistry(ARTICLE, "postgresql://postgres:postgres@172.17.0.1:5432/") vr.register([Chunk, Query]) -emb = GeminiDenseEmbedding() -evaluator = GeminiEvaluator() with httpx.Client() as client: resp = client.get(URL.format(ARTICLE)) @@ -83,7 +85,7 @@ def create_query(uid: int, text: str) -> Query: @vr.inject(input=Query) -def evaluate(cid: int, vector: Vector[768]) -> Evaluation: +def evaluate(cid: int, vector: DenseVector) -> Evaluation: chunks: list[Chunk] = vr.search(Chunk, vector, topk=TOP_K) score = evaluator.evaluate_one(cid, [chunk.uid for chunk in chunks]) return Evaluation( diff --git a/examples/web.py b/examples/web.py index 12b4afc..c50af52 100644 --- a/examples/web.py +++ b/examples/web.py @@ -6,7 +6,7 @@ import msgspec from vechord.chunk import RegexChunker -from vechord.embedding import SpacyDenseEmbedding +from vechord.embedding import GeminiDenseEmbedding from vechord.registry import VechordRegistry from vechord.service import create_web_app from vechord.spec import ( @@ -17,6 +17,9 @@ ) URL = "https://paulgraham.com/{}.html" +DenseVector = Vector[768] +emb = GeminiDenseEmbedding() +chunker = RegexChunker(size=1024, overlap=0) class EssayParser(HTMLParser): @@ -49,13 +52,11 @@ class Chunk(Table, kw_only=True): uid: PrimaryKeyAutoIncrease | None = None doc_id: Annotated[int, ForeignKey[Document.uid]] text: str - vector: Vector[96] + vector: DenseVector vr = VechordRegistry("http", "postgresql://postgres:postgres@172.17.0.1:5432/") vr.register([Document, Chunk]) -emb = SpacyDenseEmbedding() -chunker = RegexChunker(size=1024, overlap=0) @vr.inject(output=Document) diff --git a/pyproject.toml b/pyproject.toml index 3a25e1a..6a7738d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "VectorChord Python SDK" readme = "README.md" requires-python = ">=3.10" dependencies = [ + "defspec>=0.3.0", "falcon>=4.0.2", "httpx>=0.28.1", "msgspec>=0.19.0", @@ -46,6 +47,20 @@ dev = [ "pytest>=8.3.5", "ruff>=0.9.1", ] +doc = [ + "furo>=2024.8.6", + "myst-parser>=4.0.1", + "sphinx>=8.1.3", + "sphinx-autodoc-typehints>=3.0.1", + "sphinx-sitemap>=2.6.0", + "sphinxcontrib-napoleon>=0.7", + "sphinxext-opengraph>=0.9.1", +] + +[tool.pytest.ini_options] +markers = [ + "db", +] [tool.ruff] target-version = "py310" @@ -56,6 +71,8 @@ ignore = ["E501"] known-first-party = ["vechord"] [tool.ruff.lint.pylint] max-args = 5 +[tool.ruff.lint.pydocstyle] +convention = "google" [tool.mypy] python_version = "3.10" diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 0000000..d119dff --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,140 @@ +from datetime import datetime +from typing import Annotated + +import msgspec +import numpy as np +import pytest + +from vechord.registry import VechordRegistry +from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector + +TEST_POSTGRES = "postgresql://postgres:postgres@172.17.0.1:5432/" +DenseVector = Vector[128] + + +def gen_vector(): + rng = np.random.default_rng() + return rng.random((128,), dtype=np.float32) + + +class Document(Table, kw_only=True): + uid: PrimaryKeyAutoIncrease | None = None + title: str = "" + text: str + updated_at: datetime = msgspec.field(default_factory=datetime.now) + + +class Chunk(Table, kw_only=True): + uid: PrimaryKeyAutoIncrease | None = None + doc_id: Annotated[int, ForeignKey[Document.uid]] + text: str + vector: DenseVector + + +@pytest.fixture +def registry(): + registry = VechordRegistry("test", TEST_POSTGRES) + registry.register([Document, Chunk]) + yield registry + registry.clear_storage(drop_table=True) + registry.pipeline.clear() + + +@pytest.mark.db +def test_insert_select_remove(registry): + docs = [Document(text="hello world"), Document(text="hello there")] + for doc in docs: + registry.insert(doc) + + # select all + inserted = registry.select_by(Document.partial_init(), fields=["text"]) + assert len(inserted) == len(docs) + assert inserted[0].text == "hello world" + assert inserted[1].text == "hello there" + + # select by id + first = registry.select_by(Document.partial_init(uid=1)) + assert len(first) == 1 + assert first[0].text == "hello world" + + # remove by id + registry.remove_by(Document.partial_init(uid=2)) + assert len(registry.select_by(Document.partial_init())) == 1 + + +@pytest.mark.db +def test_foreign_key(registry): + docs = [ + Document(text="hello world"), + Document(text="hello there"), + ] + chunks = [ + Chunk(doc_id=1, text="hello", vector=gen_vector()), + Chunk(doc_id=1, text="world", vector=gen_vector()), + ] + for record in docs + chunks: + registry.insert(record) + + registry.remove_by(Document.partial_init(uid=1)) + assert len(registry.select_by(Document.partial_init())) == 1 + assert len(registry.select_by(Chunk.partial_init())) == 0 + + +@pytest.mark.db +def test_injection(registry): + @registry.inject(output=Document) + def create_doc(text: str) -> Document: + return Document(text=text) + + @registry.inject(input=Document, output=Chunk) + def create_chunk(uid: int, text: str) -> list[Chunk]: + return [Chunk(doc_id=uid, text=t, vector=gen_vector()) for t in text.split()] + + text = "hello world what happened to vector search" + create_doc(text) + create_chunk() + + docs = registry.select_by(Document.partial_init()) + assert len(docs) == 1 + + chunks = registry.select_by(Chunk.partial_init()) + assert len(chunks) == len(text.split()) + + # test search + topk = 3 + res = registry.search(Chunk, gen_vector(), topk=topk) + assert len(res) == topk + assert all(chunk.text in text for chunk in res) + + +@pytest.mark.db +def test_pipeline(registry): + @registry.inject(output=Document) + def create_doc(text: str) -> Document: + return Document(text=text) + + @registry.inject(input=Document, output=Chunk) + def create_chunk(uid: int, text: str) -> list[Chunk]: + nums = [int(x) for x in text.split()] + return [ + Chunk(doc_id=uid, text=f"num[{num}]", vector=gen_vector()) for num in nums + ] + + correct = "1 2 3 4 5" + error = "100 0.1 no no" + registry.set_pipeline([create_doc, create_chunk]) + + registry.run(correct) + docs = registry.select_by(Document.partial_init()) + assert len(docs) == 1 + chunks = registry.select_by(Chunk.partial_init()) + assert len(chunks) == len(correct.split()) + + # break the transaction won't add new records + with pytest.raises(ValueError): + registry.run(error) + + docs = registry.select_by(Document.partial_init()) + assert len(docs) == 1 + chunks = registry.select_by(Chunk.partial_init()) + assert len(chunks) == len(correct.split()) diff --git a/uv.lock b/uv.lock index e53f7aa..e1862cf 100644 --- a/uv.lock +++ b/uv.lock @@ -7,6 +7,15 @@ resolution-markers = [ "python_full_version < '3.11'", ] +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929 }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -31,6 +40,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, ] +[[package]] +name = "babel" +version = "2.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537 }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f0/3c/adaf39ce1fb4afdd21b611e3d530b183bb7759c9b673d60db0e347fd4439/beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b", size = 619516 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/49/6abb616eb3cbab6a7cca303dc02fdf3836de2e0b834bf966a7f5271a34d8/beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16", size = 186015 }, +] + [[package]] name = "blis" version = "1.2.0" @@ -233,6 +264,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090 }, ] +[[package]] +name = "defspec" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "msgspec" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/3a/7b263e6edc9fcf40e456df10ad3b3162d0a14709f42503ee8326b70e5cf2/defspec-0.3.0.tar.gz", hash = "sha256:3809227eae1af54fe36159b726fb1b7c7531f5f5f73cc9676258f477f4ec8e53", size = 14457 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/ed/6c391eb8fdf87a859245eea8f3fadd428a19dad281c0f29e6ece1a8e7afc/defspec-0.3.0-py3-none-any.whl", hash = "sha256:fb4095917c62119ce0343d68063cce737fb81970634c02de960500a8092d6450", size = 10908 }, +] + [[package]] name = "distro" version = "1.9.0" @@ -242,6 +286,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + [[package]] name = "en-core-web-sm" version = "3.8.0" @@ -318,6 +371,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/86/5486b0188d08aa643e127774a99bac51ffa6cf343e3deb0583956dca5b22/fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2", size = 183862 }, ] +[[package]] +name = "furo" +version = "2024.8.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "pygments" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "sphinx-basic-ng" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a0/e2/d351d69a9a9e4badb4a5be062c2d0e87bd9e6c23b5e57337fef14bef34c8/furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01", size = 1661506 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/48/e791a7ed487dbb9729ef32bb5d1af16693d8925f4366befef54119b2e576/furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c", size = 341333 }, +] + [[package]] name = "google-ai-generativelanguage" version = "0.6.15" @@ -562,6 +631,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769 }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -791,6 +869,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, ] +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/03/a2ecab526543b152300717cf232bb4bb8605b6edb946c845016fa9c9c9fd/mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5", size = 43542 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/f7/7782a043553ee469c1ff49cfa1cdace2d6bf99a1f333cf38676b3ddf30da/mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636", size = 55316 }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -919,6 +1009,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/e2/5d3f6ada4297caebe1a2add3b126fe800c96f56dbe5d1988a2cbe0b267aa/mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", size = 4695 }, ] +[[package]] +name = "myst-parser" +version = "4.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { name = "jinja2" }, + { name = "markdown-it-py" }, + { name = "mdit-py-plugins" }, + { name = "pyyaml" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/a5/9626ba4f73555b3735ad86247a8077d4603aa8628537687c839ab08bfe44/myst_parser-4.0.1.tar.gz", hash = "sha256:5cfea715e4f3574138aecbf7d54132296bfd72bb614d31168f48c477a830a7c4", size = 93985 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/df/76d0321c3797b54b60fef9ec3bd6f4cfd124b9e422182156a1dd418722cf/myst_parser-4.0.1-py3-none-any.whl", hash = "sha256:9134e88959ec3b5780aedf8a99680ea242869d012e8821db3126d427edc9c95d", size = 84579 }, +] + [[package]] name = "numpy" version = "2.2.1" @@ -1106,6 +1214,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "pockets" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/8e/0601097cfcce2e8c2297db5080e9719f549c2bd4b94420ddc8d3f848bbca/pockets-0.9.1.tar.gz", hash = "sha256:9320f1a3c6f7a9133fe3b571f283bcf3353cd70249025ae8d618e40e9f7e92b3", size = 24993 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/2f/a4583c70fbd8cd04910e2884bcc2bdd670e884061f7b4d70bc13e632a993/pockets-0.9.1-py2.py3-none-any.whl", hash = "sha256:68597934193c08a08eb2bf6a1d85593f627c22f9b065cc727a4f03f669d96d86", size = 26263 }, +] + [[package]] name = "preshed" version = "3.0.9" @@ -1494,6 +1614,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, ] +[[package]] +name = "roman-numerals-py" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/76/48fd56d17c5bdbdf65609abbc67288728a98ed4c02919428d4f52d23b24b/roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d", size = 9017 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742 }, +] + [[package]] name = "rsa" version = "4.9" @@ -1571,6 +1700,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 }, ] +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + [[package]] name = "smart-open" version = "7.1.0" @@ -1592,6 +1730,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, ] +[[package]] +name = "snowballstemmer" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/7b/af302bebf22c749c56c9c3e8ae13190b5b5db37a33d9068652e8f73b7089/snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", size = 86699 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/dc/c02e01294f7265e63a7315fe086dd1df7dacb9f840a804da846b96d01b96/snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a", size = 93002 }, +] + +[[package]] +name = "soupsieve" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, +] + [[package]] name = "spacy" version = "3.8.4" @@ -1653,6 +1809,206 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343 }, ] +[[package]] +name = "sphinx" +version = "8.1.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +dependencies = [ + { name = "alabaster", marker = "python_full_version < '3.11'" }, + { name = "babel", marker = "python_full_version < '3.11'" }, + { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" }, + { name = "docutils", marker = "python_full_version < '3.11'" }, + { name = "imagesize", marker = "python_full_version < '3.11'" }, + { name = "jinja2", marker = "python_full_version < '3.11'" }, + { name = "packaging", marker = "python_full_version < '3.11'" }, + { name = "pygments", marker = "python_full_version < '3.11'" }, + { name = "requests", marker = "python_full_version < '3.11'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.11'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.11'" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125 }, +] + +[[package]] +name = "sphinx" +version = "8.2.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version >= '3.11' and python_full_version < '3.13'", +] +dependencies = [ + { name = "alabaster", marker = "python_full_version >= '3.11'" }, + { name = "babel", marker = "python_full_version >= '3.11'" }, + { name = "colorama", marker = "python_full_version >= '3.11' and sys_platform == 'win32'" }, + { name = "docutils", marker = "python_full_version >= '3.11'" }, + { name = "imagesize", marker = "python_full_version >= '3.11'" }, + { name = "jinja2", marker = "python_full_version >= '3.11'" }, + { name = "packaging", marker = "python_full_version >= '3.11'" }, + { name = "pygments", marker = "python_full_version >= '3.11'" }, + { name = "requests", marker = "python_full_version >= '3.11'" }, + { name = "roman-numerals-py", marker = "python_full_version >= '3.11'" }, + { name = "snowballstemmer", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.11'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/ad/4360e50ed56cb483667b8e6dadf2d3fda62359593faabbe749a27c4eaca6/sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348", size = 8321876 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3", size = 3589741 }, +] + +[[package]] +name = "sphinx-autodoc-typehints" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/f0/43c6a5ff3e7b08a8c3b32f81b859f1b518ccc31e45f22e2b41ced38be7b9/sphinx_autodoc_typehints-3.0.1.tar.gz", hash = "sha256:b9b40dd15dee54f6f810c924f863f9cf1c54f9f3265c495140ea01be7f44fa55", size = 36282 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/dc/dc46c5c7c566b7ec5e8f860f9c89533bf03c0e6aadc96fb9b337867e4460/sphinx_autodoc_typehints-3.0.1-py3-none-any.whl", hash = "sha256:4b64b676a14b5b79cefb6628a6dc8070e320d4963e8ff640a2f3e9390ae9045a", size = 20245 }, +] + +[[package]] +name = "sphinx-autodoc-typehints" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version >= '3.11' and python_full_version < '3.13'", +] +dependencies = [ + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/cc/d38e7260b1bd3af0c84ad8285dfd78236584b74544510584e07963e000ec/sphinx_autodoc_typehints-3.1.0.tar.gz", hash = "sha256:a6b7b0b6df0a380783ce5b29150c2d30352746f027a3e294d37183995d3f23ed", size = 36528 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/bc5bed0677ae00b9ca7919968ea675e2f696b6b20f1648262f26a7a6c6b4/sphinx_autodoc_typehints-3.1.0-py3-none-any.whl", hash = "sha256:67bdee7e27ba943976ce92ebc5647a976a7a08f9f689a826c54617b96a423913", size = 20404 }, +] + +[[package]] +name = "sphinx-basic-ng" +version = "1.0.0b2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/0b/a866924ded68efec7a1759587a4e478aec7559d8165fac8b2ad1c0e774d6/sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9", size = 20736 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/dd/018ce05c532a22007ac58d4f45232514cd9d6dd0ee1dc374e309db830983/sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b", size = 22496 }, +] + +[[package]] +name = "sphinx-sitemap" +version = "2.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/ed/96cc112b671e06df01c8306c25ce3331ccfece0d30235e32eb039afc8094/sphinx_sitemap-2.6.0.tar.gz", hash = "sha256:5e0c66b9f2e371ede80c659866a9eaad337d46ab02802f9c7e5f7bc5893c28d2", size = 6042 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/d4/dffb4da380be24fd390d284634735d6fba560980014050e52569c04d215b/sphinx_sitemap-2.6.0-py3-none-any.whl", hash = "sha256:7478e417d141f99c9af27ccd635f44c03a471a08b20e778a0f9daef7ace1d30b", size = 5632 }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, +] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, +] + +[[package]] +name = "sphinxcontrib-napoleon" +version = "0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pockets" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fa/eb/ad89500f4cee83187596e07f43ad561f293e8e6e96996005c3319653b89f/sphinxcontrib-napoleon-0.7.tar.gz", hash = "sha256:407382beed396e9f2d7f3043fad6afda95719204a1e1a231ac865f40abcbfcf8", size = 21232 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/f2/6b7627dfe7b4e418e295e254bb15c3a6455f11f8c0ad0d43113f678049c3/sphinxcontrib_napoleon-0.7-py2.py3-none-any.whl", hash = "sha256:711e41a3974bdf110a484aec4c1a556799eb0b3f3b897521a018ad7e2db13fef", size = 17151 }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, +] + +[[package]] +name = "sphinxext-opengraph" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/5b/4302fe33c88dbfb572e2c1cad26735164c23f16fb8dba94ddb1867d0ef06/sphinxext-opengraph-0.9.1.tar.gz", hash = "sha256:dd2868a1e7c9497977fbbf44cc0844a42af39ca65fe1bb0272518af225d06fc5", size = 1034511 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/0a/970b80b4fa1feeb6deb6f2e22d4cb14e388b27b315a1afdb9db930ff91a4/sphinxext_opengraph-0.9.1-py3-none-any.whl", hash = "sha256:b3b230cc6a5b5189139df937f0d9c7b23c7c204493b22646273687969dcb760e", size = 1005241 }, +] + [[package]] name = "srsly" version = "2.5.1" @@ -1869,6 +2225,7 @@ wheels = [ name = "vechord" source = { editable = "." } dependencies = [ + { name = "defspec" }, { name = "falcon" }, { name = "httpx" }, { name = "msgspec" }, @@ -1903,9 +2260,21 @@ dev = [ { name = "pytest" }, { name = "ruff" }, ] +doc = [ + { name = "furo" }, + { name = "myst-parser" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "sphinx-autodoc-typehints", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx-autodoc-typehints", version = "3.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "sphinx-sitemap" }, + { name = "sphinxcontrib-napoleon" }, + { name = "sphinxext-opengraph" }, +] [package.metadata] requires-dist = [ + { name = "defspec", specifier = ">=0.3.0" }, { name = "en-core-web-sm", marker = "extra == 'spacy'", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }, { name = "falcon", specifier = ">=4.0.2" }, { name = "google-generativeai", marker = "extra == 'gemini'", specifier = ">=0.8.4" }, @@ -1931,6 +2300,15 @@ dev = [ { name = "pytest", specifier = ">=8.3.5" }, { name = "ruff", specifier = ">=0.9.1" }, ] +doc = [ + { name = "furo", specifier = ">=2024.8.6" }, + { name = "myst-parser", specifier = ">=4.0.1" }, + { name = "sphinx", specifier = ">=8.1.3" }, + { name = "sphinx-autodoc-typehints", specifier = ">=3.0.1" }, + { name = "sphinx-sitemap", specifier = ">=2.6.0" }, + { name = "sphinxcontrib-napoleon", specifier = ">=0.7" }, + { name = "sphinxext-opengraph", specifier = ">=0.9.1" }, +] [[package]] name = "wasabi" diff --git a/vechord/__init__.py b/vechord/__init__.py index d50a102..f423fb5 100644 --- a/vechord/__init__.py +++ b/vechord/__init__.py @@ -11,6 +11,7 @@ from vechord.load import LocalLoader from vechord.model import Document from vechord.registry import VechordRegistry +from vechord.service import create_web_app from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector __all__ = [ @@ -34,4 +35,5 @@ "Vector", "VectorChordClient", "WordLlamaChunker", + "create_web_app", ] diff --git a/vechord/augment.py b/vechord/augment.py index 9339921..eb78ad6 100644 --- a/vechord/augment.py +++ b/vechord/augment.py @@ -29,12 +29,13 @@ def summarize_doc(self) -> str: class GeminiAugmenter(BaseAugmenter): - # Context caching is only available for stable models with fixed versions - def __init__(self, model: str = "models/gemini-1.5-flash-001", ttl_sec: int = 600): - """Gemini Augmenter with cache. + """Gemini Augmenter. + + Context caching is only available for stable models with fixed versions. + Minimal cache token is 32768. + """ - Minimal cache token is 32768. - """ + def __init__(self, model: str = "models/gemini-1.5-flash-001", ttl_sec: int = 600): key = os.environ.get("GEMINI_API_KEY") if not key: raise ValueError("env GEMINI_API_KEY not set") @@ -47,6 +48,7 @@ def name(self) -> str: return f"gemini_augment_{self.model_name}" def reset(self, doc: str): + """Reset the document.""" import google.generativeai as genai self.client = genai.GenerativeModel(model_name=self.model_name) @@ -85,6 +87,7 @@ def augment(self, chunks: list[str], prompt: str) -> list[str]: return res def augment_context(self, chunks: list[str]) -> list[str]: + """Generate the contextual chunks.""" prompt = ( "Here is the chunk we want to situate within the whole document " "{chunk}" @@ -95,6 +98,7 @@ def augment_context(self, chunks: list[str]) -> list[str]: return self.augment(chunks, prompt) def augment_query(self, chunks: list[str]) -> list[str]: + """Generate the queries for chunks.""" prompt = ( "Here is the chunk we want to ask questions about " "{chunk}" @@ -105,6 +109,7 @@ def augment_query(self, chunks: list[str]) -> list[str]: return self.augment(chunks, prompt) def summarize_doc(self) -> str: + """Summarize the document.""" prompt = ( "Summarize the provided document concisely while preserving its key " "ideas, main arguments, and essential details. Ensure clarity and " diff --git a/vechord/chunk.py b/vechord/chunk.py index 280828c..a35658a 100644 --- a/vechord/chunk.py +++ b/vechord/chunk.py @@ -15,6 +15,8 @@ def name(self) -> str: class RegexChunker(BaseChunker): + """A simple regex-based chunker.""" + def __init__( self, size: int = 1536, @@ -79,8 +81,12 @@ def segment(self, text: str) -> list[str]: class SpacyChunker(BaseChunker): + """A semantic sentence Chunker based on SpaCy. + + This guarantees the generated chunks are sentences. + """ + def __init__(self, model: str = "en_core_web_sm"): - """A semantic sentence Chunker based on SpaCy.""" import spacy self.model = model @@ -94,8 +100,12 @@ def segment(self, text: str) -> list[str]: class WordLlamaChunker(BaseChunker): + """A semantic chunker based on WordLlama. + + This doesn't guarantee the generated chunks are sentences. + """ + def __init__(self, size: int = 1536): - """A semantic chunker based on WordLlama.""" from wordllama import WordLlama self.model = WordLlama.load() @@ -109,6 +119,8 @@ def segment(self, text: str) -> list[str]: class GeminiChunker(BaseChunker): + """A semantic chunker based on Gemini.""" + def __init__(self, model: str = "gemini-2.0-flash", size: int = 1536): key = os.environ.get("GEMINI_API_KEY") if not key: diff --git a/vechord/client.py b/vechord/client.py index f18d746..a0cb405 100644 --- a/vechord/client.py +++ b/vechord/client.py @@ -1,6 +1,5 @@ import contextlib import contextvars -import hashlib from typing import Any, Optional, Sequence import numpy as np @@ -8,11 +7,6 @@ from pgvector.psycopg import register_vector from psycopg import sql - -def hash_table_suffix(name: str) -> str: - return hashlib.shake_256(name.encode()).hexdigest(4) - - active_cursor = contextvars.ContextVar("active_cursor", default=None) select_transaction_buffer = contextvars.ContextVar( "select_transaction_buffer", default=False diff --git a/vechord/embedding.py b/vechord/embedding.py index aa8bdcb..1603548 100644 --- a/vechord/embedding.py +++ b/vechord/embedding.py @@ -37,6 +37,8 @@ def vec_type(self) -> VecType: class SpacyDenseEmbedding(BaseEmbedding): + """Spacy Dense Embedding.""" + def __init__(self, model: str = "en_core_web_sm", dim: int = 96): import spacy @@ -59,6 +61,8 @@ def vectorize_chunk(self, text: str) -> np.ndarray: class GeminiDenseEmbedding(BaseEmbedding): + """Gemini Dense Embedding.""" + def __init__(self, model: str = "models/text-embedding-004", dim: int = 768): key = os.environ.get("GEMINI_API_KEY") if not key: @@ -88,6 +92,8 @@ def vectorize_chunk(self, text: str) -> np.ndarray: class OpenAIDenseEmbedding(BaseEmbedding): + """OpenAI Dense Embedding.""" + def __init__(self, model: str = "text-embedding-3-large", dim: int = 3072): key = os.environ.get("OPENAI_API_KEY") if not key: diff --git a/vechord/evaluate.py b/vechord/evaluate.py index 7add448..f99ef24 100644 --- a/vechord/evaluate.py +++ b/vechord/evaluate.py @@ -15,6 +15,7 @@ def evaluate( retrieves: list[list[RetrievedChunk]], measures: Sequence[str] = ("map", "ndcg", "recall"), ): + """Evaluate the retrieval results for multiple queries.""" num = len(chunk_ids) qids = list(range(num)) query_relevance = { @@ -43,6 +44,7 @@ def evaluate_one( resp_ids: list[int], measures: Sequence[str] = ("map", "ndcg", "recall"), ): + """Evaluate the retrieval results for a single query.""" query_relevance = {"0": {str(truth_id): 1}} evaluator = pytrec_eval.RelevanceEvaluator( query_relevance=query_relevance, measures=measures @@ -61,6 +63,8 @@ def produce_query(self, doc: str, chunk: str) -> str: class GeminiEvaluator(BaseEvaluator): + """Evaluator using Gemini model to generate search queries.""" + def __init__(self, model: str = "gemini-2.0-flash"): key = os.environ.get("GEMINI_API_KEY") if not key: diff --git a/vechord/extract.py b/vechord/extract.py index a4ca5c7..b7b0d97 100644 --- a/vechord/extract.py +++ b/vechord/extract.py @@ -31,6 +31,8 @@ def extract(self, doc: Document) -> str: class SimpleExtractor(BaseExtractor): + """Local extractor for text files.""" + def __init__(self): pass @@ -38,6 +40,7 @@ def name(self) -> str: return "basic_extractor" def extract_pdf(self, doc: Document) -> str: + """Extract text from PDF using pypdfium2.""" pdf = pdfium.PdfDocument(doc.data) text = [] for page in pdf: @@ -47,6 +50,8 @@ def extract_pdf(self, doc: Document) -> str: class GeminiExtractor(BaseExtractor): + """Extract text with Gemini model.""" + def __init__(self, model: str = "gemini-2.0-flash"): key = os.environ.get("GEMINI_API_KEY") if not key: @@ -64,6 +69,7 @@ def name(self) -> str: return f"gemini_extractor_{self.model.model_name}" def extract_pdf(self, doc: Document) -> str: + """Extract text from PDF page by page.""" pdf = pdfium.PdfDocument(doc.data) text = [] for page in pdf: diff --git a/vechord/load.py b/vechord/load.py index 04506d3..26066d9 100644 --- a/vechord/load.py +++ b/vechord/load.py @@ -19,6 +19,8 @@ def name(self) -> str: class LocalLoader(BaseLoader): + """Load documents from local file system.""" + def __init__(self, path: str, include: list[str] | None = None): self.path = Path(path) self.include = set(ext.lower() for ext in include or [".txt"]) diff --git a/vechord/registry.py b/vechord/registry.py index 47ed122..0fdaaad 100644 --- a/vechord/registry.py +++ b/vechord/registry.py @@ -8,8 +8,6 @@ get_type_hints, ) -import msgspec - from vechord.client import ( VectorChordClient, limit_to_transaction_buffer, @@ -29,6 +27,14 @@ def is_list_of_type(typ) -> bool: class VechordRegistry: + """Create a registry for the given namespace and PostgreSQL URL. + + Args: + namespace: the namespace for this registry, will be the prefix for all the + tables registered. + url: the PostgreSQL URL to connect to. + """ + def __init__(self, namespace: str, url: str): self.ns = namespace self.client = VectorChordClient(namespace, url) @@ -36,6 +42,13 @@ def __init__(self, namespace: str, url: str): self.pipeline: list[Callable] = [] def register(self, tables: list[type[Table]]): + """Register the given tables to the registry. + + This will create the tables in the database if not exists. + + Args: + tables: a list of Table classes to be registered. + """ for table in tables: if not issubclass(table, Table): raise ValueError(f"unsupported class {table}") @@ -52,27 +65,42 @@ def register(self, tables: list[type[Table]]): self.tables.append(table) def set_pipeline(self, pipeline: list[Callable]): + """Set the pipeline to be executed in the `run` method.""" self.pipeline = pipeline def run(self, *args, **kwargs): - """Execute the pipeline in a transactional manner.""" + """Execute the pipeline in a transactional manner. + + All the `args` and `kwargs` will be passed to the first function in the + pipeline. The pipeline will run in *one* transaction, and all the `inject` + can only see the data inserted in this transaction (to guarantee only the + new inserted data will be processed in this pipeline). + + This will also return the final result of the last function in the pipeline. + """ if not self.pipeline: raise RuntimeError("pipeline is not set") with self.client.transaction(), limit_to_transaction_buffer(): # only the 1st one can accept input (could be empty) self.pipeline[0](*args, **kwargs) - for func in self.pipeline[1:]: + for func in self.pipeline[1:-1]: func() + return self.pipeline[-1]() - def select_by( - self, cls: type[Table], obj: Table, fields: Optional[list[str]] = None - ): - if not isinstance(obj, cls): - raise ValueError(f"expected {cls}, got {type(obj)}") - if not issubclass(cls, Table): - raise ValueError(f"unsupported class {cls}") + def select_by(self, obj: Table, fields: Optional[list[str]] = None) -> list[Table]: + """Retrieve the requested fields for the given object stored in the DB. - cls_fields = cls.fields() + Args: + obj: the object to be retrieved, this should be a `Table.partial_init()` + instance, which means given values will be used for filtering. + fields: the fields to be retrieved, if not set, all the fields will be + retrieved. + """ + if not isinstance(obj, Table): + raise ValueError(f"unsupported class {type(obj)}") + + cls_fields = obj.fields() + cls = obj.__class__ if fields is not None: if any(f not in cls_fields for f in fields): raise ValueError(f"unknown fields {fields}") @@ -81,9 +109,8 @@ def select_by( kvs = obj.todict() res = self.client.select(cls.name(), fields, kvs) - missing = dict(zip(cls_fields, [msgspec.UNSET] * len(cls_fields), strict=False)) return [ - cls(**(missing | {k: v for k, v in zip(fields, r, strict=False)})) + cls.partial_init(**{k: v for k, v in zip(fields, r, strict=False)}) for r in res ] @@ -93,7 +120,17 @@ def search( vec, topk: int = 10, return_vector: bool = False, - ): + ) -> list[Table]: + """Search the vector for the given `Table` class. + + Args: + cls: the `Table` class to be searched. + vec: the vector to be searched. + topk: the number of results to be returned. + return_vector: whether to return the vector column in the result. + This is usually much larger than any other fields, and it's + not necessary to return it if not needed. + """ if not issubclass(cls, Table): raise ValueError(f"unsupported class {cls}") fields = list(cls.fields()) @@ -114,18 +151,23 @@ def search( for r in res ] - def remove_by(self, cls: type[Table], obj): - if not isinstance(obj, cls): - raise ValueError(f"expected {cls}, got {type(obj)}") - if not issubclass(cls, Table): - raise ValueError(f"unsupported class {cls}") + def remove_by(self, obj: Table): + """Remove the given object from the DB. + + Args: + obj: the object to be removed, this should be a `Table.partial_init()` + instance, which means given values will be used for filtering. + """ + if not isinstance(obj, Table): + raise ValueError(f"unsupported class {type(obj)}") kvs = obj.todict() if not kvs: raise ValueError("empty object") - self.client.delete(cls.name(), kvs) + self.client.delete(obj.__class__.name(), kvs) - def insert(self, obj): + def insert(self, obj: Table): + """Insert the given object to the DB.""" if not isinstance(obj, Table): raise ValueError(f"unsupported class {type(obj)}") self.client.insert(obj.name(), obj.todict()) @@ -133,6 +175,14 @@ def insert(self, obj): def inject( self, input: Optional[type[Table]] = None, output: Optional[type[Table]] = None ): + """Decorator to inject the data for the function arguments & return value. + + Args: + input: the input table to be retrieved from the DB. If not set, the function + will require the input to be passed in the function call. + output: the output table to store the return value. If not set, the return + value will be return to the caller in a `list`. + """ if input is None and output is None: return lambda func: func if input is not None and not issubclass(input, Table): @@ -186,8 +236,13 @@ def wrapper(*args, **kwargs): return decorator def clear_storage(self, drop_table: bool = False): + """Clear the storage of the registry. + + Args: + drop_table: whether to drop the table after removing all the data. + """ for table in self.tables: if drop_table: self.client.drop(table.name()) else: - self.remove_by(table, table.partial_init()) + self.remove_by(table.partial_init()) diff --git a/vechord/service.py b/vechord/service.py index c664f5c..3677e10 100644 --- a/vechord/service.py +++ b/vechord/service.py @@ -2,6 +2,7 @@ import falcon import msgspec +from defspec import OpenAPI, RenderTemplate from falcon import App, Request, Response from vechord.log import logger @@ -50,7 +51,7 @@ def __init__(self, table: type[Table], registry: VechordRegistry): def on_get(self, req: Request, resp: Response): table = self.table_cls.partial_init(**req.params) - rows = self.registry.select_by(cls=self.table_cls, obj=table) + rows = self.registry.select_by(obj=table) resp.data = msgspec.json.encode(rows) def on_post(self, req: Request, resp: Response): @@ -63,7 +64,7 @@ def on_post(self, req: Request, resp: Response): def on_delete(self, req: Request, resp: Response): table = self.table_cls.partial_init(**req.params) - self.registry.remove_by(cls=self.table_cls, obj=table) + self.registry.remove_by(obj=table) class PipelineResource: @@ -81,14 +82,68 @@ def on_post(self, req: Request, resp: Response): self.registry.run(**json) +class OpenAPIResource: + def __init__(self, tables: list[Table]) -> None: + self.openapi = OpenAPI() + self.openapi.register_route("/", "get", summary="health check") + self.openapi.register_route("/api/pipeline", "post", summary="run the pipeline") + for table in tables: + path = f"/api/table/{table.name()}" + self.openapi.register_route( + path, + "get", + "get the table with partial attributes", + query_type=table, + ) + self.openapi.register_route( + path, + "delete", + "delete table records according to partial attributes", + query_type=table, + ) + self.openapi.register_route( + path, + "post", + "insert a new record to the table", + request_type=table, + request_content_type="json", + ) + self.spec = self.openapi.to_json() + + def on_get(self, req: Request, resp: Response): + resp.content_type = falcon.MEDIA_JSON + resp.data = self.spec + + +class OpenAPIRender: + def __init__(self, spec_url: str, template: RenderTemplate) -> None: + self.template = template.value.format(spec_url=spec_url) + + def on_get(self, req: Request, resp: Response): + resp.content_type = falcon.MEDIA_HTML + resp.text = self.template + + def create_web_app(registry: VechordRegistry) -> App: + """Create a `Falcon` WSGI application for the given registry. + + This includes the: + - health check + - table GET/POST/DELETE + - pipeline POST + - OpenAPI spec and Swagger UI + """ app = App() - app.add_route("/health", HealthCheck()) + app.add_route("/", HealthCheck()) for table in registry.tables: app.add_route( f"/api/table/{table.name()}", TableResource(table=table, registry=registry), ) app.add_route("/api/pipeline", PipelineResource(registry)) + app.add_route("/openapi/spec.json", OpenAPIResource(registry.tables)) + app.add_route( + "/openapi/swagger", OpenAPIRender("/openapi/spec.json", RenderTemplate.SWAGGER) + ) app.add_error_handler(Exception, uncaught_exception_handler) return app diff --git a/vechord/spec.py b/vechord/spec.py index a347205..fa564aa 100644 --- a/vechord/spec.py +++ b/vechord/spec.py @@ -39,6 +39,8 @@ def __getitem__(self, dim: int): class Vector(Generic[TypeVar("T")], metaclass=VectorMeta): + """Vector type with fixed dimension.""" + def __init__(self, *args, **kwargs): raise NotImplementedError("Use Vector[dim] to create a vector type") @@ -77,6 +79,11 @@ def __getitem__(self, ref): class ForeignKey(Generic[TypeVar("K")], metaclass=ForeignKeyMeta): + """Reference to another table's attribute as a foreign key. + + This should be used in the `Annotated[]` type hint. + """ + def __init__(self, *args, **kwargs): raise NotImplementedError("Use ForeignKey[ref] to create a foreign key type") @@ -106,6 +113,8 @@ def schema(cls): class PrimaryKeyAutoIncrease(int): + """Primary key with auto-increment ID type.""" + @classmethod def schema(cls) -> str: return "BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY" @@ -172,13 +181,17 @@ def partial_init(cls, **kwargs): class Table(Storage): + """Base class for table definition.""" + @classmethod def table_schema(cls) -> Sequence[tuple[str, str]]: + """Generate the table schema from the class attributes' type hints.""" hints = get_type_hints(cls, include_extras=True) return ((name, type_to_psql(typ)) for name, typ in hints.items()) @classmethod def vector_column(cls) -> Optional[str]: + """Get the vector column name.""" for name, typ in get_type_hints(cls, include_extras=True).items(): if issubclass(typ.__class__, VectorMeta): return name @@ -186,6 +199,7 @@ def vector_column(cls) -> Optional[str]: @classmethod def primary_key(cls) -> Optional[str]: + """Get the primary key column name.""" for name, typ in get_type_hints(cls, include_extras=True).items(): typ_cls = ( get_first_type_from_optional(typ) if is_optional_type(typ) else typ @@ -195,6 +209,10 @@ def primary_key(cls) -> Optional[str]: return None def todict(self) -> dict[str, Any]: + """Convert the table instance to a dictionary. + + This will ignore the default values. + """ defaults = getattr(self, "__struct_defaults__", None) fields = self.fields() if not defaults: From b69dc109803c0d6d735a8b5b98f8abbd46919660 Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 17 Mar 2025 21:18:33 +0800 Subject: [PATCH 2/7] Update .github/workflows/check.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/check.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 204a3eb..ea9ad76 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -25,6 +25,9 @@ jobs: run: make lint - name: Test run: | - docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres tensorchord/vchord-postgres:pg17-v0.2.1 + docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 tensorchord/vchord-postgres:pg17-v0.2.1 + until [ "$(docker inspect --format='{{.State.Health.Status}}' vdb)" == "healthy" ]; do + sleep 1 + done make test docker stop vdb From b5ffbfdb0d0a231831503db62f72e0aed27b97a0 Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 17 Mar 2025 21:19:17 +0800 Subject: [PATCH 3/7] use 127.0.0.1 Signed-off-by: Keming --- pyproject.toml | 1 - tests/test_table.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6a7738d..bdcfa98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ dependencies = [ ] [project.scripts] -vechord = "vechord.main:main" [project.optional-dependencies] gemini = [ diff --git a/tests/test_table.py b/tests/test_table.py index d119dff..2381013 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -8,7 +8,7 @@ from vechord.registry import VechordRegistry from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector -TEST_POSTGRES = "postgresql://postgres:postgres@172.17.0.1:5432/" +TEST_POSTGRES = "postgresql://postgres:postgres@127.0.0.1:5432/" DenseVector = Vector[128] From 15afab6ab844f7802a2238b8d16dd717db2f7bf7 Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 17 Mar 2025 21:21:25 +0800 Subject: [PATCH 4/7] reduce health check interval for ci Signed-off-by: Keming --- .github/workflows/check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index ea9ad76..e189af2 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -25,7 +25,7 @@ jobs: run: make lint - name: Test run: | - docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 tensorchord/vchord-postgres:pg17-v0.2.1 + docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 tensorchord/vchord-postgres:pg17-v0.2.1 until [ "$(docker inspect --format='{{.State.Health.Status}}' vdb)" == "healthy" ]; do sleep 1 done From 47de0e8550c12bd81b29ddb7fe4a8fb6fef5ef3e Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 17 Mar 2025 21:59:38 +0800 Subject: [PATCH 5/7] show logs in ci test Signed-off-by: Keming --- .github/workflows/check.yml | 5 ++--- Makefile | 8 ++++---- tests/test_table.py | 4 ++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index e189af2..2c28bda 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -24,10 +24,9 @@ jobs: - name: Lint run: make lint - name: Test + env: + PYTEST_ADDOPTS: -s run: | docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 tensorchord/vchord-postgres:pg17-v0.2.1 - until [ "$(docker inspect --format='{{.State.Health.Status}}' vdb)" == "healthy" ]; do - sleep 1 - done make test docker stop vdb diff --git a/Makefile b/Makefile index 982a495..623f7eb 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ PY_SOURCE=. lint: - @uv run ruff check ${PY_SOURCE} + @uv run -- ruff check ${PY_SOURCE} typecheck: @uv run -- mypy --non-interactive --install-types ${PY_SOURCE} format: - @uv run ruff check --fix ${PY_SOURCE} - @uv run ruff format ${PY_SOURCE} + @uv run -- ruff check --fix ${PY_SOURCE} + @uv run -- ruff format ${PY_SOURCE} clean: @-rm -rf dist build */__pycache__ *.egg-info vechord/__version__.py @@ -20,7 +20,7 @@ publish: build @uv publish test: - @uv run pytest -v tests + @uv run -- pytest -v tests sync: @uv sync --all-extras --all-groups diff --git a/tests/test_table.py b/tests/test_table.py index 2381013..ecfbbad 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from vechord.log import logger from vechord.registry import VechordRegistry from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector @@ -36,6 +37,7 @@ def registry(): registry = VechordRegistry("test", TEST_POSTGRES) registry.register([Document, Chunk]) yield registry + logger.debug("clearing storage...") registry.clear_storage(drop_table=True) registry.pipeline.clear() @@ -75,6 +77,8 @@ def test_foreign_key(registry): for record in docs + chunks: registry.insert(record) + assert len(registry.select_by(Document.partial_init())) == len(docs) + # remove the doc should also remove the related chunks registry.remove_by(Document.partial_init(uid=1)) assert len(registry.select_by(Document.partial_init())) == 1 assert len(registry.select_by(Chunk.partial_init())) == 0 From 30dea6b0cad18698dac46b6c358baa009e3fc1f1 Mon Sep 17 00:00:00 2001 From: Keming Date: Tue, 18 Mar 2025 10:45:51 +0800 Subject: [PATCH 6/7] fix remove table without condition Signed-off-by: Keming --- tests/test_table.py | 13 +++++++++---- vechord/client.py | 6 ++---- vechord/registry.py | 2 -- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index ecfbbad..4e66e5d 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,4 +1,5 @@ from datetime import datetime +from os import environ from typing import Annotated import msgspec @@ -9,7 +10,11 @@ from vechord.registry import VechordRegistry from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector -TEST_POSTGRES = "postgresql://postgres:postgres@127.0.0.1:5432/" +URL = "127.0.0.1" +# for local container development environment, use the host machine's IP +if environ.get("REMOTE_CONTAINERS", "") == "true" or environ.get("USER", "") == "envd": + URL = "172.17.0.1" +TEST_POSTGRES = f"postgresql://postgres:postgres@{URL}:5432/" DenseVector = Vector[128] @@ -32,9 +37,9 @@ class Chunk(Table, kw_only=True): vector: DenseVector -@pytest.fixture -def registry(): - registry = VechordRegistry("test", TEST_POSTGRES) +@pytest.fixture(name="registry") +def fixture_registry(request): + registry = VechordRegistry(request.node.name, TEST_POSTGRES) registry.register([Document, Chunk]) yield registry logger.debug("clearing storage...") diff --git a/vechord/client.py b/vechord/client.py index a0cb405..8446734 100644 --- a/vechord/client.py +++ b/vechord/client.py @@ -143,10 +143,8 @@ def delete(self, name: str, kvs: dict): ) else: self.conn.execute( - sql.SQL( - "DELETE FROM {table};".format( - table=sql.Identifier(f"{self.ns}_{name}") - ) + sql.SQL("DELETE FROM {table};").format( + table=sql.Identifier(f"{self.ns}_{name}") ) ) diff --git a/vechord/registry.py b/vechord/registry.py index 0fdaaad..370fb1d 100644 --- a/vechord/registry.py +++ b/vechord/registry.py @@ -162,8 +162,6 @@ def remove_by(self, obj: Table): raise ValueError(f"unsupported class {type(obj)}") kvs = obj.todict() - if not kvs: - raise ValueError("empty object") self.client.delete(obj.__class__.name(), kvs) def insert(self, obj: Table): From 86adc6341c89a03a22afc8174dda537af585895b Mon Sep 17 00:00:00 2001 From: Keming Date: Tue, 18 Mar 2025 10:57:48 +0800 Subject: [PATCH 7/7] check if the container is health before running test Signed-off-by: Keming --- .github/workflows/check.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 2c28bda..43353d5 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -27,6 +27,17 @@ jobs: env: PYTEST_ADDOPTS: -s run: | - docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 tensorchord/vchord-postgres:pg17-v0.2.1 + docker run --rm -d -p 5432:5432 --name vdb -e POSTGRES_PASSWORD=postgres --health-cmd="pg_isready -U postgres" --health-interval=1s --health-timeout=1s --health-retries=5 ghcr.io/tensorchord/vchord_bm25-postgres:pg17-v0.1.1 + + # Wait for the container to be healthy + for i in {1..10}; do + if [ "$(docker inspect --format='{{.State.Health.Status}}' vdb)" == "healthy" ]; then + echo "Database container is healthy." + break + fi + echo "Waiting for database container to become healthy... ($i/10)" + sleep 1 + done + make test docker stop vdb