tensorchord
diff --git a/‎README.md‎
Lines changed: 3 additions & 1 deletion b/‎README.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/source/api.md‎
Lines changed: 8 additions & 0 deletions b/‎docs/source/api.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/source/example.md‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/example.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/beir.py‎
Lines changed: 6 additions & 3 deletions b/‎examples/beir.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎examples/contextual.py‎
Lines changed: 12 additions & 7 deletions b/‎examples/contextual.py‎
Lines changed: 12 additions & 7 deletions
diff --git a/‎examples/essay.py‎
Lines changed: 7 additions & 4 deletions b/‎examples/essay.py‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎examples/hybrid.py‎
Lines changed: 95 additions & 0 deletions b/‎examples/hybrid.py‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎examples/web.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/web.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎tests/test_spec.py‎
Lines changed: 14 additions & 0 deletions b/‎tests/test_spec.py‎
Lines changed: 14 additions & 0 deletions
@@ -1,3 +1,5 @@
+# Vechord
+
 Python RAG framework built on top of PostgreSQL and [VectorChord](https://github.com/tensorchord/VectorChord/).
 
 ## Installation
@@ -16,7 +18,7 @@ pip install vechord
 ## Development
 
 ```bash
-docker run --rm -d --name vechord -e POSTGRES_PASSWORD=postgres -p 5432:5432 tensorchord/vchord-postgres:pg17-v0.2.1
+docker run --rm -d --name vdb -e POSTGRES_PASSWORD=postgres -p 5432:5432 ghcr.io/tensorchord/vchord_bm25-postgres:pg17-v0.1.1
 envd up
 # inside the envd env, sync all the dependencies
 make sync
 
@@ -62,6 +62,14 @@
    :show-inheritance:
 ```
 
+## Rerank
+
+```{eval-rst}
+.. automodule:: vechord.rerank
+   :members:
+   :show-inheritance:
+```
+
 ## Service
 
 ```{eval-rst}
 
@@ -23,3 +23,9 @@
 ```{include} ../../examples/essay.py
 :code: python
 ```
+
+## Hybrid search with rerank
+
+```{include} ../../examples/hybrid.py
+:code: python
+```
@@ -90,7 +90,7 @@ def load_corpus(dataset: str, output: Path) -> Iterator[Corpus]:
                 uid=item["_id"],
                 text=text,
                 title=title,
-                vector=vector,
+                vector=DenseVector(vector),
             )
 
 
@@ -115,13 +115,16 @@ def load_query(dataset: str, output: Path) -> Iterator[Query]:
                 continue
             text = item.get("text", "")
             yield Query(
-                uid=uid, cid=table[uid], text=text, vector=emb.vectorize_query(text)
+                uid=uid,
+                cid=table[uid],
+                text=text,
+                vector=DenseVector(emb.vectorize_query(text)),
             )
 
 
 @vr.inject(input=Query)
 def evaluate(cid: str, vector: DenseVector) -> Evaluation:
-    docs: list[Corpus] = vr.search(Corpus, vector, topk=TOP_K)
+    docs: list[Corpus] = vr.search_by_vector(Corpus, vector, topk=TOP_K)
     score = BaseEvaluator.evaluate_one(cid, [doc.uid for doc in docs])
     return Evaluation(
         map=score.get("map"),
 
@@ -67,7 +67,12 @@ def split_document(uid: int, text: str) -> list[Chunk]:
     chunker = RegexChunker(overlap=0)
     chunks = chunker.segment(text)
     return [
-        Chunk(doc_uid=uid, seq_id=i, text=chunk, vector=emb.vectorize_chunk(chunk))
+        Chunk(
+            doc_uid=uid,
+            seq_id=i,
+            text=chunk,
+            vector=DenseVector(emb.vectorize_chunk(chunk)),
+        )
         for i, chunk in enumerate(chunks)
     ]
 
@@ -89,7 +94,9 @@ def context_embedding(uid: int, text: str) -> list[ContextChunk]:
     ]
     return [
         ContextChunk(
-            chunk_uid=chunk_uid, text=augmented, vector=emb.vectorize_chunk(augmented)
+            chunk_uid=chunk_uid,
+            text=augmented,
+            vector=DenseVector(emb.vectorize_chunk(augmented)),
         )
         for (chunk_uid, augmented) in zip(
             [c.uid for c in chunks], context_chunks, strict=False
@@ -99,22 +106,20 @@ def context_embedding(uid: int, text: str) -> list[ContextChunk]:
 
 def query_chunk(query: str) -> list[Chunk]:
     vector = emb.vectorize_query(query)
-    res: list[Chunk] = vr.search(
+    res: list[Chunk] = vr.search_by_vector(
         Chunk,
         vector,
         topk=5,
-        return_vector=False,
     )
     return res
 
 
 def query_context_chunk(query: str) -> list[ContextChunk]:
     vector = emb.vectorize_query(query)
-    res: list[ContextChunk] = vr.search(
+    res: list[ContextChunk] = vr.search_by_vector(
         ContextChunk,
         vector,
         topk=5,
-        return_vector=False,
     )
     return res
 
@@ -125,7 +130,7 @@ def evaluate(uid: int, doc_uid: int, text: str):
     doc: Document = vr.select_by(Document.partial_init(uid=doc_uid))[0]
     query = evaluator.produce_query(doc.text, text)
     retrieved = query_chunk(query)
-    score = evaluator.evaluate_one(uid, [r.uid for r in retrieved])
+    score = evaluator.evaluate_one(str(uid), [str(r.uid) for r in retrieved])
     return score
 
 
 
@@ -75,19 +75,22 @@ class Evaluation:
 def segment_essay() -> list[Chunk]:
     chunker = RegexChunker()
     chunks = chunker.segment(doc)
-    return [Chunk(text=chunk, vector=emb.vectorize_chunk(chunk)) for chunk in chunks]
+    return [
+        Chunk(text=chunk, vector=DenseVector(emb.vectorize_chunk(chunk)))
+        for chunk in chunks
+    ]
 
 
 @vr.inject(input=Chunk, output=Query)
 def create_query(uid: int, text: str) -> Query:
     query = evaluator.produce_query(doc, text)
-    return Query(cid=uid, text=query, vector=emb.vectorize_chunk(query))
+    return Query(cid=uid, text=query, vector=DenseVector(emb.vectorize_chunk(query)))
 
 
 @vr.inject(input=Query)
 def evaluate(cid: int, vector: DenseVector) -> Evaluation:
-    chunks: list[Chunk] = vr.search(Chunk, vector, topk=TOP_K)
-    score = evaluator.evaluate_one(cid, [chunk.uid for chunk in chunks])
+    chunks: list[Chunk] = vr.search_by_vector(Chunk, vector, topk=TOP_K)
+    score = evaluator.evaluate_one(str(cid), [str(chunk.uid) for chunk in chunks])
     return Evaluation(
         map=score["map"], ndcg=score["ndcg"], recall=score[f"recall_{TOP_K}"]
     )
 
@@ -0,0 +1,95 @@
+from html.parser import HTMLParser
+from typing import Annotated
+
+import httpx
+
+from vechord.chunk import RegexChunker
+from vechord.embedding import SpacyDenseEmbedding
+from vechord.registry import VechordRegistry
+from vechord.rerank import CohereReranker
+from vechord.spec import ForeignKey, Keyword, PrimaryKeyAutoIncrease, Table, Vector
+
+# URL template; the `{}` placeholder is filled with the essay's page name.
+URL = "https://paulgraham.com/{}.html"
+# Vector type parameterized with a dimension of 96.
+# NOTE(review): 96 presumably matches the SpacyDenseEmbedding output size — confirm.
+DenseVector = Vector[96]
+# Shared helpers used by the pipeline functions below.
+emb = SpacyDenseEmbedding()
+chunker = RegexChunker(size=1024, overlap=0)
+reranker = CohereReranker()
+
+
+class EssayParser(HTMLParser):
+    def __init__(self, *, convert_charrefs: bool = ...) -> None:
+        super().__init__(convert_charrefs=convert_charrefs)
+        self.content = []
+        self.skip = False
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        if tag in ("script", "style"):
+            self.skip = True
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in ("script", "style"):
+            self.skip = False
+
+    def handle_data(self, data: str) -> None:
+        if not self.skip:
+            self.content.append(data.strip())
+
+
+class Document(Table, kw_only=True):
+    """Table row holding one document: its title and full text."""
+
+    # Primary key; defaults to None.
+    # NOTE(review): presumably assigned by the database on insert — confirm.
+    uid: PrimaryKeyAutoIncrease | None = None
+    title: str = ""
+    text: str
+
+
+class Chunk(Table, kw_only=True):
+    """Table row holding one text chunk of a document, with its search fields."""
+
+    # Primary key; defaults to None.
+    # NOTE(review): presumably assigned by the database on insert — confirm.
+    uid: PrimaryKeyAutoIncrease | None = None
+    # Foreign key referencing the `uid` of the owning Document.
+    doc_id: Annotated[int, ForeignKey[Document.uid]]
+    text: str
+    # Dense embedding of `text`, used by `search_by_vector`.
+    vector: DenseVector
+    # Keyword representation of `text`, used by `search_by_keyword`.
+    keyword: Keyword
+
+
+# Registry for the "hybrid" namespace, connected to a local PostgreSQL instance.
+# NOTE(review): 172.17.0.1 is presumably the Docker bridge address of the
+# database container — confirm for your environment.
+vr = VechordRegistry("hybrid", "postgresql://postgres:postgres@172.17.0.1:5432/")
+vr.register([Document, Chunk])
+
+
+@vr.inject(output=Document)
+def load_document(title: str) -> Document:
+    with httpx.Client() as client:
+        resp = client.get(URL.format(title))
+        if resp.is_error:
+            raise RuntimeError(f"Failed to fetch the document `{title}`")
+    parser = EssayParser()
+    parser.feed(resp.text)
+    return Document(title=title, text="\n".join(t for t in parser.content if t))
+
+
+@vr.inject(input=Document, output=Chunk)
+def chunk_document(uid: int, text: str) -> list[Chunk]:
+    chunks = chunker.segment(text)
+    return [
+        Chunk(
+            doc_id=uid,
+            text=chunk,
+            vector=emb.vectorize_chunk(chunk),
+            keyword=Keyword(chunk),
+        )
+        for chunk in chunks
+    ]
+
+
+def search_and_rerank(query: str, topk: int) -> list[Chunk]:
+    text_retrieves = vr.search_by_keyword(Chunk, query, topk=topk)
+    vec_retrievse = vr.search_by_vector(Chunk, emb.vectorize_query(query), topk=topk)
+    chunks = list(
+        {chunk.uid: chunk for chunk in text_retrieves + vec_retrievse}.values()
+    )
+    indices = reranker.rerank(query, [chunk.text for chunk in chunks])
+    return [chunks[i] for i in indices[:topk]]
+
+
+if __name__ == "__main__":
+    # Fetch the essay whose page name is "smart".
+    load_document("smart")
+    # Called without arguments.
+    # NOTE(review): presumably the `inject` decorator feeds it the stored
+    # Document rows — confirm.
+    chunk_document()
+    # Hybrid search for "smart", keeping the top 3 chunks after reranking.
+    chunks = search_and_rerank("smart", 3)
+    print(chunks)
@@ -74,7 +74,7 @@ def load_document(title: str) -> Document:
 def chunk_document(uid: int, text: str) -> list[Chunk]:
     chunks = chunker.segment(text)
     return [
-        Chunk(doc_id=uid, text=chunk, vector=emb.vectorize_chunk(chunk))
+        Chunk(doc_id=uid, text=chunk, vector=DenseVector(emb.vectorize_chunk(chunk)))
         for chunk in chunks
     ]
 
 
@@ -34,6 +34,9 @@ spacy = [
 wordllama = [
     "wordllama>=0.3.8.post20",
 ]
+cohere = [
+    "cohere>=5.14.0",
+]
 
 [build-system]
 requires = ["pdm-backend"]
 
@@ -2,6 +2,7 @@
 from typing import Annotated
 
 import msgspec
+import numpy as np
 import pytest
 
 from vechord.spec import ForeignKey, PrimaryKeyAutoIncrease, Table, Vector
@@ -51,3 +52,16 @@ def find_schema_by_name(schema, name):
         "REFERENCES {namespace}_document(uid) ON DELETE CASCADE"
         in find_schema_by_name(Chunk.table_schema(), "doc_id")
     )
+
+
+def test_vector_type():
+    Dense = Vector[128]
+
+    # test the dim
+    with pytest.raises(ValueError):
+        Dense([0.1] * 100)
+
+    with pytest.raises(ValueError):
+        Dense(np.random.rand(123))
+
+    assert np.equal(Dense(np.ones(128)), Dense([1.0] * 128)).all()
Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@ def load_document(title: str) -> Document:`
`74`	`74`	`def chunk_document(uid: int, text: str) -> list[Chunk]:`
`75`	`75`	`chunks = chunker.segment(text)`
`76`	`76`	`return [`
`77`		`- Chunk(doc_id=uid, text=chunk, vector=emb.vectorize_chunk(chunk))`
	`77`	`+ Chunk(doc_id=uid, text=chunk, vector=DenseVector(emb.vectorize_chunk(chunk)))`
`78`	`78`	`for chunk in chunks`
`79`	`79`	`]`
`80`	`80`
Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,9 @@ spacy = [`
`34`	`34`	`wordllama = [`
`35`	`35`	`"wordllama>=0.3.8.post20",`
`36`	`36`	`]`
	`37`	`+cohere = [`
	`38`	`+ "cohere>=5.14.0",`
	`39`	`+]`
`37`	`40`
`38`	`41`	`[build-system]`
`39`	`42`	`requires = ["pdm-backend"]`