vechord/examples/hybrid.py at main · tensorchord/vechord · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import httpx

from vechord.chunk import RegexChunker
from vechord.embedding import GeminiDenseEmbedding
from vechord.extract import SimpleExtractor
from vechord.registry import VechordRegistry
from vechord.rerank import CohereReranker
from vechord.spec import DefaultDocument, Keyword, create_chunk_with_dim

# URL template for the source pages; `{}` is filled with the title passed to
# `load_document`.
URL = "https://paulgraham.com/{}.html"
# Chunk record type created for 3072-dimensional vectors.
# NOTE(review): presumably 3072 must match the output size of the embedding
# model used by `GeminiDenseEmbedding` — confirm before changing either.
Chunk = create_chunk_with_dim(3072)
# Embedding client used for both chunk vectors and query vectors.
emb = GeminiDenseEmbedding()
# Text splitter; units of `size`/`overlap` are defined by `RegexChunker`
# (not visible in this file).
chunker = RegexChunker(size=1024, overlap=0)
# Reranker applied to the merged keyword + vector search results.
reranker = CohereReranker()
# Converts fetched HTML into plain text via `extract_html`.
extractor = SimpleExtractor()


# Registry that backs the `inject` decorators and the search calls below.
# NOTE(review): the connection string hard-codes default credentials and the
# address 172.17.0.1 (presumably a local Docker bridge host) — adjust for
# your environment.
vr = VechordRegistry(
    "hybrid",
    "postgresql://postgres:postgres@172.17.0.1:5432/",
    tables=[DefaultDocument, Chunk],
)


@vr.inject(output=DefaultDocument)
async def load_document(title: str) -> DefaultDocument:
    """Download one page and wrap its extracted text in a document record.

    The page address is built by substituting ``title`` into the module-level
    ``URL`` template.

    Args:
        title: Value substituted into ``URL``; also stored as the document's
            title.

    Returns:
        A ``DefaultDocument`` carrying ``title`` and the text that
        ``extractor.extract_html`` produces from the response body.

    Raises:
        RuntimeError: If the HTTP response has ``is_error`` set.
    """
    # NOTE(review): `vr.inject(output=...)` presumably persists the returned
    # document through the registry — confirm against the vechord docs.
    async with httpx.AsyncClient() as http:
        response = await http.get(URL.format(title))
        if not response.is_error:
            plain_text = extractor.extract_html(response.text)
            return DefaultDocument(title=title, text=plain_text)
        raise RuntimeError(f"Failed to fetch the document `{title}`")


@vr.inject(input=DefaultDocument, output=Chunk)
async def chunk_document(uid: int, text: str) -> list[Chunk]:
    """Split a document's text into chunks and build one record per chunk.

    Each segment returned by ``chunker.segment`` is embedded with
    ``emb.vectorize_chunk`` (awaited one segment at a time, in order) and
    paired with a ``Keyword`` built from the same segment text.

    Args:
        uid: Identifier stored as ``doc_id`` on every produced chunk.
        text: Full document text to segment.

    Returns:
        The ``Chunk`` records, in the order the chunker produced the segments.
    """
    # NOTE(review): `main` calls this with no arguments, so the decorator
    # presumably supplies `uid`/`text` from stored `DefaultDocument` rows —
    # confirm against the vechord docs.
    records = []
    for segment in await chunker.segment(text):
        embedding = await emb.vectorize_chunk(segment)
        records.append(
            Chunk(
                doc_id=uid,
                text=segment,
                vec=embedding,
                keyword=Keyword(segment),
            )
        )
    return records


async def search_and_rerank(query: str, topk: int) -> list[Chunk]:
    """Run keyword and vector search, merge the hits, and rerank them.

    Both searches are asked for ``topk`` results. The two result lists are
    concatenated (keyword hits first) and de-duplicated by ``uid``; the merged
    chunk texts are then passed to ``reranker.rerank``, whose returned indices
    select the final chunks.

    Args:
        query: Search text used for keyword search, query embedding, and
            reranking.
        topk: Number of results requested from each search and the maximum
            number of chunks returned.

    Returns:
        At most ``topk`` chunks, in the order of the indices returned by the
        reranker.
    """
    keyword_hits = await vr.search_by_keyword(Chunk, query, topk=topk)
    query_vector = await emb.vectorize_query(query)
    vector_hits = await vr.search_by_vector(Chunk, query_vector, topk=topk)

    # De-duplicate by uid. A uid present in both lists keeps the position of
    # its first appearance but the object from the later (vector) list.
    by_uid = {}
    for hit in keyword_hits + vector_hits:
        by_uid[hit.uid] = hit
    candidates = list(by_uid.values())

    texts = [candidate.text for candidate in candidates]
    ranking = await reranker.rerank(query, texts)
    return [candidates[position] for position in ranking[:topk]]


async def main():
    """Run the example end to end: ingest one page, chunk it, search, print.

    The registry, embedding client, and reranker are entered as async context
    managers for the duration of the run.
    """
    page_title = "smart"
    search_text = "smart"
    result_count = 3
    async with vr, emb, reranker:
        await load_document(page_title)
        # Called without arguments; the `vr.inject` decorator is expected to
        # supply them (see the decorator on `chunk_document`).
        await chunk_document()
        top_chunks = await search_and_rerank(search_text, result_count)
        print(top_chunks)


if __name__ == "__main__":
    # Imported here rather than at module top; only needed when this file is
    # executed as a script.
    import asyncio

    # Start an event loop and run the whole example to completion.
    asyncio.run(main())