44from pgvector .psycopg import register_vector
55
66from vechord .log import logger
7- from vechord .model import TextFile
8- from vechord .text import EN_TEXT_PROCESSOR
7+ from vechord .model import Chunk , Document
98
109
1110class VectorChordClient :
12- def __init__ (self , url : str , autocommit : bool = True ):
11+ def __init__ (self , namespace : str , url : str , autocommit : bool = True ):
12+ self .ns = namespace
1313 self .url = url
1414 self .conn = psycopg .connect (url , autocommit = autocommit )
1515 self .conn .execute ("CREATE EXTENSION IF NOT EXISTS vchord CASCADE" )
1616 register_vector (self .conn )
1717
18- def create_namespace (self , namespace : str , dim : int = 96 ):
18+ def create (self , dim ):
1919 config = """
2020 residual_quantization = true
2121 [build.internal]
@@ -24,17 +24,17 @@ def create_namespace(self, namespace: str, dim: int = 96):
2424 """
2525 try :
2626 self .conn .execute (
27- f"CREATE TABLE IF NOT EXISTS { namespace } _meta "
27+ f"CREATE TABLE IF NOT EXISTS { self . ns } _meta "
2828 "(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
29- "name TEXT, digest TEXT)"
29+ "name TEXT, digest TEXT NOT NULL UNIQUE, updated_at TIMESTAMP )"
3030 )
3131 self .conn .execute (
32- f"CREATE TABLE IF NOT EXISTS { namespace } "
32+ f"CREATE TABLE IF NOT EXISTS { self . ns } "
3333 "(id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, "
3434 f"doc_id INT, content TEXT, embedding vector({ dim } ))"
3535 )
3636 self .conn .execute (
37- f"CREATE INDEX IF NOT EXISTS { namespace } _vector_idx ON { namespace } "
37+ f"CREATE INDEX IF NOT EXISTS { self . ns } _vector_idx ON { self . ns } "
3838 "USING vchordrq (embedding vector_l2_ops) WITH "
3939 f"(options = $${ config } $$)"
4040 )
@@ -44,35 +44,36 @@ def create_namespace(self, namespace: str, dim: int = 96):
4444 self .conn .rollback ()
4545 raise err
4646
47- def insert_text (self , namespace : str , textfile : TextFile ):
47+ def is_file_exists (self , doc : Document ) -> bool :
48+ cursor = self .conn .execute (
49+ f"SELECT id FROM { self .ns } _meta WHERE digest = %s" , (doc .digest ,)
50+ )
51+ return cursor .fetchone () is not None
52+
53+ def insert_text (self , doc : Document , chunks : list [Chunk ]):
4854 try :
4955 cursor = self .conn .execute (
50- f"INSERT INTO { namespace } _meta (name, digest) VALUES (%s, %s) RETURNING id" ,
51- (textfile . filename , textfile .digest ),
56+ f"INSERT INTO { self . ns } _meta (name, digest, updated_at ) VALUES (%s, %s, %s) RETURNING id" ,
57+ (doc . path , doc .digest , doc . updated_at ),
5258 )
5359 doc_id = cursor .fetchone ()[0 ]
54- for sentence in textfile . sentences :
60+ for chunk in chunks :
5561 self .conn .execute (
56- f"INSERT INTO { namespace } (doc_id, content, embedding) VALUES (%s, %s, %s)" ,
57- (doc_id , sentence .text , sentence .vector ),
62+ f"INSERT INTO { self . ns } (doc_id, content, embedding) VALUES (%s, %s, %s)" ,
63+ (doc_id , chunk .text , chunk .vector ),
5864 )
59- logger .debug (
60- "inserted %s sentences from file %s" ,
61- len (textfile .sentences ),
62- textfile .filename ,
63- )
65+ logger .debug ("inserted %s sentences from file %s" , len (chunks ), doc .path )
6466 except psycopg .errors .DatabaseError as err :
6567 logger .error (err )
6668 logger .info ("rollback from the previous error" )
6769 self .conn .rollback ()
6870 raise err
6971
70- def query (self , namespace : str , query : str , topk : int = 10 ):
72+ def query (self , query : Chunk , topk : int = 10 ) -> list [ str ] :
7173 start = perf_counter ()
72- query = EN_TEXT_PROCESSOR .process (query )
7374 try :
7475 cursor = self .conn .execute (
75- f"SELECT content FROM { namespace } ORDER BY embedding <-> %s LIMIT %s" ,
76+ f"SELECT content FROM { self . ns } ORDER BY embedding <-> %s LIMIT %s" ,
7677 (query .vector , topk ),
7778 )
7879 res = cursor .fetchall ()
0 commit comments