3737 RunRequest ,
3838 RunResponse ,
3939)
40- from vechord .rerank import CohereReranker , JinaReranker
40+ from vechord .rerank import BaseReranker , CohereReranker , JinaReranker
4141from vechord .spec import (
4242 AnyOf ,
4343 DefaultDocument ,
@@ -167,7 +167,7 @@ class DynamicPipeline(msgspec.Struct, kw_only=True):
167167 text_emb : Optional [BaseTextEmbedding ] = None
168168 multimodal_emb : Optional [BaseMultiModalEmbedding ] = None
169169 ocr : Optional [GeminiExtractor ] = None
170- rerank : Optional [CohereReranker ] = None
170+ rerank : Optional [BaseReranker ] = None
171171 index : Optional [IndexOption ] = None
172172 search : Optional [SearchOption ] = None
173173 graph : Optional [GeminiEntityRecognizer ] = None
@@ -280,71 +280,58 @@ async def run_index(self, request: RunRequest, vr: "VechordRegistry") -> RunAck:
280280 doc_id = doc .uid ,
281281 text = "" ,
282282 text_type = request .input_type .value ,
283- Keyword = None ,
283+ keyword = None ,
284284 vec = None ,
285285 )
286- if self .multimodal_emb :
287- chunks .append (
288- Chunk (
289- doc_id = doc .uid ,
290- text = base64 .b64encode (request .data ).decode ("utf-8" ),
291- text_type = request .input_type .value ,
292- keyword = None ,
293- vec = await self .multimodal_emb .vectorize_multimodal_chunk (
294- request .data
295- ),
296- )
286+ if self .multimodal_emb and request .input_type is not InputType .TEXT :
287+ # reuse the fake chunk to ensure the chunk uid is unique
288+ fake_chunk .text = base64 .b64encode (request .data ).decode ("utf-8" )
289+ fake_chunk .vec = await self .multimodal_emb .vectorize_multimodal_chunk (
290+ image = request .data
291+ )
292+ chunks .append (fake_chunk )
293+ if request .input_type is InputType .TEXT :
294+ doc .text = request .data .decode ("utf-8" )
295+ elif self .ocr :
296+ if request .input_type is InputType .PDF :
297+ doc .text = await self .ocr .extract_pdf (request .data )
298+ elif request .input_type is InputType .IMAGE :
299+ doc .text = await self .ocr .extract_image (request .data )
300+ if self .chunk :
301+ sentences .extend (await self .chunk .segment (doc .text ))
302+ elif doc .text :
303+ sentences .append (doc .text )
304+
305+ for sent in sentences :
306+ chunk = Chunk (
307+ vec = await self .text_emb .vectorize_chunk (sent ),
308+ doc_id = doc .uid ,
309+ text = sent ,
310+ keyword = None if not enable_keyword_index else Keyword (sent ),
297311 )
298- else :
299- if request .input_type is InputType .TEXT :
300- doc .text = request .data .decode ("utf-8" )
301- elif self .ocr :
302- if request .input_type is InputType .PDF :
303- doc .text = await self .ocr .extract_pdf (request .data )
304- elif request .input_type is InputType .IMAGE :
305- doc .text = await self .ocr .extract_image (request .data )
306- elif self .graph :
307- fake_chunk .text = base64 .b64encode (request .data ).decode ("utf-8" )
308- img_ents , img_rels = await self .graph .recognize_image (request .data )
312+ chunks .append (chunk )
313+ if self .graph and request .input_type is InputType .TEXT :
314+ chunk_ents , chunk_rels = await self .graph .recognize_with_relations (sent )
309315 conv_ents , conv_rels = self ._convert_from_extracted_graph (
310- fake_chunk .uid , img_ents , img_rels , Entity , Relation
316+ chunk .uid , chunk_ents , chunk_rels , Entity , Relation
311317 )
312318 ents .extend (conv_ents )
313319 rels .extend (conv_rels )
314- else :
315- raise RequestError (
316- f"No OCR or Graph provider for input type: { request .input_type } "
317- )
318320
319- if self .chunk :
320- sentences .extend (await self .chunk .segment (doc .text ))
321- elif doc .text :
322- sentences .append (doc .text )
323- for sent in sentences :
324- chunk = Chunk (
325- vec = await self .text_emb .vectorize_chunk (sent ),
326- doc_id = doc .uid ,
327- text = sent ,
328- keyword = None if not enable_keyword_index else Keyword (sent ),
329- )
330- if self .graph and request .input_type is InputType .TEXT :
331- chunk_ents , chunk_rels = await self .graph .recognize_with_relations (
332- sent
333- )
334- conv_ents , conv_rels = self ._convert_from_extracted_graph (
335- chunk .uid , chunk_ents , chunk_rels , Entity , Relation
336- )
337- ents .extend (conv_ents )
338- rels .extend (conv_rels )
339- chunks .append (chunk )
321+ if self .graph and request .input_type is not InputType .TEXT and not sentences :
322+ img_ents , img_rels = await self .graph .recognize_image (request .data )
323+ conv_ents , conv_rels = self ._convert_from_extracted_graph (
324+ fake_chunk .uid , img_ents , img_rels , Entity , Relation
325+ )
326+ ents .extend (conv_ents )
327+ rels .extend (conv_rels )
328+ if not self .multimodal_emb :
329+ chunks .append (fake_chunk )
340330
341331 await vr .insert (doc )
342332 for chunk in chunks :
343333 await vr .insert (chunk )
344334 if self .index .graph :
345- if request .input_type is not InputType .TEXT :
346- # insert the fake chunk for image/pdf
347- await vr .insert (fake_chunk )
348335 await self .graph_insert (
349336 ents = ents , rels = rels , ent_cls = Entity , rel_cls = Relation , vr = vr
350337 )
@@ -360,6 +347,11 @@ async def graph_insert(
360347 ):
361348 """Insert entities and relations into the graph index."""
362349 ent_map : dict [str , _Entity ] = {}
350+ emb_func = (
351+ self .text_emb .vectorize_chunk
352+ if self .text_emb
353+ else self .multimodal_emb .vectorize_multimodal_chunk
354+ )
363355 for ent in ents :
364356 if ent .text not in ent_map :
365357 ent_map [ent .text ] = ent
@@ -376,9 +368,7 @@ async def graph_insert(
376368 ent .chunk_uuids .extend (exist .chunk_uuids )
377369 ent .description += f"\n { exist .description } "
378370 await vr .remove_by (ent_cls .partial_init (uid = exist .uid ))
379- ent .vec = await self .text_emb .vectorize_chunk (
380- f"{ ent .text } \n { ent .description } "
381- )
371+ ent .vec = await emb_func (f"{ ent .text } \n { ent .description } " )
382372 await vr .insert (ent )
383373
384374 relation_map : dict [str , _Relation ] = {}
@@ -397,7 +387,7 @@ async def graph_insert(
397387 exist = exist_rel [0 ]
398388 rel .description += f"\n { exist .description } "
399389 await vr .remove_by (rel_cls .partial_init (uid = exist .uid ))
400- rel .vec = await self . text_emb . vectorize_chunk (f"{ rel .description } " )
390+ rel .vec = await emb_func (f"{ rel .description } " )
401391 await vr .insert (rel )
402392
403393 async def run_search (
@@ -437,7 +427,7 @@ class Relation(_Relation):
437427 if self .multimodal_emb :
438428 indices = await self .rerank .rerank_multimodal (
439429 query = query ,
440- chunks = [chunk .text for chunk in resp ],
430+ chunks = [chunk .text for chunk in resp . chunks ],
441431 doc_type = resp .chunk_type ,
442432 )
443433 else :
@@ -461,11 +451,16 @@ async def graph_search(
461451 vr : "VechordRegistry" ,
462452 ):
463453 ents , rels = await self .graph .recognize_with_relations (query )
454+ emb_func = (
455+ self .text_emb .vectorize_query
456+ if self .text_emb
457+ else self .multimodal_emb .vectorize_multimodal_query
458+ )
464459 if rels :
465460 rel_text = " " .join (rel .description for rel in rels )
466461 similar_rels = await vr .search_by_vector (
467462 rel_cls ,
468- await self . text_emb . vectorize_query (rel_text ),
463+ await emb_func (rel_text ),
469464 topk = self .search .graph .similar_k ,
470465 )
471466 ent_uuids = deduplicate_uid (
@@ -484,7 +479,7 @@ async def graph_search(
484479 ent_text = " " .join (f"{ ent .text } { ent .description } " for ent in ents )
485480 similar_ents = await vr .search_by_vector (
486481 ent_cls ,
487- await self . text_emb . vectorize_query (ent_text ),
482+ await emb_func (ent_text ),
488483 topk = self .search .graph .similar_k ,
489484 )
490485 chunk_uuids = deduplicate_uid (
0 commit comments