Skip to content

Commit 5e8bb89

Browse files
authored
fix: force llm to generate descriptions for entity and relation (#63)
Signed-off-by: Keming <kemingyang@tensorchord.ai>
1 parent 28a7cbd commit 5e8bb89

File tree

5 files changed

+83
-76
lines changed

5 files changed

+83
-76
lines changed

vechord/embedding.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,18 @@ async def vectorize_query(self, text: str) -> np.ndarray:
5454
class BaseMultiModalEmbedding(BaseEmbedding):
5555
@abstractmethod
5656
async def vectorize_multimodal_chunk(
57-
self, text: str, image: Optional[bytes] = None, image_url: Optional[str] = None
57+
self,
58+
text: Optional[str] = None,
59+
image: Optional[bytes] = None,
60+
image_url: Optional[str] = None,
5861
) -> np.ndarray:
5962
raise NotImplementedError
6063

6164
async def vectorize_multimodal_query(
62-
self, text: str, image: Optional[bytes] = None, image_url: Optional[str] = None
65+
self,
66+
text: Optional[str] = None,
67+
image: Optional[bytes] = None,
68+
image_url: Optional[str] = None,
6369
) -> np.ndarray:
6470
return await self.vectorize_multimodal_chunk(text, image, image_url)
6571

@@ -184,7 +190,10 @@ def name(self) -> str:
184190
return f"jina_emb_{self.model}_{self.dim}"
185191

186192
async def vectorize_multimodal_chunk(
187-
self, text: str, image: Optional[bytes] = None, image_url: Optional[str] = None
193+
self,
194+
text: Optional[str] = None,
195+
image: Optional[bytes] = None,
196+
image_url: Optional[str] = None,
188197
) -> np.ndarray:
189198
req = await self.query(
190199
JinaEmbeddingRequest.from_text_image(
@@ -198,7 +207,10 @@ async def vectorize_multimodal_chunk(
198207
return req.get_emb()
199208

200209
async def vectorize_multimodal_query(
201-
self, text: str, image: Optional[bytes] = None, image_url: Optional[str] = None
210+
self,
211+
text: Optional[str] = None,
212+
image: Optional[bytes] = None,
213+
image_url: Optional[str] = None,
202214
) -> np.ndarray:
203215
req = await self.query(
204216
JinaEmbeddingRequest.from_text_image(
@@ -268,8 +280,8 @@ def vec_type(self) -> VecType:
268280

269281
async def vectorize_multimodal_chunk(
270282
self,
271-
image: Optional[bytes] = None,
272283
text: Optional[str] = None,
284+
image: Optional[bytes] = None,
273285
image_url: Optional[str] = None,
274286
):
275287
resp = await self.query(
@@ -285,8 +297,8 @@ async def vectorize_multimodal_chunk(
285297

286298
async def vectorize_multimodal_query(
287299
self,
288-
image: Optional[bytes] = None,
289300
text: Optional[str] = None,
301+
image: Optional[bytes] = None,
290302
image_url: Optional[str] = None,
291303
):
292304
resp = await self.query(

vechord/graph.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ def recognize_with_relations(
134134
Entity could be person, location, org, event or category.
135135
"""
136136
RECOGNIZE_PROMPT_FIELD = """\n<document>\n{text}\n</document>\n"""
137+
RECOGNIZE_PROMPT_IMAGE = """
138+
Extract the readable text and generate a concise caption describing the image's content
139+
or scene. Use the text and caption as the passage text for named entity extraction.
140+
"""
137141

138142

139143
class GeminiEntityRecognizer(BaseEntityRecognizer, GeminiGenerateProvider):
@@ -184,13 +188,9 @@ async def recognize_image(
184188
self, img: bytes
185189
) -> tuple[list[GraphEntity], list[GraphRelation]]:
186190
"""Recognize entities & relations from the image."""
187-
prompt = (
188-
"Given the image, first summarize it and extract readable text."
189-
f"{self.prompt}"
190-
)
191191
resp = await self.query(
192192
GeminiGenerateRequest.from_prompt_data_structure_resp(
193-
prompt=prompt,
193+
prompt=self.prompt.format(text=RECOGNIZE_PROMPT_IMAGE),
194194
mime_type=GeminiMimeType.JPEG,
195195
data=img,
196196
schema=msgspec.json.schema(list[GraphRelation]),

vechord/model/internal.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ class GraphEntity(msgspec.Struct, kw_only=True, frozen=True):
1414
"""
1515

1616
text: str
17-
label: str = ""
18-
description: str = ""
17+
label: str
18+
description: str
1919

2020

2121
class GraphRelation(msgspec.Struct, kw_only=True, frozen=True):
@@ -28,7 +28,7 @@ class GraphRelation(msgspec.Struct, kw_only=True, frozen=True):
2828

2929
source: GraphEntity
3030
target: GraphEntity
31-
description: str = ""
31+
description: str
3232

3333

3434
class Document(msgspec.Struct, kw_only=True):

vechord/model/voyage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ class VoyageMultiModalEmbeddingRequest(msgspec.Struct, kw_only=True):
8484
def build(
8585
cls,
8686
text: Optional[str],
87-
image_url: Optional[str],
8887
image: Optional[bytes],
88+
image_url: Optional[str],
8989
model: str,
9090
input_type: VOYAGE_INPUT_TYPE,
9191
) -> Self:

vechord/pipeline.py

Lines changed: 56 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
RunRequest,
3838
RunResponse,
3939
)
40-
from vechord.rerank import CohereReranker, JinaReranker
40+
from vechord.rerank import BaseReranker, CohereReranker, JinaReranker
4141
from vechord.spec import (
4242
AnyOf,
4343
DefaultDocument,
@@ -167,7 +167,7 @@ class DynamicPipeline(msgspec.Struct, kw_only=True):
167167
text_emb: Optional[BaseTextEmbedding] = None
168168
multimodal_emb: Optional[BaseMultiModalEmbedding] = None
169169
ocr: Optional[GeminiExtractor] = None
170-
rerank: Optional[CohereReranker] = None
170+
rerank: Optional[BaseReranker] = None
171171
index: Optional[IndexOption] = None
172172
search: Optional[SearchOption] = None
173173
graph: Optional[GeminiEntityRecognizer] = None
@@ -280,71 +280,58 @@ async def run_index(self, request: RunRequest, vr: "VechordRegistry") -> RunAck:
280280
doc_id=doc.uid,
281281
text="",
282282
text_type=request.input_type.value,
283-
Keyword=None,
283+
keyword=None,
284284
vec=None,
285285
)
286-
if self.multimodal_emb:
287-
chunks.append(
288-
Chunk(
289-
doc_id=doc.uid,
290-
text=base64.b64encode(request.data).decode("utf-8"),
291-
text_type=request.input_type.value,
292-
keyword=None,
293-
vec=await self.multimodal_emb.vectorize_multimodal_chunk(
294-
request.data
295-
),
296-
)
286+
if self.multimodal_emb and request.input_type is not InputType.TEXT:
287+
# reuse the fake chunk to ensure the chunk uid is unique
288+
fake_chunk.text = base64.b64encode(request.data).decode("utf-8")
289+
fake_chunk.vec = await self.multimodal_emb.vectorize_multimodal_chunk(
290+
image=request.data
291+
)
292+
chunks.append(fake_chunk)
293+
if request.input_type is InputType.TEXT:
294+
doc.text = request.data.decode("utf-8")
295+
elif self.ocr:
296+
if request.input_type is InputType.PDF:
297+
doc.text = await self.ocr.extract_pdf(request.data)
298+
elif request.input_type is InputType.IMAGE:
299+
doc.text = await self.ocr.extract_image(request.data)
300+
if self.chunk:
301+
sentences.extend(await self.chunk.segment(doc.text))
302+
elif doc.text:
303+
sentences.append(doc.text)
304+
305+
for sent in sentences:
306+
chunk = Chunk(
307+
vec=await self.text_emb.vectorize_chunk(sent),
308+
doc_id=doc.uid,
309+
text=sent,
310+
keyword=None if not enable_keyword_index else Keyword(sent),
297311
)
298-
else:
299-
if request.input_type is InputType.TEXT:
300-
doc.text = request.data.decode("utf-8")
301-
elif self.ocr:
302-
if request.input_type is InputType.PDF:
303-
doc.text = await self.ocr.extract_pdf(request.data)
304-
elif request.input_type is InputType.IMAGE:
305-
doc.text = await self.ocr.extract_image(request.data)
306-
elif self.graph:
307-
fake_chunk.text = base64.b64encode(request.data).decode("utf-8")
308-
img_ents, img_rels = await self.graph.recognize_image(request.data)
312+
chunks.append(chunk)
313+
if self.graph and request.input_type is InputType.TEXT:
314+
chunk_ents, chunk_rels = await self.graph.recognize_with_relations(sent)
309315
conv_ents, conv_rels = self._convert_from_extracted_graph(
310-
fake_chunk.uid, img_ents, img_rels, Entity, Relation
316+
chunk.uid, chunk_ents, chunk_rels, Entity, Relation
311317
)
312318
ents.extend(conv_ents)
313319
rels.extend(conv_rels)
314-
else:
315-
raise RequestError(
316-
f"No OCR or Graph provider for input type: {request.input_type}"
317-
)
318320

319-
if self.chunk:
320-
sentences.extend(await self.chunk.segment(doc.text))
321-
elif doc.text:
322-
sentences.append(doc.text)
323-
for sent in sentences:
324-
chunk = Chunk(
325-
vec=await self.text_emb.vectorize_chunk(sent),
326-
doc_id=doc.uid,
327-
text=sent,
328-
keyword=None if not enable_keyword_index else Keyword(sent),
329-
)
330-
if self.graph and request.input_type is InputType.TEXT:
331-
chunk_ents, chunk_rels = await self.graph.recognize_with_relations(
332-
sent
333-
)
334-
conv_ents, conv_rels = self._convert_from_extracted_graph(
335-
chunk.uid, chunk_ents, chunk_rels, Entity, Relation
336-
)
337-
ents.extend(conv_ents)
338-
rels.extend(conv_rels)
339-
chunks.append(chunk)
321+
if self.graph and request.input_type is not InputType.TEXT and not sentences:
322+
img_ents, img_rels = await self.graph.recognize_image(request.data)
323+
conv_ents, conv_rels = self._convert_from_extracted_graph(
324+
fake_chunk.uid, img_ents, img_rels, Entity, Relation
325+
)
326+
ents.extend(conv_ents)
327+
rels.extend(conv_rels)
328+
if not self.multimodal_emb:
329+
chunks.append(fake_chunk)
340330

341331
await vr.insert(doc)
342332
for chunk in chunks:
343333
await vr.insert(chunk)
344334
if self.index.graph:
345-
if request.input_type is not InputType.TEXT:
346-
# insert the fake chunk for image/pdf
347-
await vr.insert(fake_chunk)
348335
await self.graph_insert(
349336
ents=ents, rels=rels, ent_cls=Entity, rel_cls=Relation, vr=vr
350337
)
@@ -360,6 +347,11 @@ async def graph_insert(
360347
):
361348
"""Insert entities and relations into the graph index."""
362349
ent_map: dict[str, _Entity] = {}
350+
emb_func = (
351+
self.text_emb.vectorize_chunk
352+
if self.text_emb
353+
else self.multimodal_emb.vectorize_multimodal_chunk
354+
)
363355
for ent in ents:
364356
if ent.text not in ent_map:
365357
ent_map[ent.text] = ent
@@ -376,9 +368,7 @@ async def graph_insert(
376368
ent.chunk_uuids.extend(exist.chunk_uuids)
377369
ent.description += f"\n{exist.description}"
378370
await vr.remove_by(ent_cls.partial_init(uid=exist.uid))
379-
ent.vec = await self.text_emb.vectorize_chunk(
380-
f"{ent.text}\n{ent.description}"
381-
)
371+
ent.vec = await emb_func(f"{ent.text}\n{ent.description}")
382372
await vr.insert(ent)
383373

384374
relation_map: dict[str, _Relation] = {}
@@ -397,7 +387,7 @@ async def graph_insert(
397387
exist = exist_rel[0]
398388
rel.description += f"\n{exist.description}"
399389
await vr.remove_by(rel_cls.partial_init(uid=exist.uid))
400-
rel.vec = await self.text_emb.vectorize_chunk(f"{rel.description}")
390+
rel.vec = await emb_func(f"{rel.description}")
401391
await vr.insert(rel)
402392

403393
async def run_search(
@@ -437,7 +427,7 @@ class Relation(_Relation):
437427
if self.multimodal_emb:
438428
indices = await self.rerank.rerank_multimodal(
439429
query=query,
440-
chunks=[chunk.text for chunk in resp],
430+
chunks=[chunk.text for chunk in resp.chunks],
441431
doc_type=resp.chunk_type,
442432
)
443433
else:
@@ -461,11 +451,16 @@ async def graph_search(
461451
vr: "VechordRegistry",
462452
):
463453
ents, rels = await self.graph.recognize_with_relations(query)
454+
emb_func = (
455+
self.text_emb.vectorize_query
456+
if self.text_emb
457+
else self.multimodal_emb.vectorize_multimodal_query
458+
)
464459
if rels:
465460
rel_text = " ".join(rel.description for rel in rels)
466461
similar_rels = await vr.search_by_vector(
467462
rel_cls,
468-
await self.text_emb.vectorize_query(rel_text),
463+
await emb_func(rel_text),
469464
topk=self.search.graph.similar_k,
470465
)
471466
ent_uuids = deduplicate_uid(
@@ -484,7 +479,7 @@ async def graph_search(
484479
ent_text = " ".join(f"{ent.text} {ent.description}" for ent in ents)
485480
similar_ents = await vr.search_by_vector(
486481
ent_cls,
487-
await self.text_emb.vectorize_query(ent_text),
482+
await emb_func(ent_text),
488483
topk=self.search.graph.similar_k,
489484
)
490485
chunk_uuids = deduplicate_uid(

0 commit comments

Comments (0)