From fa29289a912c7e3231ebea6d37e103cc2e3abeb9 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 11:38:32 -0700 Subject: [PATCH 01/11] update deprecated 2.5 Flash model name --- dsrag/dsparse/tests/unit/test_vlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsrag/dsparse/tests/unit/test_vlm.py b/dsrag/dsparse/tests/unit/test_vlm.py index 51094604..9182c561 100644 --- a/dsrag/dsparse/tests/unit/test_vlm.py +++ b/dsrag/dsparse/tests/unit/test_vlm.py @@ -65,7 +65,7 @@ def test_gemini_2_5_with_simple_schema(self): result = make_llm_call_gemini( image_path=test_image_path, system_message="Describe this image in a JSON object with a 'description' field.", - model="gemini-2.5-flash-preview-04-17", + model="gemini-2.5-flash", response_schema=test_schema, temperature=0.2 ) @@ -169,7 +169,7 @@ def test_gemini_2_5_with_complex_schema(self): result = make_llm_call_gemini( image_path=test_image_path, system_message=system_message, - model="gemini-2.5-flash-preview-04-17", + model="gemini-2.5-flash", response_schema=complex_schema, temperature=0.5 ) From 83b4bd0dd4b59a2efdf906f4ec6bcb00bb7e2fbb Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 11:49:10 -0700 Subject: [PATCH 02/11] Task #426: Introduce VLM ABC and Provider Clients (Gemini, Vertex) with backward-compatible wrappers Create a reusable VLM client abstraction and concrete implementations (Gemini, Vertex) while keeping current free functions working. No wiring changes yet. --- dsrag/dsparse/file_parsing/vlm.py | 101 ++------ dsrag/dsparse/file_parsing/vlm_clients.py | 237 +++++++++++++++++++ dsrag/dsparse/tests/unit/test_vlm_clients.py | 42 ++++ 3 files changed, 300 insertions(+), 80 deletions(-) create mode 100644 dsrag/dsparse/file_parsing/vlm_clients.py create mode 100644 dsrag/dsparse/tests/unit/test_vlm_clients.py diff --git a/dsrag/dsparse/file_parsing/vlm.py b/dsrag/dsparse/file_parsing/vlm.py index 661e68bb..33ef8bc9 100644 --- a/dsrag/dsparse/file_parsing/vlm.py +++ b/dsrag/dsparse/file_parsing/vlm.py @@ -1,95 +1,36 @@ import PIL.Image -import os import io -from ..utils.imports import vertexai, genai_new +from .vlm_clients import GeminiVLM, VertexAIVLM def make_llm_call_gemini(image_path: str, system_message: str, model: str = "gemini-2.0-flash", response_schema: dict = None, max_tokens: int = 4000, temperature: float = 0.5) -> str: - # With the newer Google GenAI SDK, we need to create a client - client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"]) + """ + Backward-compatible free function that delegates to GeminiVLM. - # Create generation config with the correct GenerateContentConfig type - config = genai_new.types.GenerateContentConfig( + Signature and behavior are preserved for compatibility. + """ + client = GeminiVLM(model=model) + return client.make_llm_call( + image_path=image_path, + system_message=system_message, + response_schema=response_schema, + max_tokens=max_tokens, temperature=temperature, - max_output_tokens=max_tokens, - response_mime_type="application/json" ) - # Add response schema if provided - if response_schema is not None: - config.response_schema = response_schema - - try: - # Open and compress the image - image = PIL.Image.open(image_path) - compressed_image_bytes, _ = compress_image(image) # Quality is returned but not used here - - # Close the original image object now that compression is done - if image: - image.close() - # The 'image' variable still exists and will be handled by the finally block, - # PIL's close() is typically safe to call multiple times. - - # Create content parts using bytes - image_part = genai_new.types.Part.from_bytes(data=compressed_image_bytes, mime_type='image/jpeg') - content_parts = [image_part, system_message] - - # For Gemini 2.5 models, disable thinking - if model.startswith("gemini-2.5"): - # Create a new config with thinking disabled by setting thinking_config - gemini25_config = genai_new.types.GenerateContentConfig( - temperature=temperature, - max_output_tokens=max_tokens, - response_mime_type="application/json", - thinking_config=genai_new.types.ThinkingConfig(thinking_budget=0) - ) - - # Add response schema if provided - if response_schema is not None: - gemini25_config.response_schema = response_schema - - # Generate content with thinking disabled - response = client.models.generate_content( - model=model, - contents=content_parts, - config=gemini25_config - ) - else: - # Standard call for other Gemini models - response = client.models.generate_content( - model=model, - contents=content_parts, - config=config - ) - - return response.text - finally: - # Ensure image is closed even if an error occurs - if 'image' in locals() and image: # Check if image was defined and not None - try: - image.close() # Attempt to close; safe if already closed - except Exception: - pass # Ignore errors if it fails (e.g., trying to close a None object or already closed and problematic) - def make_llm_call_vertex(image_path: str, system_message: str, model: str, project_id: str, location: str, response_schema: dict = None, max_tokens: int = 4000, temperature: float = 0.5) -> str: """ - This function calls the Vertex AI Gemini API (not to be confused with the Gemini API) with an image and a system message and returns the response text. + Backward-compatible free function that delegates to VertexAIVLM. + + Signature and behavior are preserved for compatibility. """ - vertexai.init(project=project_id, location=location) - model = vertexai.generative_models.GenerativeModel(model) - - if response_schema is not None: - generation_config = vertexai.generative_models.GenerationConfig(temperature=temperature, max_output_tokens=max_tokens, response_mime_type="application/json", response_schema=response_schema) - else: - generation_config = vertexai.generative_models.GenerationConfig(temperature=temperature, max_output_tokens=max_tokens) - - response = model.generate_content( - [ - vertexai.generative_models.Part.from_image(vertexai.generative_models.Image.load_from_file(image_path)), - system_message, - ], - generation_config=generation_config, + client = VertexAIVLM(model=model, project_id=project_id, location=location) + return client.make_llm_call( + image_path=image_path, + system_message=system_message, + response_schema=response_schema, + max_tokens=max_tokens, + temperature=temperature, ) - return response.text def compress_image(image: PIL.Image.Image, max_size_bytes: int = 1097152, quality: int = 95) -> tuple[bytes, int]: """ diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py new file mode 100644 index 00000000..14d8d410 --- /dev/null +++ b/dsrag/dsparse/file_parsing/vlm_clients.py @@ -0,0 +1,237 @@ +""" +Visual Language Model (VLM) abstraction and concrete clients. + +This module introduces an abstraction similar to LLM/Embedding/Reranker to allow +users to extend VLM providers/models via subclassing while maintaining backward +compatibility with existing free functions in vlm.py. + +Design notes: +- Subclasses are registered via __init_subclass__ for simple construction from + serialized config dicts. +- The public free functions in vlm.py delegate to these clients and preserve + their original signatures. +- To avoid circular imports with vlm.compress_image, GeminiVLM imports + compress_image inside the make_llm_call method scope. +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Type +import os + +from ..utils.imports import genai_new, vertexai # lazy loaders + + +class VLM(ABC): + """Abstract base class for Visual Language Model clients. + + Subclasses should implement the make_llm_call method to perform the actual + provider-specific API call and return the response text. + + Subclass registration + --------------------- + Each subclass is automatically registered by class name using + __init_subclass__. This allows simple construction from a configuration + dictionary via from_dict. + """ + + # Registry of subclass name -> subclass type + _subclasses: Dict[str, Type["VLM"]] = {} + + def __init_subclass__(cls, **kwargs): # type: ignore[override] + super().__init_subclass__(**kwargs) + # Register subclass by its class name for from_dict factory construction + VLM._subclasses[cls.__name__] = cls + + def to_dict(self) -> Dict[str, Any]: + """Serialize this VLM instance to a dictionary. + + Returns a dict containing the subclass name and public fields + required to reconstruct the instance with from_dict. + """ + # Default implementation serializes __dict__ (public fields) plus subclass name + data = {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + data["subclass_name"] = self.__class__.__name__ + return data + + @classmethod + def from_dict(cls, config: Dict[str, Any]) -> "VLM": + """Construct a VLM instance from a serialized config dictionary. + + The config must contain a "subclass_name" key identifying the + registered subclass. Remaining keys are forwarded to the subclass + constructor as keyword arguments. + """ + subclass_name = config.get("subclass_name") + if not subclass_name: + raise ValueError("config must include 'subclass_name'") + subclass = cls._subclasses.get(subclass_name) + if subclass is None: + raise ValueError(f"Unknown VLM subclass: {subclass_name}") + kwargs = {k: v for k, v in config.items() if k != "subclass_name"} + return subclass(**kwargs) # type: ignore[arg-type] + + @abstractmethod + def make_llm_call( + self, + image_path: str, + system_message: str, + response_schema: Optional[Dict[str, Any]] = None, + max_tokens: int = 4000, + temperature: float = 0.5, + ) -> str: + """Perform a VLM call and return the raw response text.""" + raise NotImplementedError + + +class GeminiVLM(VLM): + """VLM client for Google Gemini via the new google-genai SDK. + + Fields + ------ + - model: Gemini model name, default "gemini-2.0-flash". + + Behavior + -------- + - Uses dsrag.dsparse.utils.imports.genai_new (lazy) to construct a + Client(api_key=os.environ["GEMINI_API_KEY"]). + - Builds GenerateContentConfig with response_mime_type="application/json", + including optional response_schema when provided. + - For models starting with "gemini-2.5", sets a ThinkingConfig with + thinking_budget=0 to disable thinking as per existing behavior. + - Compresses images using compress_image from vlm.py. + """ + + def __init__(self, model: str = "gemini-2.0-flash") -> None: + self.model = model + + def make_llm_call( + self, + image_path: str, + system_message: str, + response_schema: Optional[Dict[str, Any]] = None, + max_tokens: int = 4000, + temperature: float = 0.5, + ) -> str: + # Local import to avoid circular dependency at module import time + from .vlm import compress_image # noqa: WPS433 (allow local import) + import PIL.Image # used only when this method is executed + import io + + # Create client using lazy loader + client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"]) # type: ignore[attr-defined] + + # Base generation config + config = genai_new.types.GenerateContentConfig( # type: ignore[attr-defined] + temperature=temperature, + max_output_tokens=max_tokens, + response_mime_type="application/json", + ) + if response_schema is not None: + config.response_schema = response_schema + + image = None + try: + # Open and compress the image + image = PIL.Image.open(image_path) + compressed_image_bytes, _ = compress_image(image) + + # Close the original image (safe to call multiple times) + if image: + image.close() + + # Create content parts using bytes + image_part = genai_new.types.Part.from_bytes( # type: ignore[attr-defined] + data=compressed_image_bytes, + mime_type="image/jpeg", + ) + content_parts = [image_part, system_message] + + # For Gemini 2.5 models, disable thinking + if self.model.startswith("gemini-2.5"): + gemini25_config = genai_new.types.GenerateContentConfig( # type: ignore[attr-defined] + temperature=temperature, + max_output_tokens=max_tokens, + response_mime_type="application/json", + thinking_config=genai_new.types.ThinkingConfig(thinking_budget=0), # type: ignore[attr-defined] + ) + if response_schema is not None: + gemini25_config.response_schema = response_schema + + response = client.models.generate_content( # type: ignore[attr-defined] + model=self.model, + contents=content_parts, + config=gemini25_config, + ) + else: + response = client.models.generate_content( # type: ignore[attr-defined] + model=self.model, + contents=content_parts, + config=config, + ) + return response.text + finally: + if image is not None: + try: + image.close() + except Exception: + pass + + +class VertexAIVLM(VLM): + """VLM client for Vertex AI Gemini API. + + Fields + ------ + - model: Vertex AI model name (required) + - project_id: GCP project id (required) + - location: GCP location/region (required) + + Behavior + -------- + - Uses dsrag.dsparse.utils.imports.vertexai (lazy) to initialize and call + the vertexai.generative_models API. + - Builds GenerationConfig with optional response_schema and returns + response.text + """ + + def __init__(self, model: str, project_id: str, location: str) -> None: + self.model = model + self.project_id = project_id + self.location = location + + def make_llm_call( + self, + image_path: str, + system_message: str, + response_schema: Optional[Dict[str, Any]] = None, + max_tokens: int = 4000, + temperature: float = 0.5, + ) -> str: + # Initialize Vertex AI client + vertexai.init(project=self.project_id, location=self.location) # type: ignore[attr-defined] + model = vertexai.generative_models.GenerativeModel(self.model) # type: ignore[attr-defined] + + if response_schema is not None: + generation_config = vertexai.generative_models.GenerationConfig( # type: ignore[attr-defined] + temperature=temperature, + max_output_tokens=max_tokens, + response_mime_type="application/json", + response_schema=response_schema, + ) + else: + generation_config = vertexai.generative_models.GenerationConfig( # type: ignore[attr-defined] + temperature=temperature, + max_output_tokens=max_tokens, + ) + + response = model.generate_content( + [ + vertexai.generative_models.Part.from_image( # type: ignore[attr-defined] + vertexai.generative_models.Image.load_from_file(image_path) # type: ignore[attr-defined] + ), + system_message, + ], + generation_config=generation_config, + ) + return response.text diff --git a/dsrag/dsparse/tests/unit/test_vlm_clients.py b/dsrag/dsparse/tests/unit/test_vlm_clients.py new file mode 100644 index 00000000..70833bc9 --- /dev/null +++ b/dsrag/dsparse/tests/unit/test_vlm_clients.py @@ -0,0 +1,42 @@ +import os +import sys +import unittest + +# Add the project root to the path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) + +from dsrag.dsparse.file_parsing.vlm_clients import VLM, GeminiVLM, VertexAIVLM + + +class TestVLMClients(unittest.TestCase): + def test_registry_contains_subclasses(self): + self.assertIn("GeminiVLM", VLM._subclasses) + self.assertIn("VertexAIVLM", VLM._subclasses) + self.assertIs(VLM._subclasses["GeminiVLM"], GeminiVLM) + self.assertIs(VLM._subclasses["VertexAIVLM"], VertexAIVLM) + + def test_gemini_to_from_dict_roundtrip(self): + client = GeminiVLM() + as_dict = client.to_dict() + self.assertEqual(as_dict.get("subclass_name"), "GeminiVLM") + self.assertIn("model", as_dict) + rebuilt = VLM.from_dict(as_dict) + self.assertIsInstance(rebuilt, GeminiVLM) + self.assertEqual(rebuilt.model, client.model) + + def test_vertex_to_from_dict_roundtrip(self): + client = VertexAIVLM(model="gemini-1.5-flash", project_id="proj", location="us-central1") + as_dict = client.to_dict() + self.assertEqual(as_dict.get("subclass_name"), "VertexAIVLM") + self.assertEqual(as_dict.get("model"), "gemini-1.5-flash") + self.assertEqual(as_dict.get("project_id"), "proj") + self.assertEqual(as_dict.get("location"), "us-central1") + rebuilt = VLM.from_dict(as_dict) + self.assertIsInstance(rebuilt, VertexAIVLM) + self.assertEqual(rebuilt.model, client.model) + self.assertEqual(rebuilt.project_id, client.project_id) + self.assertEqual(rebuilt.location, client.location) + + +if __name__ == "__main__": + unittest.main() From 0b6a2b5ac444c06d00964efc19a377d99affb993 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 12:10:57 -0700 Subject: [PATCH 03/11] formatting fixes --- dsrag/dsparse/file_parsing/vlm_clients.py | 15 --------------- dsrag/dsparse/tests/unit/test_vlm.py | 1 - 2 files changed, 16 deletions(-) diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py index 14d8d410..e628838a 100644 --- a/dsrag/dsparse/file_parsing/vlm_clients.py +++ b/dsrag/dsparse/file_parsing/vlm_clients.py @@ -1,18 +1,3 @@ -""" -Visual Language Model (VLM) abstraction and concrete clients. - -This module introduces an abstraction similar to LLM/Embedding/Reranker to allow -users to extend VLM providers/models via subclassing while maintaining backward -compatibility with existing free functions in vlm.py. - -Design notes: -- Subclasses are registered via __init_subclass__ for simple construction from - serialized config dicts. -- The public free functions in vlm.py delegate to these clients and preserve - their original signatures. -- To avoid circular imports with vlm.compress_image, GeminiVLM imports - compress_image inside the make_llm_call method scope. -""" from __future__ import annotations from abc import ABC, abstractmethod diff --git a/dsrag/dsparse/tests/unit/test_vlm.py b/dsrag/dsparse/tests/unit/test_vlm.py index 9182c561..ed6b23a4 100644 --- a/dsrag/dsparse/tests/unit/test_vlm.py +++ b/dsrag/dsparse/tests/unit/test_vlm.py @@ -3,7 +3,6 @@ import unittest import json - # Add the project root to the path sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) From 1e8ce0b7e5adbd6e7cb73c37ffbb2df7df42c015 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 12:26:18 -0700 Subject: [PATCH 04/11] Task #427: Wire VLM ABC into parsing and KnowledgeBase (backward compatible) Integrate the new VLM clients into dsparse parsing and KnowledgeBase while preserving legacy dict-based configs and behavior. --- .../dsparse/file_parsing/vlm_file_parsing.py | 64 +++++++++++++++++-- dsrag/dsparse/main.py | 12 +++- dsrag/dsparse/models/types.py | 5 +- dsrag/knowledge_base.py | 52 +++++++++++++-- 4 files changed, 120 insertions(+), 13 deletions(-) diff --git a/dsrag/dsparse/file_parsing/vlm_file_parsing.py b/dsrag/dsparse/file_parsing/vlm_file_parsing.py index 3fe646ea..02dce94c 100644 --- a/dsrag/dsparse/file_parsing/vlm_file_parsing.py +++ b/dsrag/dsparse/file_parsing/vlm_file_parsing.py @@ -1,6 +1,8 @@ from .vlm import make_llm_call_gemini, make_llm_call_vertex from ..models.types import ElementType, Element, VLMConfig from .file_system import FileSystem +from .vlm_clients import VLM, GeminiVLM, VertexAIVLM +from typing import Optional from .element_types import ( get_visual_elements_as_str, get_non_visual_elements_as_str, @@ -133,7 +135,7 @@ def save_single_image(args): logger.info(f"Converted total {len(all_image_paths)} pages to images", extra=base_extra) return all_image_paths -def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: int, vlm_config: VLMConfig, element_types: list[ElementType]) -> tuple[int, list[Element]]: +def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: int, vlm_config: VLMConfig, element_types: list[ElementType], vlm_client: Optional[VLM] = None, vlm_fallback_client: Optional[VLM] = None) -> tuple[int, list[Element]]: """ Given an image of a page, use LLM to extract the content of the page. This function includes retry logic and a fallback model mechanism. @@ -161,6 +163,31 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in fallback_provider = vlm_config.get("fallback_provider") fallback_model = vlm_config.get("fallback_model") + # Determine primary and fallback clients + primary_client: Optional[VLM] = vlm_client + fallback_client: Optional[VLM] = vlm_fallback_client + + if primary_client is None: + if primary_provider == "gemini": + primary_client = GeminiVLM(model=primary_model) + elif primary_provider == "vertex_ai": + project_id = vlm_config.get("project_id") + location = vlm_config.get("location") + if project_id and location: + primary_client = VertexAIVLM(model=primary_model, project_id=project_id, location=location) + else: + primary_client = None # fall back to legacy path if insufficient config + # else: invalid provider handled later in loop + + if fallback_client is None and fallback_provider and fallback_model: + if fallback_provider == "gemini": + fallback_client = GeminiVLM(model=fallback_model) + elif fallback_provider == "vertex_ai": + project_id_fb = vlm_config.get("project_id") + location_fb = vlm_config.get("location") + if project_id_fb and location_fb: + fallback_client = VertexAIVLM(model=fallback_model, project_id=project_id_fb, location=location_fb) + while tries < max_retries: # Determine which model to use for this attempt current_vlm_config = vlm_config.copy() @@ -194,7 +221,34 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in llm_output = None rate_limited = False - if current_vlm_config["provider"] == "vertex_ai": + # Choose instance client if available based on attempt + use_fallback = attempt_number > 3 and (fallback_provider and fallback_model) + current_client = None + if use_fallback and fallback_client is not None: + current_client = fallback_client + logger.debug("Using instance-based VLM fallback client", extra=base_extra) + elif primary_client is not None: + current_client = primary_client + logger.debug("Using instance-based VLM primary client", extra=base_extra) + + if current_client is not None: + try: + temperature = current_vlm_config.get("temperature", 0.5) + llm_output = current_client.make_llm_call( + image_path=page_image_path, + system_message=system_message, + response_schema=response_schema, + max_tokens=current_vlm_config.get("max_tokens", 4000), + temperature=temperature, + ) + except Exception as e: + if "429" in str(e): + logger.warning(f"Rate limit exceeded with instance VLM: {e}", extra=base_extra) + rate_limited = True + else: + logger.error(f"Error with instance VLM: {e}", extra=base_extra) + llm_output = "" + elif current_vlm_config["provider"] == "vertex_ai": try: temperature = current_vlm_config.get("temperature", 0.5) llm_output = make_llm_call_vertex( @@ -272,7 +326,7 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in return page_number, [{"type": "NarrativeText", "content": "Failed to process page after multiple attempts", "page_number": page_number}] -def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, file_system: FileSystem) -> list[Element]: +def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, file_system: FileSystem, vlm_client: Optional[VLM] = None, vlm_fallback_client: Optional[VLM] = None) -> list[Element]: """ Given a PDF file, extract the content of each page using a VLM model. @@ -320,7 +374,9 @@ def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, fi file_system, i + 1, vlm_config, - element_types + element_types, + vlm_client, + vlm_fallback_client, ): i for i in range(len(image_file_paths)) } diff --git a/dsrag/dsparse/main.py b/dsrag/dsparse/main.py index 253da808..f510d1df 100644 --- a/dsrag/dsparse/main.py +++ b/dsrag/dsparse/main.py @@ -12,12 +12,14 @@ from .models.types import FileParsingConfig, VLMConfig, SemanticSectioningConfig, ChunkingConfig, Section, Chunk from .file_parsing.file_system import FileSystem, LocalFileSystem -from typing import List, Tuple +from typing import List, Tuple, Optional import json # Get the dsparse logger logger = logging.getLogger("dsrag.dsparse") +from .file_parsing.vlm_clients import VLM + def parse_and_chunk( kb_id: str, doc_id: str, @@ -27,6 +29,8 @@ def parse_and_chunk( file_system: FileSystem = {}, file_path: str = None, text: str = None, + vlm_client: Optional[VLM] = None, + vlm_fallback_client: Optional[VLM] = None, ) -> Tuple[List[Section], List[Chunk]]: """ Inputs @@ -124,6 +128,8 @@ def parse_and_chunk( vlm_config=vlm_config, semantic_sectioning_config=semantic_sectioning_config, chunking_config=chunking_config, + vlm_client=vlm_client, + vlm_fallback_client=vlm_fallback_client, ) duration = time.perf_counter() - start_time @@ -205,6 +211,8 @@ def parse_and_chunk( def parse_and_chunk_vlm( file_path: str, kb_id: str, doc_id: str, file_system: FileSystem, vlm_config: VLMConfig, semantic_sectioning_config: SemanticSectioningConfig, chunking_config: ChunkingConfig, + vlm_client: Optional[VLM] = None, + vlm_fallback_client: Optional[VLM] = None, testing_mode: bool = False) -> Tuple[List[Section], List[Chunk]]: # Create base logging context @@ -220,6 +228,8 @@ def parse_and_chunk_vlm( doc_id=doc_id, vlm_config=vlm_config, file_system=file_system, + vlm_client=vlm_client, + vlm_fallback_client=vlm_fallback_client, ) parse_duration = time.perf_counter() - parse_start_time diff --git a/dsrag/dsparse/models/types.py b/dsrag/dsparse/models/types.py index 4efee3be..74d54796 100644 --- a/dsrag/dsparse/models/types.py +++ b/dsrag/dsparse/models/types.py @@ -57,4 +57,7 @@ class ChunkingConfig(TypedDict): class FileParsingConfig(TypedDict): use_vlm: Optional[bool] vlm_config: Optional[VLMConfig] - always_save_page_images: Optional[bool] \ No newline at end of file + always_save_page_images: Optional[bool] + # Optional serialized VLM clients for first-class instance usage + vlm: Optional[dict] + vlm_fallback: Optional[dict] \ No newline at end of file diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py index 44364110..5ea28924 100644 --- a/dsrag/knowledge_base.py +++ b/dsrag/knowledge_base.py @@ -30,6 +30,7 @@ from dsrag.dsparse.file_parsing.file_system import FileSystem, LocalFileSystem from dsrag.metadata import MetadataStorage, LocalMetadataStorage from dsrag.chat.citations import convert_elements_to_page_content +from dsrag.dsparse.file_parsing.vlm_clients import VLM class KnowledgeBase: def __init__( @@ -48,7 +49,8 @@ def __init__( file_system: Optional[FileSystem] = None, exists_ok: bool = True, save_metadata_to_disk: bool = True, - metadata_storage: Optional[MetadataStorage] = None + metadata_storage: Optional[MetadataStorage] = None, + vlm_client: Optional[VLM] = None, ): """Initialize a KnowledgeBase instance. @@ -87,7 +89,7 @@ def __init__( # load the KB if it exists; otherwise, initialize it and save it to disk if self.metadata_storage.kb_exists(self.kb_id) and exists_ok: self._load( - auto_context_model, reranker, file_system, chunk_db, vector_db + auto_context_model, reranker, file_system, chunk_db, vector_db, vlm_client ) self._save() elif self.metadata_storage.kb_exists(self.kb_id) and not exists_ok: @@ -104,7 +106,7 @@ def __init__( "created_on": created_time, } self._initialize_components( - embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system + embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system, vlm_client ) self._save() # save the config for the KB to disk else: @@ -115,7 +117,7 @@ def __init__( "supp_id": supp_id, } self._initialize_components( - embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system + embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system, vlm_client ) def _get_metadata_path(self): @@ -134,6 +136,7 @@ def _initialize_components( vector_db: Optional[VectorDB], chunk_db: Optional[ChunkDB], file_system: Optional[FileSystem], + vlm_client: Optional[VLM] = None, ): """Initialize the knowledge base components. @@ -153,6 +156,9 @@ def _initialize_components( chunk_db if chunk_db else BasicChunkDB(self.kb_id, self.storage_directory) ) self.file_system = file_system if file_system else LocalFileSystem(base_path=os.path.join(self.storage_directory, "page_images")) + # Store optional VLM client for first-class use + self.vlm_client = vlm_client + self.vector_dimension = self.embedding_model.dimension def _save(self): @@ -168,13 +174,15 @@ def _save(self): "vector_db": self.vector_db.to_dict(), "chunk_db": self.chunk_db.to_dict(), "file_system": self.file_system.to_dict(), + # Persist VLM client if present + "vlm_client": (self.vlm_client.to_dict() if getattr(self, "vlm_client", None) else None), } # Combine metadata and components full_data = {**self.kb_metadata, "components": components} self.metadata_storage.save(full_data, self.kb_id) - def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_db=None, vector_db=None): + def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_db=None, vector_db=None, vlm_client: Optional[VLM] = None): """Load a knowledge base configuration from disk. Internal method to deserialize components and metadata. @@ -237,6 +245,17 @@ def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_ # If the file system does not exist and is not provided, default to LocalFileSystem self.file_system = LocalFileSystem(base_path=self.storage_directory) + # Load optional VLM client + stored_vlm_dict = components.get("vlm_client") + if vlm_client is not None: + logging.warning(f"Overriding stored vlm_client for KB '{self.kb_id}' during load.", extra=base_extra) + self.vlm_client = vlm_client + elif stored_vlm_dict: + # Use VLM factory to reconstruct + self.vlm_client = VLM.from_dict(stored_vlm_dict) + else: + self.vlm_client = None + self.vector_dimension = self.embedding_model.dimension def delete(self): @@ -440,6 +459,23 @@ def add_document( # --- Parsing and Chunking Step --- step_start_time = time.perf_counter() + # Resolve VLM clients precedence: config serialized > instance on KB > legacy dict + resolved_vlm_client = None + resolved_vlm_fallback_client = None + try: + vlm_serialized = file_parsing_config.get("vlm") if isinstance(file_parsing_config, dict) else None + vlm_fallback_serialized = file_parsing_config.get("vlm_fallback") if isinstance(file_parsing_config, dict) else None + if vlm_serialized: + resolved_vlm_client = VLM.from_dict(vlm_serialized) + elif getattr(self, "vlm_client", None) is not None: + resolved_vlm_client = self.vlm_client + if vlm_fallback_serialized: + resolved_vlm_fallback_client = VLM.from_dict(vlm_fallback_serialized) + except Exception as e: + ingestion_logger.warning("Failed to deserialize VLM clients from file_parsing_config; falling back to legacy config", extra={**base_extra, "error": str(e)}) + resolved_vlm_client = getattr(self, "vlm_client", None) + resolved_vlm_fallback_client = None + sections, chunks = parse_and_chunk( kb_id=self.kb_id, doc_id=doc_id, @@ -449,6 +485,8 @@ def add_document( semantic_sectioning_config=semantic_sectioning_config, chunking_config=chunking_config, file_system=self.file_system, + vlm_client=resolved_vlm_client, + vlm_fallback_client=resolved_vlm_fallback_client, ) step_duration = time.perf_counter() - step_start_time ingestion_logger.debug("Parsing and Chunking complete", extra={ @@ -519,8 +557,8 @@ def add_document( "chunk_db": self.chunk_db.__class__.__name__ }) - # Convert elements to page content if the document was processed with page numbers - if file_path and file_parsing_config.get('use_vlm', False): + # Convert elements to page content if elements are present (works for both VLM paths) + if file_path: try: step_start_time = time.perf_counter() elements = self.file_system.load_data(kb_id=self.kb_id, doc_id=doc_id, data_name="elements") From f292144f99af6463836e67957fbb6df407cadf80 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 12:44:03 -0700 Subject: [PATCH 05/11] Task #428: Polish VLM integration: typing, error UX, tests, and docs Finalize VLM ABC integration by aligning types, improving error messages, adding missing tests, and documenting usage. --- README.md | 51 +++++++++ dsrag/dsparse/file_parsing/vlm_clients.py | 7 +- dsrag/dsparse/models/types.py | 3 + .../test_add_document_with_serialized_vlm.py | 102 ++++++++++++++++++ .../tests/unit/test_vlm_config_alignment.py | 51 +++++++++ .../tests/unit/test_vlm_metadata_roundtrip.py | 51 +++++++++ 6 files changed, 263 insertions(+), 2 deletions(-) create mode 100644 dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py create mode 100644 dsrag/dsparse/tests/unit/test_vlm_config_alignment.py create mode 100644 dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py diff --git a/README.md b/README.md index cbf8a65c..3526fc62 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,57 @@ file_parsing_config - save_path: the path to save intermediate files created during VLM processing - exclude_elements: a list of element types to exclude from the parsed text. Default is ["Header", "Footer"]. +VLM class-based clients and fallback +- You can pass a first-class VLM client instance to KnowledgeBase for default usage: + +```python +from dsrag.knowledge_base import KnowledgeBase +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM + +kb = KnowledgeBase( + kb_id="my_kb", + vlm_client=GeminiVLM(model="gemini-2.0-flash"), # used by default for VLM parsing +) +``` + +- You can also override the VLM on a per-document basis by passing a serialized client via file_parsing_config["vlm"]. This is useful when you want different models per document: + +```python +vlm_override = GeminiVLM(model="gemini-2.0-flash").to_dict() +kb.add_document( + doc_id="doc1", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": vlm_override, # per-document VLM client + "vlm_config": {"images_already_exist": False}, + }, + auto_context_config={ + "use_generated_title": False, + "get_document_summary": False, + "get_section_summaries": False, + }, +) +``` + +- Fallback configuration: you can provide a serialized fallback client via file_parsing_config["vlm_fallback"]. The system will alternate between primary and fallback after the first few retries when needed. Legacy fallback using vlm_config["fallback_provider"/"fallback_model"] is also supported. + +```python +primary = GeminiVLM(model="gemini-2.0-flash").to_dict() +fallback = GeminiVLM(model="gemini-2.5-flash").to_dict() + +kb.add_document( + doc_id="doc2", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": primary, + "vlm_fallback": fallback, + "vlm_config": {"max_pages": 10}, + }, +) +``` + semantic_sectioning_config - llm_provider: the LLM provider to use for semantic sectioning - "openai", "anthropic", and "gemini" are supported - model: the LLM model to use for semantic sectioning (e.g., "gpt-4.1-mini", "claude-3-5-haiku-latest", "gemini-2.0-flash") diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py index e628838a..c41dc9a7 100644 --- a/dsrag/dsparse/file_parsing/vlm_clients.py +++ b/dsrag/dsparse/file_parsing/vlm_clients.py @@ -103,8 +103,11 @@ def make_llm_call( import PIL.Image # used only when this method is executed import io - # Create client using lazy loader - client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"]) # type: ignore[attr-defined] + # Create client using lazy loader with explicit API key check + api_key = os.environ.get("GEMINI_API_KEY") + if not api_key: + raise ValueError("GEMINI_API_KEY not set; required for GeminiVLM") + client = genai_new.Client(api_key=api_key) # type: ignore[attr-defined] # Base generation config config = genai_new.types.GenerateContentConfig( # type: ignore[attr-defined] diff --git a/dsrag/dsparse/models/types.py b/dsrag/dsparse/models/types.py index 74d54796..0a4631b4 100644 --- a/dsrag/dsparse/models/types.py +++ b/dsrag/dsparse/models/types.py @@ -42,6 +42,9 @@ class VLMConfig(TypedDict): exclude_elements: Optional[list[str]] element_types: Optional[list[ElementType]] max_workers: Optional[int] + # Additional optional configuration used by VLM parsing + dpi: Optional[int] + vlm_max_concurrent_requests: Optional[int] class SemanticSectioningConfig(TypedDict): diff --git a/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py b/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py new file mode 100644 index 00000000..14c3ec68 --- /dev/null +++ b/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py @@ -0,0 +1,102 @@ +import os +import sys +import shutil +import tempfile +import unittest +from glob import glob + +# Ensure project root is on sys.path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) + +from dsrag.knowledge_base import KnowledgeBase +from dsrag.dsparse.file_parsing.file_system import LocalFileSystem +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM +from dsrag.reranker import NoReranker +from dsrag.embedding import Embedding +import numpy as np + + +class DummyEmbedding(Embedding): + """Minimal embedding implementation for tests (no network).""" + def __init__(self, dimension: int = 8) -> None: + self.dimension = dimension + + def get_embeddings(self, text: list[str], input_type: str = ""): + # Return deterministic zero vectors of the configured dimension + return [np.zeros(self.dimension, dtype=np.float32) for _ in text] + + def to_dict(self): + # Provide a minimal serializable dict (not used for from_dict in this test) + return {"subclass_name": "DummyEmbedding", "dimension": self.dimension} + + +class TestAddDocumentWithSerializedVLM(unittest.TestCase): + @unittest.skipIf('GEMINI_API_KEY' not in os.environ, "GEMINI_API_KEY not found in environment") + def test_add_document_with_serialized_vlm_and_images_already_exist(self): + temp_dir = tempfile.mkdtemp(prefix="dsrag_kb_add_doc_") + try: + kb_id = "kb_serialized_vlm" + doc_id = "doc_serialized_vlm" + + # Set up file system and pre-create images under kb/doc + fs = LocalFileSystem(base_path=os.path.join(temp_dir, "page_images")) + fs.create_directory(kb_id, doc_id) + + # Copy an existing test image into the kb/doc as page_1.jpg + src_img = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../tests/data/page_7.jpg")) + dst_img = os.path.join(temp_dir, "page_images", kb_id, doc_id, "page_1.jpg") + shutil.copyfile(src_img, dst_img) + + # Build KB with dummy embedding and no reranker network + kb = KnowledgeBase( + kb_id=kb_id, + storage_directory=temp_dir, + embedding_model=DummyEmbedding(dimension=8), + reranker=NoReranker(), + file_system=fs, + ) + + # Serialized VLM client for per-document override + serialized_vlm = GeminiVLM(model="gemini-2.0-flash").to_dict() + + # Disable AutoContext LLM calls by not generating titles/summaries + auto_context_config = { + "use_generated_title": False, + "get_document_summary": False, + "get_section_summaries": False, + } + + # Perform add_document using existing images and serialized VLM + kb.add_document( + doc_id=doc_id, + file_path="dummy.pdf", # not used when images_already_exist=True + document_title="Test Doc", + auto_context_config=auto_context_config, + file_parsing_config={ + "use_vlm": True, + "vlm_config": {"images_already_exist": True}, + "vlm": serialized_vlm, + }, + semantic_sectioning_config={ + "llm_provider": "openai", # value not used when auto_context disabled + "model": "gpt-4o-mini", + }, + chunking_config={}, + ) + + # Assert elements.json exists + elements_path = os.path.join(temp_dir, "page_images", kb_id, doc_id, "elements.json") + self.assertTrue(os.path.exists(elements_path)) + + # Assert page content jsons were created (convert_elements_to_page_content) + page_content_files = glob(os.path.join(temp_dir, "page_images", kb_id, doc_id, "page_content_*.json")) + self.assertTrue(len(page_content_files) >= 1) + finally: + try: + shutil.rmtree(temp_dir) + except Exception: + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py b/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py new file mode 100644 index 00000000..3f695930 --- /dev/null +++ b/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py @@ -0,0 +1,51 @@ +import os +import sys +import shutil +import tempfile +import unittest + +# Ensure project root is on sys.path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) + +from dsrag.dsparse.file_parsing.vlm_file_parsing import parse_file +from dsrag.dsparse.file_parsing.file_system import LocalFileSystem + + +class TestVLMConfigTypedDictAlignment(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.mkdtemp(prefix="dsparse_vlm_cfg_") + self.kb_id = "kb_vlm_cfg" + self.doc_id = "doc_vlm_cfg" + self.fs = LocalFileSystem(base_path=self.temp_dir) + # Ensure directory exists but contains no images + self.fs.create_directory(self.kb_id, self.doc_id) + + def tearDown(self): + try: + shutil.rmtree(self.temp_dir) + except Exception: + pass + + def test_vlm_config_typedict_alignment(self): + # Provide dpi and vlm_max_concurrent_requests; zero images and images_already_exist=True + vlm_config = { + "images_already_exist": True, + "dpi": 120, + "vlm_max_concurrent_requests": 2, + } + # pdf_path can be empty when images_already_exist is True + elements = parse_file( + pdf_path="", + kb_id=self.kb_id, + doc_id=self.doc_id, + vlm_config=vlm_config, # type: ignore[arg-type] + file_system=self.fs, + ) + # Should return an empty list and write elements.json + self.assertIsInstance(elements, list) + self.assertEqual(len(elements), 0) + self.assertTrue(os.path.exists(os.path.join(self.temp_dir, self.kb_id, self.doc_id, "elements.json"))) + + +if __name__ == "__main__": + unittest.main() diff --git a/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py b/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py new file mode 100644 index 00000000..644c9cd9 --- /dev/null +++ b/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py @@ -0,0 +1,51 @@ +import os +import sys +import shutil +import tempfile +import unittest + +# Ensure project root is on sys.path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) + +from dsrag.knowledge_base import KnowledgeBase +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM +from dsrag.reranker import NoReranker + + +class TestVLMMetaRoundTrip(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.mkdtemp(prefix="dsrag_kb_test_") + self.kb_id = "kb_vlm_meta_rt" + + def tearDown(self): + try: + shutil.rmtree(self.temp_dir) + except Exception: + pass + + def test_kb_persists_vlm_client_in_metadata(self): + # Create KB with a GeminiVLM client set + kb = KnowledgeBase( + kb_id=self.kb_id, + storage_directory=self.temp_dir, + vlm_client=GeminiVLM(model="gemini-2.0-flash"), + reranker=NoReranker(), + ) + # Force a save and then reload + kb._save() + + # Re-initialize KB - should load from metadata + kb2 = KnowledgeBase( + kb_id=self.kb_id, + storage_directory=self.temp_dir, + exists_ok=True, + ) + self.assertIsNotNone(kb2.vlm_client) + # Ensure correct type + model preserved + from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM as _GeminiVLM + self.assertIsInstance(kb2.vlm_client, _GeminiVLM) + self.assertEqual(kb2.vlm_client.model, "gemini-2.0-flash") + + +if __name__ == "__main__": + unittest.main() From 847308148fa659d1020b72fcc1bf950abaebec6d Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:11:03 -0700 Subject: [PATCH 06/11] README update --- dsrag/dsparse/README.md | 2 +- dsrag/knowledge_base.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md index 89bcfb34..75623ae2 100644 --- a/dsrag/dsparse/README.md +++ b/dsrag/dsparse/README.md @@ -97,4 +97,4 @@ Semantic sectioning cost calculation (`gpt-4o-mini`) - Output: 50 tokens x $0.60/10^6 per token = $0.00003 - Total: $0.00015/page or **$0.15 per 1000 pages** -Document text is processed in ~5000 token mega-chunks, which is roughly ten pages on average. But these mega-chunks have to be processed sequentially for each document. Processing each mega-chunk only takes a couple seconds, though, so even a large document of a few hundred pages will only take 20-60 seconds. Rate limits for the OpenAI API are heavily dependent on the usage tier you're in. +Document text is processed in ~5000 token mega-chunks, which is roughly ten pages on average. These mega-chunks are processed in parallel for each document. Processing each mega-chunk only takes a few seconds, though, so even a large document of a few hundred pages should only take 5-10 seconds. Rate limits for the OpenAI API are heavily dependent on the usage tier you're in. diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py index 5ea28924..c2f65959 100644 --- a/dsrag/knowledge_base.py +++ b/dsrag/knowledge_base.py @@ -459,6 +459,7 @@ def add_document( # --- Parsing and Chunking Step --- step_start_time = time.perf_counter() + # Resolve VLM clients precedence: config serialized > instance on KB > legacy dict resolved_vlm_client = None resolved_vlm_fallback_client = None From d8b82c9a8c764cc8a6391098a54bf1837fa329f7 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:17:05 -0700 Subject: [PATCH 07/11] update VLM pricing calculation in README --- dsrag/dsparse/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md index 75623ae2..6f062cd5 100644 --- a/dsrag/dsparse/README.md +++ b/dsrag/dsparse/README.md @@ -81,13 +81,14 @@ The default model for semantic sectioning is `gpt-4o-mini`, but similar or stron An obvious concern with using a VLM to parse documents is the cost. Let's run the numbers: VLM file parsing cost calculation (`gemini-2.0-flash`) -- Text input (prompt) + image input: 400 (text) + 258 (image) tokens x $0.10/10^6 per token = $0.000066 -- Text output: 600 tokens x $0.40/10^6 per token = $0.000240 -- Total: $0.000306/page or **$0.31 per 1000 pages** +- Input tokens for images are calculated based on the number of 768x768 tiles needed. At the standard dpi of 100 (or even up to around 150), this usually means 4 tiles. Each tile is counted as 258 tokens. +- Text input (prompt) + image input: 500 (text) + 4x258 (image) tokens x $0.10/10^6 per token = $0.0001532 +- Text output: 700 tokens x $0.40/10^6 per token = $0.0002800 +- Total: $0.0004332/page or **$0.43 per 1000 pages** -This is substantially cheaper than commercially available OCR/PDF parsing services. Unstructured and Azure Document Intelligence, for example, both cost $10 per 1000 pages. +This is substantially cheaper than commercially available OCR/PDF parsing services. Unstructured and Azure Document Intelligence, for example, both cost $10 per 1000 pages. Reducto is generally $10-20 per 1000 pages. -What about latency and throughput? Since each page is processed independently, this is a highly parallelizable problem. The main limiting factor then is the rate limits imposed by the VLM provider. The current rate limit for `gemini-2.0-flash` is 2000 requests per minute. Since dsParse uses one request per page, that means the limit is 2000 pages per minute. Processing a single page takes around 15-20 seconds, so that's the minimum latency for processing a document. +What about latency and throughput? Since each page is processed independently, this is a highly parallelizable problem. The main limiting factor then is the rate limits imposed by the VLM provider. The current rate limit for `gemini-2.0-flash` on the highest tier is 30k requests per minute. Since dsParse uses one request per page, that means the limit is 30k pages per minute. Processing a single page takes around 15-20 seconds, so that's the minimum latency for processing a document. ### Semantic sectioning Semantic sectioning produces far fewer output tokens, so it ends up being a bit cheaper than the file parsing step. From da65ec3665b2ee06dbe28feb088d9e88f4c9254b Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:24:59 -0700 Subject: [PATCH 08/11] Task #429: Add integration test for serialized VLM client in VLM file parsing Add a real-API, env-gated test to validate the new VLM client (serialized) path in parse_and_chunk. --- .../integration/test_vlm_file_parsing.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py b/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py index 81a060b9..eedffb21 100644 --- a/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py +++ b/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py @@ -111,6 +111,46 @@ def test_non_vlm_file_parsing(self): self.assertTrue(len(sections[0]["title"]) > 0) self.assertTrue(len(sections[0]["content"]) > 0) + def test__parse_and_chunk_vlm_with_serialized_client(self): + import os as _os + if 'GEMINI_API_KEY' not in _os.environ: + self.skipTest("GEMINI_API_KEY not found in environment") + + from dsparse.file_parsing.vlm_clients import GeminiVLM + + file_parsing_config = { + "use_vlm": True, + "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(), + "always_save_page_images": True, + } + sections, chunks = parse_and_chunk( + kb_id=self.kb_id, + # use a distinct doc_id to avoid collisions with other tests + doc_id=f"{self.doc_id}_vlm_client", + file_path=self.test_data_path, + file_parsing_config=file_parsing_config, + semantic_sectioning_config={ + "llm_provider": "openai", + "model": "gpt-4o-mini", + "language": "en", + }, + chunking_config={}, + file_system=self.file_system, + ) + + # Assertions mirror existing tests + self.assertTrue(len(sections) > 0) + self.assertTrue(len(chunks) > 0) + for key, expected_type in Section.__annotations__.items(): + self.assertIsInstance(sections[0][key], expected_type) + for key, expected_type in Chunk.__annotations__.items(): + self.assertIsInstance(chunks[0][key], expected_type) + + # elements.json should be emitted for the document + self.assertTrue(os.path.exists( + os.path.join(self.save_path, self.kb_id, f"{self.doc_id}_vlm_client", "elements.json") + )) + @classmethod def tearDownClass(self): From e7feafc21f7ee047bbaf9cd95814bf3d4d3a5e94 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:35:36 -0700 Subject: [PATCH 09/11] Task #430: Add top-level integration test for serialized VLM client Add a real-API, env-gated test in the top-level integration suite to validate the serialized VLM client path. --- tests/integration/test_vlm_file_parsing.py | 63 ++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/integration/test_vlm_file_parsing.py b/tests/integration/test_vlm_file_parsing.py index fd98b3b7..568fe248 100644 --- a/tests/integration/test_vlm_file_parsing.py +++ b/tests/integration/test_vlm_file_parsing.py @@ -5,6 +5,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) from dsrag.dsparse.file_parsing.file_system import LocalFileSystem from dsrag.knowledge_base import KnowledgeBase +from dsrag.dsparse.main import parse_and_chunk +from dsrag.dsparse.models.types import Section, Chunk class TestRetrieval(unittest.TestCase): @@ -271,5 +273,66 @@ def cleanup(self): os.system("rm -rf ~/dsrag_test_mck_energy") +class TestVLMFileParsing(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Shared fixtures for VLM parse_and_chunk test + cls.kb_id = "mck_energy_test" + cls.doc_id = "mck_energy_report" + + # Save path and file system + cls.save_path = os.path.expanduser("~/dsrag_test_mck_energy") + cls.file_system = LocalFileSystem(base_path=cls.save_path) + + # Test data path (PDF) + cls.test_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../data/mck_energy_first_5_pages.pdf") + ) + + def test__parse_and_chunk_vlm_with_serialized_client(self): + import os as _os + if 'GEMINI_API_KEY' not in _os.environ: + self.skipTest("GEMINI_API_KEY not found in environment") + + # Use serialized VLM client config + from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM, VLM + + client_dict = GeminiVLM(model="gemini-2.0-flash").to_dict() + + file_parsing_config = { + "use_vlm": True, + "vlm": client_dict, + "always_save_page_images": True, + } + + sections, chunks = parse_and_chunk( + kb_id=self.kb_id, + # Use a distinct doc_id to avoid collisions with other tests + doc_id=f"{self.doc_id}_vlm_client", + file_path=self.test_data_path, + file_parsing_config=file_parsing_config, + semantic_sectioning_config={ + "use_semantic_sectioning": False, + }, + chunking_config={}, + file_system=self.file_system, + vlm_client=VLM.from_dict(client_dict), + ) + + # Assertions mirror existing patterns + self.assertTrue(len(sections) > 0) + self.assertTrue(len(chunks) > 0) + # Validate structure of first section and chunk + for key, expected_type in Section.__annotations__.items(): + self.assertIsInstance(sections[0][key], expected_type) + for key, expected_type in Chunk.__annotations__.items(): + self.assertIsInstance(chunks[0][key], expected_type) + + # elements.json should be emitted for the document + self.assertTrue(os.path.exists( + os.path.join(self.save_path, self.kb_id, f"{self.doc_id}_vlm_client", "elements.json") + )) + + if __name__ == "__main__": unittest.main() \ No newline at end of file From 4446c44cc045dab163be413d45f3016d79780ec4 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:43:19 -0700 Subject: [PATCH 10/11] Task #431: Update READMEs for VLM refactor (class-based VLM, fallback, and back-compat) Revise top-level README.md and dsrag/dsparse/README.md to document the new VLM ABC usage, fallback configuration, and backward compatibility with legacy dict configs. --- README.md | 61 +++++++++++++++++++++++++++++ dsrag/dsparse/README.md | 85 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/README.md b/README.md index 3526fc62..15e2b927 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,67 @@ For the `S3FileSystem`, the following parameters are needed: The `base_path` is used when downloading files from S3. The files have to be stored locally in order to be used in the retrieval system. +#### VLM Clients (new) +Visual Language Models (VLMs) now follow the same class-based abstraction pattern (ABC) as LLM, Embedding, and Reranker components. You can supply a first-class VLM client instance to the KnowledgeBase, or override per document by passing a serialized client. Backward compatibility is maintained: legacy provider/model dictionaries in `file_parsing_config['vlm_config']` still work. + +- Class-based usage (KB default): +```python +from dsrag.knowledge_base import KnowledgeBase +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM + +kb = KnowledgeBase( + kb_id="my_kb", + vlm_client=GeminiVLM(model="gemini-2.0-flash"), # default VLM used for VLM parsing +) +``` + +- Per-document override (serialized client): +```python +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM + +kb.add_document( + doc_id="doc1", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(), # per-document override + "vlm_config": {"max_pages": 10}, + }, + auto_context_config={ + "use_generated_title": False, + "get_document_summary": False, + "get_section_summaries": False, + }, +) +``` + +- Fallback configuration (preferred, class-based): +```python +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM + +primary = GeminiVLM(model="gemini-2.0-flash").to_dict() +fallback = GeminiVLM(model="gemini-2.5-flash").to_dict() + +kb.add_document( + doc_id="doc2", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": primary, + "vlm_fallback": fallback, + "vlm_config": {"max_pages": 10}, + }, +) +``` +Legacy dict-based fallback remains supported via `vlm_config["fallback_provider"/"fallback_model"]`. + +- Backward compatibility and precedence + - The legacy dict path (e.g., `provider`, `model`, `images_already_exist`, etc.) continues to work. + - When both a serialized client (`vlm`) and a legacy `provider`/`model` are supplied, the system prefers `vlm`/`vlm_fallback`. + +- Environment variables + - `GEMINI_API_KEY` is required for `GeminiVLM`; a clear error is raised if missing. + ## Config dictionaries Since there are a lot of configuration parameters available, they're organized into a few config dictionaries. There are four config dictionaries that can be passed in to `add_document` (`auto_context_config`, `file_parsing_config`, `semantic_sectioning_config`, and `chunking_config`) and one that can be passed in to `query` (`rse_params`). diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md index 6f062cd5..f013ad0b 100644 --- a/dsrag/dsparse/README.md +++ b/dsrag/dsparse/README.md @@ -34,6 +34,91 @@ kb.add_document( ) ``` +## VLM clients +VLMs now support a class-based client abstraction (similar to LLM/Embedding/Reranker) that you can pass either at the KB level or per document. Legacy dict-based `vlm_config` remains fully supported. + +- Quickstart with class-based client (serialized) and LocalFileSystem +```python +from dsrag.dsparse.main import parse_and_chunk +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM +from dsrag.dsparse.file_parsing.file_system import LocalFileSystem + +sections, chunks = parse_and_chunk( + kb_id="sample_kb", + doc_id="sample_doc", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(), + "vlm_config": {"max_pages": 5, "vlm_max_concurrent_requests": 2}, + }, + file_system=LocalFileSystem(base_path="~/dsParse"), +) +``` + +- Fallback (preferred, class-based): +```python +from dsrag.dsparse.main import parse_and_chunk +from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM + +primary = GeminiVLM(model="gemini-2.0-flash").to_dict() +fallback = GeminiVLM(model="gemini-2.5-flash").to_dict() + +sections, chunks = parse_and_chunk( + kb_id="kb", + doc_id="doc", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm": primary, + "vlm_fallback": fallback, + "vlm_config": {"max_pages": 5}, + }, +) +``` + +- Legacy path (still valid): +```python +from dsrag.dsparse.main import parse_and_chunk + +sections, chunks = parse_and_chunk( + kb_id="kb", + doc_id="doc", + file_path="/path/to/file.pdf", + file_parsing_config={ + "use_vlm": True, + "vlm_config": { + "provider": "gemini", + "model": "gemini-2.0-flash", + "max_pages": 5, + # Optional legacy fallback + "fallback_provider": "gemini", + "fallback_model": "gemini-2.5-flash", + }, + }, +) +``` + +- Images already exist +If you’ve pre-extracted page images into the configured FileSystem directory structure, you can reuse them: +```python +sections, chunks = parse_and_chunk( + kb_id="kb", + doc_id="doc", + file_path="/path/to/file.pdf", # path still required for metadata, but images won’t be regenerated + file_parsing_config={ + "use_vlm": True, + "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(), + "vlm_config": {"images_already_exist": True}, + }, +) +``` + +- Notes + - Parallelism controls and DPI are in `vlm_config` (e.g., `vlm_max_concurrent_requests`, `dpi`). + - Page images and `elements.json` are saved via the configured `FileSystem`. + - Environment variable `GEMINI_API_KEY` is required for `GeminiVLM`. Clear errors are raised if missing. + ## Installation If you want to use dsParse on its own, without installing the full `dsrag` package, there is a standalone Python package available for dsParse, which can be installed with `pip install dsparse`. If you already have `dsrag` installed, you DO NOT need to separately install `dsparse`. From a1f28905d0db7bd628d3f4b1806276495619dde9 Mon Sep 17 00:00:00 2001 From: Zach Date: Wed, 5 Nov 2025 13:49:32 -0700 Subject: [PATCH 11/11] KnowledgeBase docstring updates --- dsrag/knowledge_base.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py index c2f65959..7ad78136 100644 --- a/dsrag/knowledge_base.py +++ b/dsrag/knowledge_base.py @@ -77,6 +77,8 @@ def __init__( save_metadata_to_disk (bool, optional): Whether to persist metadata. Defaults to True. metadata_storage (Optional[MetadataStorage], optional): Storage for KB metadata. Defaults to LocalMetadataStorage. + vlm_client (Optional[VLM], optional): VLM client for parsing documents. + Defaults to None. Raises: ValueError: If KB exists and exists_ok is False. @@ -193,7 +195,7 @@ def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_ file_system (Optional[FileSystem], optional): Override stored file system. chunk_db (Optional[ChunkDB], optional): Override stored chunk database. vector_db (Optional[VectorDB], optional): Override stored vector database. - + vlm_client (Optional[VLM], optional): Override stored VLM client. Note: Only auto_context_model and reranker can safely override stored components. Other component overrides may break functionality if not compatible. @@ -344,20 +346,23 @@ def add_document( { # Whether to use VLM for parsing "use_vlm": False, + + # VLM client for parsing documents + "vlm": serialized VLM client, # VLM configuration (ignored if use_vlm is False) "vlm_config": { # VLM provider (currently only "gemini" and "vertex_ai" supported) - "provider": "vertex_ai", + "provider": "vertex_ai", - ignored if vlm is provided # The VLM model to use - "model": "model_name", + "model": "model_name", - ignored if vlm is provided # GCP project ID (required for "vertex_ai") - "project_id": "your-project-id", + "project_id": "your-project-id", - ignored if vlm is provided # GCP location (required for "vertex_ai") - "location": "us-central1", + "location": "us-central1", - ignored if vlm is provided # Path to save intermediate files "save_path": "/path/to/save",