From fa29289a912c7e3231ebea6d37e103cc2e3abeb9 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 11:38:32 -0700
Subject: [PATCH 01/11] update deprecated 2.5 Flash model name

---
 dsrag/dsparse/tests/unit/test_vlm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dsrag/dsparse/tests/unit/test_vlm.py b/dsrag/dsparse/tests/unit/test_vlm.py
index 51094604..9182c561 100644
--- a/dsrag/dsparse/tests/unit/test_vlm.py
+++ b/dsrag/dsparse/tests/unit/test_vlm.py
@@ -65,7 +65,7 @@ def test_gemini_2_5_with_simple_schema(self):
             result = make_llm_call_gemini(
                 image_path=test_image_path,
                 system_message="Describe this image in a JSON object with a 'description' field.",
-                model="gemini-2.5-flash-preview-04-17",
+                model="gemini-2.5-flash",
                 response_schema=test_schema,
                 temperature=0.2
             )
@@ -169,7 +169,7 @@ def test_gemini_2_5_with_complex_schema(self):
             result = make_llm_call_gemini(
                 image_path=test_image_path,
                 system_message=system_message,
-                model="gemini-2.5-flash-preview-04-17",
+                model="gemini-2.5-flash",
                 response_schema=complex_schema,
                 temperature=0.5
             )

From 83b4bd0dd4b59a2efdf906f4ec6bcb00bb7e2fbb Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 11:49:10 -0700
Subject: [PATCH 02/11] Task #426: Introduce VLM ABC and Provider Clients
 (Gemini, Vertex) with backward-compatible wrappers

Create a reusable VLM client abstraction and concrete implementations (Gemini, Vertex) while keeping current free functions working. No wiring changes yet.
---
 dsrag/dsparse/file_parsing/vlm.py            | 101 ++------
 dsrag/dsparse/file_parsing/vlm_clients.py    | 237 +++++++++++++++++++
 dsrag/dsparse/tests/unit/test_vlm_clients.py |  42 ++++
 3 files changed, 300 insertions(+), 80 deletions(-)
 create mode 100644 dsrag/dsparse/file_parsing/vlm_clients.py
 create mode 100644 dsrag/dsparse/tests/unit/test_vlm_clients.py

diff --git a/dsrag/dsparse/file_parsing/vlm.py b/dsrag/dsparse/file_parsing/vlm.py
index 661e68bb..33ef8bc9 100644
--- a/dsrag/dsparse/file_parsing/vlm.py
+++ b/dsrag/dsparse/file_parsing/vlm.py
@@ -1,95 +1,36 @@
 import PIL.Image
-import os
 import io
-from ..utils.imports import vertexai, genai_new
+from .vlm_clients import GeminiVLM, VertexAIVLM
 
 def make_llm_call_gemini(image_path: str, system_message: str, model: str = "gemini-2.0-flash", response_schema: dict = None, max_tokens: int = 4000, temperature: float = 0.5) -> str:
-    # With the newer Google GenAI SDK, we need to create a client
-    client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"])
+    """
+    Backward-compatible free function that delegates to GeminiVLM.
 
-    # Create generation config with the correct GenerateContentConfig type
-    config = genai_new.types.GenerateContentConfig(
+    Signature and behavior are preserved for compatibility.
+    """
+    client = GeminiVLM(model=model)
+    return client.make_llm_call(
+        image_path=image_path,
+        system_message=system_message,
+        response_schema=response_schema,
+        max_tokens=max_tokens,
         temperature=temperature,
-        max_output_tokens=max_tokens,
-        response_mime_type="application/json"
     )
 
-    # Add response schema if provided
-    if response_schema is not None:
-        config.response_schema = response_schema
-
-    try:
-        # Open and compress the image
-        image = PIL.Image.open(image_path)
-        compressed_image_bytes, _ = compress_image(image) # Quality is returned but not used here
-
-        # Close the original image object now that compression is done
-        if image:
-            image.close()
-            # The 'image' variable still exists and will be handled by the finally block,
-            # PIL's close() is typically safe to call multiple times.
-
-        # Create content parts using bytes
-        image_part = genai_new.types.Part.from_bytes(data=compressed_image_bytes, mime_type='image/jpeg')
-        content_parts = [image_part, system_message]
-
-        # For Gemini 2.5 models, disable thinking
-        if model.startswith("gemini-2.5"):
-            # Create a new config with thinking disabled by setting thinking_config
-            gemini25_config = genai_new.types.GenerateContentConfig(
-                temperature=temperature,
-                max_output_tokens=max_tokens,
-                response_mime_type="application/json",
-                thinking_config=genai_new.types.ThinkingConfig(thinking_budget=0)
-            )
-
-            # Add response schema if provided
-            if response_schema is not None:
-                gemini25_config.response_schema = response_schema
-
-            # Generate content with thinking disabled
-            response = client.models.generate_content(
-                model=model,
-                contents=content_parts,
-                config=gemini25_config
-            )
-        else:
-            # Standard call for other Gemini models
-            response = client.models.generate_content(
-                model=model,
-                contents=content_parts,
-                config=config
-            )
-
-        return response.text
-    finally:
-        # Ensure image is closed even if an error occurs
-        if 'image' in locals() and image: # Check if image was defined and not None
-            try:
-                image.close() # Attempt to close; safe if already closed
-            except Exception:
-                pass # Ignore errors if it fails (e.g., trying to close a None object or already closed and problematic)
-
 def make_llm_call_vertex(image_path: str, system_message: str, model: str, project_id: str, location: str, response_schema: dict = None, max_tokens: int = 4000, temperature: float = 0.5) -> str:
     """
-    This function calls the Vertex AI Gemini API (not to be confused with the Gemini API) with an image and a system message and returns the response text.
+    Backward-compatible free function that delegates to VertexAIVLM.
+
+    Signature and behavior are preserved for compatibility.
     """
-    vertexai.init(project=project_id, location=location)
-    model = vertexai.generative_models.GenerativeModel(model)
-    
-    if response_schema is not None:
-        generation_config = vertexai.generative_models.GenerationConfig(temperature=temperature, max_output_tokens=max_tokens, response_mime_type="application/json", response_schema=response_schema)
-    else:
-        generation_config = vertexai.generative_models.GenerationConfig(temperature=temperature, max_output_tokens=max_tokens)
-    
-    response = model.generate_content(
-        [
-            vertexai.generative_models.Part.from_image(vertexai.generative_models.Image.load_from_file(image_path)),
-            system_message,
-        ],
-        generation_config=generation_config,
+    client = VertexAIVLM(model=model, project_id=project_id, location=location)
+    return client.make_llm_call(
+        image_path=image_path,
+        system_message=system_message,
+        response_schema=response_schema,
+        max_tokens=max_tokens,
+        temperature=temperature,
     )
-    return response.text
 
 def compress_image(image: PIL.Image.Image, max_size_bytes: int = 1097152, quality: int = 95) -> tuple[bytes, int]:
     """
diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py
new file mode 100644
index 00000000..14d8d410
--- /dev/null
+++ b/dsrag/dsparse/file_parsing/vlm_clients.py
@@ -0,0 +1,237 @@
+"""
+Visual Language Model (VLM) abstraction and concrete clients.
+
+This module introduces an abstraction similar to LLM/Embedding/Reranker to allow
+users to extend VLM providers/models via subclassing while maintaining backward
+compatibility with existing free functions in vlm.py.
+
+Design notes:
+- Subclasses are registered via __init_subclass__ for simple construction from
+  serialized config dicts.
+- The public free functions in vlm.py delegate to these clients and preserve
+  their original signatures.
+- To avoid circular imports with vlm.compress_image, GeminiVLM imports
+  compress_image inside the make_llm_call method scope.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Type
+import os
+
+from ..utils.imports import genai_new, vertexai  # lazy loaders
+
+
+class VLM(ABC):
+    """Abstract base class for Visual Language Model clients.
+
+    Subclasses should implement the make_llm_call method to perform the actual
+    provider-specific API call and return the response text.
+
+    Subclass registration
+    ---------------------
+    Each subclass is automatically registered by class name using
+    __init_subclass__. This allows simple construction from a configuration
+    dictionary via from_dict.
+    """
+
+    # Registry of subclass name -> subclass type
+    _subclasses: Dict[str, Type["VLM"]] = {}
+
+    def __init_subclass__(cls, **kwargs):  # type: ignore[override]
+        super().__init_subclass__(**kwargs)
+        # Register subclass by its class name for from_dict factory construction
+        VLM._subclasses[cls.__name__] = cls
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize this VLM instance to a dictionary.
+
+        Returns a dict containing the subclass name and public fields
+        required to reconstruct the instance with from_dict.
+        """
+        # Default implementation serializes __dict__ (public fields) plus subclass name
+        data = {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
+        data["subclass_name"] = self.__class__.__name__
+        return data
+
+    @classmethod
+    def from_dict(cls, config: Dict[str, Any]) -> "VLM":
+        """Construct a VLM instance from a serialized config dictionary.
+
+        The config must contain a "subclass_name" key identifying the
+        registered subclass. Remaining keys are forwarded to the subclass
+        constructor as keyword arguments.
+        """
+        subclass_name = config.get("subclass_name")
+        if not subclass_name:
+            raise ValueError("config must include 'subclass_name'")
+        subclass = cls._subclasses.get(subclass_name)
+        if subclass is None:
+            raise ValueError(f"Unknown VLM subclass: {subclass_name}")
+        kwargs = {k: v for k, v in config.items() if k != "subclass_name"}
+        return subclass(**kwargs)  # type: ignore[arg-type]
+
+    @abstractmethod
+    def make_llm_call(
+        self,
+        image_path: str,
+        system_message: str,
+        response_schema: Optional[Dict[str, Any]] = None,
+        max_tokens: int = 4000,
+        temperature: float = 0.5,
+    ) -> str:
+        """Perform a VLM call and return the raw response text."""
+        raise NotImplementedError
+
+
+class GeminiVLM(VLM):
+    """VLM client for Google Gemini via the new google-genai SDK.
+
+    Fields
+    ------
+    - model: Gemini model name, default "gemini-2.0-flash".
+
+    Behavior
+    --------
+    - Uses dsrag.dsparse.utils.imports.genai_new (lazy) to construct a
+      Client(api_key=os.environ["GEMINI_API_KEY"]).
+    - Builds GenerateContentConfig with response_mime_type="application/json",
+      including optional response_schema when provided.
+    - For models starting with "gemini-2.5", sets a ThinkingConfig with
+      thinking_budget=0 to disable thinking as per existing behavior.
+    - Compresses images using compress_image from vlm.py.
+    """
+
+    def __init__(self, model: str = "gemini-2.0-flash") -> None:
+        self.model = model
+
+    def make_llm_call(
+        self,
+        image_path: str,
+        system_message: str,
+        response_schema: Optional[Dict[str, Any]] = None,
+        max_tokens: int = 4000,
+        temperature: float = 0.5,
+    ) -> str:
+        # Local import to avoid circular dependency at module import time
+        from .vlm import compress_image  # noqa: WPS433 (allow local import)
+        import PIL.Image  # used only when this method is executed
+        import io
+
+        # Create client using lazy loader
+        client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"])  # type: ignore[attr-defined]
+
+        # Base generation config
+        config = genai_new.types.GenerateContentConfig(  # type: ignore[attr-defined]
+            temperature=temperature,
+            max_output_tokens=max_tokens,
+            response_mime_type="application/json",
+        )
+        if response_schema is not None:
+            config.response_schema = response_schema
+
+        image = None
+        try:
+            # Open and compress the image
+            image = PIL.Image.open(image_path)
+            compressed_image_bytes, _ = compress_image(image)
+
+            # Close the original image (safe to call multiple times)
+            if image:
+                image.close()
+
+            # Create content parts using bytes
+            image_part = genai_new.types.Part.from_bytes(  # type: ignore[attr-defined]
+                data=compressed_image_bytes,
+                mime_type="image/jpeg",
+            )
+            content_parts = [image_part, system_message]
+
+            # For Gemini 2.5 models, disable thinking
+            if self.model.startswith("gemini-2.5"):
+                gemini25_config = genai_new.types.GenerateContentConfig(  # type: ignore[attr-defined]
+                    temperature=temperature,
+                    max_output_tokens=max_tokens,
+                    response_mime_type="application/json",
+                    thinking_config=genai_new.types.ThinkingConfig(thinking_budget=0),  # type: ignore[attr-defined]
+                )
+                if response_schema is not None:
+                    gemini25_config.response_schema = response_schema
+
+                response = client.models.generate_content(  # type: ignore[attr-defined]
+                    model=self.model,
+                    contents=content_parts,
+                    config=gemini25_config,
+                )
+            else:
+                response = client.models.generate_content(  # type: ignore[attr-defined]
+                    model=self.model,
+                    contents=content_parts,
+                    config=config,
+                )
+            return response.text
+        finally:
+            if image is not None:
+                try:
+                    image.close()
+                except Exception:
+                    pass
+
+
+class VertexAIVLM(VLM):
+    """VLM client for Vertex AI Gemini API.
+
+    Fields
+    ------
+    - model: Vertex AI model name (required)
+    - project_id: GCP project id (required)
+    - location: GCP location/region (required)
+
+    Behavior
+    --------
+    - Uses dsrag.dsparse.utils.imports.vertexai (lazy) to initialize and call
+      the vertexai.generative_models API.
+    - Builds GenerationConfig with optional response_schema and returns
+      response.text
+    """
+
+    def __init__(self, model: str, project_id: str, location: str) -> None:
+        self.model = model
+        self.project_id = project_id
+        self.location = location
+
+    def make_llm_call(
+        self,
+        image_path: str,
+        system_message: str,
+        response_schema: Optional[Dict[str, Any]] = None,
+        max_tokens: int = 4000,
+        temperature: float = 0.5,
+    ) -> str:
+        # Initialize Vertex AI client
+        vertexai.init(project=self.project_id, location=self.location)  # type: ignore[attr-defined]
+        model = vertexai.generative_models.GenerativeModel(self.model)  # type: ignore[attr-defined]
+
+        if response_schema is not None:
+            generation_config = vertexai.generative_models.GenerationConfig(  # type: ignore[attr-defined]
+                temperature=temperature,
+                max_output_tokens=max_tokens,
+                response_mime_type="application/json",
+                response_schema=response_schema,
+            )
+        else:
+            generation_config = vertexai.generative_models.GenerationConfig(  # type: ignore[attr-defined]
+                temperature=temperature,
+                max_output_tokens=max_tokens,
+            )
+
+        response = model.generate_content(
+            [
+                vertexai.generative_models.Part.from_image(  # type: ignore[attr-defined]
+                    vertexai.generative_models.Image.load_from_file(image_path)  # type: ignore[attr-defined]
+                ),
+                system_message,
+            ],
+            generation_config=generation_config,
+        )
+        return response.text
diff --git a/dsrag/dsparse/tests/unit/test_vlm_clients.py b/dsrag/dsparse/tests/unit/test_vlm_clients.py
new file mode 100644
index 00000000..70833bc9
--- /dev/null
+++ b/dsrag/dsparse/tests/unit/test_vlm_clients.py
@@ -0,0 +1,42 @@
+import os
+import sys
+import unittest
+
+# Add the project root to the path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
+
+from dsrag.dsparse.file_parsing.vlm_clients import VLM, GeminiVLM, VertexAIVLM
+
+
+class TestVLMClients(unittest.TestCase):
+    def test_registry_contains_subclasses(self):
+        self.assertIn("GeminiVLM", VLM._subclasses)
+        self.assertIn("VertexAIVLM", VLM._subclasses)
+        self.assertIs(VLM._subclasses["GeminiVLM"], GeminiVLM)
+        self.assertIs(VLM._subclasses["VertexAIVLM"], VertexAIVLM)
+
+    def test_gemini_to_from_dict_roundtrip(self):
+        client = GeminiVLM()
+        as_dict = client.to_dict()
+        self.assertEqual(as_dict.get("subclass_name"), "GeminiVLM")
+        self.assertIn("model", as_dict)
+        rebuilt = VLM.from_dict(as_dict)
+        self.assertIsInstance(rebuilt, GeminiVLM)
+        self.assertEqual(rebuilt.model, client.model)
+
+    def test_vertex_to_from_dict_roundtrip(self):
+        client = VertexAIVLM(model="gemini-1.5-flash", project_id="proj", location="us-central1")
+        as_dict = client.to_dict()
+        self.assertEqual(as_dict.get("subclass_name"), "VertexAIVLM")
+        self.assertEqual(as_dict.get("model"), "gemini-1.5-flash")
+        self.assertEqual(as_dict.get("project_id"), "proj")
+        self.assertEqual(as_dict.get("location"), "us-central1")
+        rebuilt = VLM.from_dict(as_dict)
+        self.assertIsInstance(rebuilt, VertexAIVLM)
+        self.assertEqual(rebuilt.model, client.model)
+        self.assertEqual(rebuilt.project_id, client.project_id)
+        self.assertEqual(rebuilt.location, client.location)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0b6a2b5ac444c06d00964efc19a377d99affb993 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 12:10:57 -0700
Subject: [PATCH 03/11] formatting fixes

---
 dsrag/dsparse/file_parsing/vlm_clients.py | 15 ---------------
 dsrag/dsparse/tests/unit/test_vlm.py      |  1 -
 2 files changed, 16 deletions(-)

diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py
index 14d8d410..e628838a 100644
--- a/dsrag/dsparse/file_parsing/vlm_clients.py
+++ b/dsrag/dsparse/file_parsing/vlm_clients.py
@@ -1,18 +1,3 @@
-"""
-Visual Language Model (VLM) abstraction and concrete clients.
-
-This module introduces an abstraction similar to LLM/Embedding/Reranker to allow
-users to extend VLM providers/models via subclassing while maintaining backward
-compatibility with existing free functions in vlm.py.
-
-Design notes:
-- Subclasses are registered via __init_subclass__ for simple construction from
-  serialized config dicts.
-- The public free functions in vlm.py delegate to these clients and preserve
-  their original signatures.
-- To avoid circular imports with vlm.compress_image, GeminiVLM imports
-  compress_image inside the make_llm_call method scope.
-"""
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
diff --git a/dsrag/dsparse/tests/unit/test_vlm.py b/dsrag/dsparse/tests/unit/test_vlm.py
index 9182c561..ed6b23a4 100644
--- a/dsrag/dsparse/tests/unit/test_vlm.py
+++ b/dsrag/dsparse/tests/unit/test_vlm.py
@@ -3,7 +3,6 @@
 import unittest
 import json
 
-
 # Add the project root to the path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
 

From 1e8ce0b7e5adbd6e7cb73c37ffbb2df7df42c015 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 12:26:18 -0700
Subject: [PATCH 04/11] Task #427: Wire VLM ABC into parsing and KnowledgeBase
 (backward compatible)

Integrate the new VLM clients into dsparse parsing and KnowledgeBase while preserving legacy dict-based configs and behavior.
---
 .../dsparse/file_parsing/vlm_file_parsing.py  | 64 +++++++++++++++++--
 dsrag/dsparse/main.py                         | 12 +++-
 dsrag/dsparse/models/types.py                 |  5 +-
 dsrag/knowledge_base.py                       | 52 +++++++++++++--
 4 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/dsrag/dsparse/file_parsing/vlm_file_parsing.py b/dsrag/dsparse/file_parsing/vlm_file_parsing.py
index 3fe646ea..02dce94c 100644
--- a/dsrag/dsparse/file_parsing/vlm_file_parsing.py
+++ b/dsrag/dsparse/file_parsing/vlm_file_parsing.py
@@ -1,6 +1,8 @@
 from .vlm import make_llm_call_gemini, make_llm_call_vertex
 from ..models.types import ElementType, Element, VLMConfig
 from .file_system import FileSystem
+from .vlm_clients import VLM, GeminiVLM, VertexAIVLM
+from typing import Optional
 from .element_types import (
     get_visual_elements_as_str, 
     get_non_visual_elements_as_str, 
@@ -133,7 +135,7 @@ def save_single_image(args):
     logger.info(f"Converted total {len(all_image_paths)} pages to images", extra=base_extra)
     return all_image_paths
 
-def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: int, vlm_config: VLMConfig, element_types: list[ElementType]) -> tuple[int, list[Element]]:
+def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: int, vlm_config: VLMConfig, element_types: list[ElementType], vlm_client: Optional[VLM] = None, vlm_fallback_client: Optional[VLM] = None) -> tuple[int, list[Element]]:
     """
     Given an image of a page, use LLM to extract the content of the page.
     This function includes retry logic and a fallback model mechanism.
@@ -161,6 +163,31 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in
     fallback_provider = vlm_config.get("fallback_provider")
     fallback_model = vlm_config.get("fallback_model")
 
+    # Determine primary and fallback clients
+    primary_client: Optional[VLM] = vlm_client
+    fallback_client: Optional[VLM] = vlm_fallback_client
+
+    if primary_client is None:
+        if primary_provider == "gemini":
+            primary_client = GeminiVLM(model=primary_model)
+        elif primary_provider == "vertex_ai":
+            project_id = vlm_config.get("project_id")
+            location = vlm_config.get("location")
+            if project_id and location:
+                primary_client = VertexAIVLM(model=primary_model, project_id=project_id, location=location)
+            else:
+                primary_client = None  # fall back to legacy path if insufficient config
+        # else: invalid provider handled later in loop
+
+    if fallback_client is None and fallback_provider and fallback_model:
+        if fallback_provider == "gemini":
+            fallback_client = GeminiVLM(model=fallback_model)
+        elif fallback_provider == "vertex_ai":
+            project_id_fb = vlm_config.get("project_id")
+            location_fb = vlm_config.get("location")
+            if project_id_fb and location_fb:
+                fallback_client = VertexAIVLM(model=fallback_model, project_id=project_id_fb, location=location_fb)
+
     while tries < max_retries:
         # Determine which model to use for this attempt
         current_vlm_config = vlm_config.copy()
@@ -194,7 +221,34 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in
         llm_output = None
         rate_limited = False
 
-        if current_vlm_config["provider"] == "vertex_ai":
+        # Choose instance client if available based on attempt
+        use_fallback = attempt_number > 3 and (fallback_provider and fallback_model)
+        current_client = None
+        if use_fallback and fallback_client is not None:
+            current_client = fallback_client
+            logger.debug("Using instance-based VLM fallback client", extra=base_extra)
+        elif primary_client is not None:
+            current_client = primary_client
+            logger.debug("Using instance-based VLM primary client", extra=base_extra)
+
+        if current_client is not None:
+            try:
+                temperature = current_vlm_config.get("temperature", 0.5)
+                llm_output = current_client.make_llm_call(
+                    image_path=page_image_path,
+                    system_message=system_message,
+                    response_schema=response_schema,
+                    max_tokens=current_vlm_config.get("max_tokens", 4000),
+                    temperature=temperature,
+                )
+            except Exception as e:
+                if "429" in str(e):
+                    logger.warning(f"Rate limit exceeded with instance VLM: {e}", extra=base_extra)
+                    rate_limited = True
+                else:
+                    logger.error(f"Error with instance VLM: {e}", extra=base_extra)
+                    llm_output = ""
+        elif current_vlm_config["provider"] == "vertex_ai":
             try:
                 temperature = current_vlm_config.get("temperature", 0.5)
                 llm_output = make_llm_call_vertex(
@@ -272,7 +326,7 @@ def parse_page(kb_id: str, doc_id: str, file_system: FileSystem, page_number: in
     return page_number, [{"type": "NarrativeText", "content": "Failed to process page after multiple attempts", "page_number": page_number}]
 
 
-def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, file_system: FileSystem) -> list[Element]:
+def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, file_system: FileSystem, vlm_client: Optional[VLM] = None, vlm_fallback_client: Optional[VLM] = None) -> list[Element]:
     """
     Given a PDF file, extract the content of each page using a VLM model.
     
@@ -320,7 +374,9 @@ def parse_file(pdf_path: str, kb_id: str, doc_id: str, vlm_config: VLMConfig, fi
                 file_system, 
                 i + 1, 
                 vlm_config, 
-                element_types
+                element_types,
+                vlm_client,
+                vlm_fallback_client,
             ): i 
             for i in range(len(image_file_paths))
         }
diff --git a/dsrag/dsparse/main.py b/dsrag/dsparse/main.py
index 253da808..f510d1df 100644
--- a/dsrag/dsparse/main.py
+++ b/dsrag/dsparse/main.py
@@ -12,12 +12,14 @@
 from .models.types import FileParsingConfig, VLMConfig, SemanticSectioningConfig, ChunkingConfig, Section, Chunk
 from .file_parsing.file_system import FileSystem, LocalFileSystem
 
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 import json
 
 # Get the dsparse logger
 logger = logging.getLogger("dsrag.dsparse")
 
+from .file_parsing.vlm_clients import VLM
+
 def parse_and_chunk(
     kb_id: str, 
     doc_id: str, 
@@ -27,6 +29,8 @@ def parse_and_chunk(
     file_system: FileSystem = {}, 
     file_path: str = None, 
     text: str = None,
+    vlm_client: Optional[VLM] = None,
+    vlm_fallback_client: Optional[VLM] = None,
 ) -> Tuple[List[Section], List[Chunk]]:
     """
     Inputs
@@ -124,6 +128,8 @@ def parse_and_chunk(
                 vlm_config=vlm_config,
                 semantic_sectioning_config=semantic_sectioning_config,
                 chunking_config=chunking_config,
+                vlm_client=vlm_client,
+                vlm_fallback_client=vlm_fallback_client,
             )
             duration = time.perf_counter() - start_time
             
@@ -205,6 +211,8 @@ def parse_and_chunk(
 def parse_and_chunk_vlm(
     file_path: str, kb_id: str, doc_id: str, file_system: FileSystem, vlm_config: VLMConfig,
     semantic_sectioning_config: SemanticSectioningConfig, chunking_config: ChunkingConfig,
+    vlm_client: Optional[VLM] = None,
+    vlm_fallback_client: Optional[VLM] = None,
     testing_mode: bool = False) -> Tuple[List[Section], List[Chunk]]:
     
     # Create base logging context
@@ -220,6 +228,8 @@ def parse_and_chunk_vlm(
         doc_id=doc_id, 
         vlm_config=vlm_config, 
         file_system=file_system,
+        vlm_client=vlm_client,
+        vlm_fallback_client=vlm_fallback_client,
     )
     parse_duration = time.perf_counter() - parse_start_time
     
diff --git a/dsrag/dsparse/models/types.py b/dsrag/dsparse/models/types.py
index 4efee3be..74d54796 100644
--- a/dsrag/dsparse/models/types.py
+++ b/dsrag/dsparse/models/types.py
@@ -57,4 +57,7 @@ class ChunkingConfig(TypedDict):
 class FileParsingConfig(TypedDict):
     use_vlm: Optional[bool]
     vlm_config: Optional[VLMConfig]
-    always_save_page_images: Optional[bool]
\ No newline at end of file
+    always_save_page_images: Optional[bool]
+    # Optional serialized VLM clients for first-class instance usage
+    vlm: Optional[dict]
+    vlm_fallback: Optional[dict]
\ No newline at end of file
diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py
index 44364110..5ea28924 100644
--- a/dsrag/knowledge_base.py
+++ b/dsrag/knowledge_base.py
@@ -30,6 +30,7 @@
 from dsrag.dsparse.file_parsing.file_system import FileSystem, LocalFileSystem
 from dsrag.metadata import MetadataStorage, LocalMetadataStorage
 from dsrag.chat.citations import convert_elements_to_page_content
+from dsrag.dsparse.file_parsing.vlm_clients import VLM
 
 class KnowledgeBase:
     def __init__(
@@ -48,7 +49,8 @@ def __init__(
         file_system: Optional[FileSystem] = None,
         exists_ok: bool = True,
         save_metadata_to_disk: bool = True,
-        metadata_storage: Optional[MetadataStorage] = None
+        metadata_storage: Optional[MetadataStorage] = None,
+        vlm_client: Optional[VLM] = None,
     ):
         """Initialize a KnowledgeBase instance.
 
@@ -87,7 +89,7 @@ def __init__(
             # load the KB if it exists; otherwise, initialize it and save it to disk
             if self.metadata_storage.kb_exists(self.kb_id) and exists_ok:
                 self._load(
-                    auto_context_model, reranker, file_system, chunk_db, vector_db
+                    auto_context_model, reranker, file_system, chunk_db, vector_db, vlm_client
                 )
                 self._save()
             elif self.metadata_storage.kb_exists(self.kb_id) and not exists_ok:
@@ -104,7 +106,7 @@ def __init__(
                     "created_on": created_time,
                 }
                 self._initialize_components(
-                    embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system
+                    embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system, vlm_client
                 )
                 self._save()  # save the config for the KB to disk
         else:
@@ -115,7 +117,7 @@ def __init__(
                 "supp_id": supp_id,
             }
             self._initialize_components(
-                embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system
+                embedding_model, reranker, auto_context_model, vector_db, chunk_db, file_system, vlm_client
             )
 
     def _get_metadata_path(self):
@@ -134,6 +136,7 @@ def _initialize_components(
         vector_db: Optional[VectorDB],
         chunk_db: Optional[ChunkDB],
         file_system: Optional[FileSystem],
+        vlm_client: Optional[VLM] = None,
     ):
         """Initialize the knowledge base components.
 
@@ -153,6 +156,9 @@ def _initialize_components(
             chunk_db if chunk_db else BasicChunkDB(self.kb_id, self.storage_directory)
         )
         self.file_system = file_system if file_system else LocalFileSystem(base_path=os.path.join(self.storage_directory, "page_images"))
+        # Store optional VLM client for first-class use
+        self.vlm_client = vlm_client
+
         self.vector_dimension = self.embedding_model.dimension
 
     def _save(self):
@@ -168,13 +174,15 @@ def _save(self):
             "vector_db": self.vector_db.to_dict(),
             "chunk_db": self.chunk_db.to_dict(),
             "file_system": self.file_system.to_dict(),
+            # Persist VLM client if present
+            "vlm_client": (self.vlm_client.to_dict() if getattr(self, "vlm_client", None) else None),
         }
         # Combine metadata and components
         full_data = {**self.kb_metadata, "components": components}
 
         self.metadata_storage.save(full_data, self.kb_id)
 
-    def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_db=None, vector_db=None):
+    def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_db=None, vector_db=None, vlm_client: Optional[VLM] = None):
         """Load a knowledge base configuration from disk.
 
         Internal method to deserialize components and metadata.
@@ -237,6 +245,17 @@ def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_
             # If the file system does not exist and is not provided, default to LocalFileSystem
             self.file_system = LocalFileSystem(base_path=self.storage_directory)
 
+        # Load optional VLM client
+        stored_vlm_dict = components.get("vlm_client")
+        if vlm_client is not None:
+            logging.warning(f"Overriding stored vlm_client for KB '{self.kb_id}' during load.", extra=base_extra)
+            self.vlm_client = vlm_client
+        elif stored_vlm_dict:
+            # Use VLM factory to reconstruct
+            self.vlm_client = VLM.from_dict(stored_vlm_dict)
+        else:
+            self.vlm_client = None
+
         self.vector_dimension = self.embedding_model.dimension
 
     def delete(self):
@@ -440,6 +459,23 @@ def add_document(
             
             # --- Parsing and Chunking Step ---
             step_start_time = time.perf_counter()
+            # Resolve VLM clients precedence: config serialized > instance on KB > legacy dict
+            resolved_vlm_client = None
+            resolved_vlm_fallback_client = None
+            try:
+                vlm_serialized = file_parsing_config.get("vlm") if isinstance(file_parsing_config, dict) else None
+                vlm_fallback_serialized = file_parsing_config.get("vlm_fallback") if isinstance(file_parsing_config, dict) else None
+                if vlm_serialized:
+                    resolved_vlm_client = VLM.from_dict(vlm_serialized)
+                elif getattr(self, "vlm_client", None) is not None:
+                    resolved_vlm_client = self.vlm_client
+                if vlm_fallback_serialized:
+                    resolved_vlm_fallback_client = VLM.from_dict(vlm_fallback_serialized)
+            except Exception as e:
+                ingestion_logger.warning("Failed to deserialize VLM clients from file_parsing_config; falling back to legacy config", extra={**base_extra, "error": str(e)})
+                resolved_vlm_client = getattr(self, "vlm_client", None)
+                resolved_vlm_fallback_client = None
+
             sections, chunks = parse_and_chunk(
                 kb_id=self.kb_id,
                 doc_id=doc_id,
@@ -449,6 +485,8 @@ def add_document(
                 semantic_sectioning_config=semantic_sectioning_config, 
                 chunking_config=chunking_config,
                 file_system=self.file_system,
+                vlm_client=resolved_vlm_client,
+                vlm_fallback_client=resolved_vlm_fallback_client,
             )
             step_duration = time.perf_counter() - step_start_time
             ingestion_logger.debug("Parsing and Chunking complete", extra={
@@ -519,8 +557,8 @@ def add_document(
                 "chunk_db": self.chunk_db.__class__.__name__
             })
 
-            # Convert elements to page content if the document was processed with page numbers
-            if file_path and file_parsing_config.get('use_vlm', False):
+            # Convert elements to page content if elements are present (works for both VLM paths)
+            if file_path:
                 try:
                     step_start_time = time.perf_counter()
                     elements = self.file_system.load_data(kb_id=self.kb_id, doc_id=doc_id, data_name="elements")

From f292144f99af6463836e67957fbb6df407cadf80 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 12:44:03 -0700
Subject: [PATCH 05/11] Task #428: Polish VLM integration: typing, error UX,
 tests, and docs

Finalize VLM ABC integration by aligning types, improving error messages, adding missing tests, and documenting usage.
---
 README.md                                     |  51 +++++++++
 dsrag/dsparse/file_parsing/vlm_clients.py     |   7 +-
 dsrag/dsparse/models/types.py                 |   3 +
 .../test_add_document_with_serialized_vlm.py  | 102 ++++++++++++++++++
 .../tests/unit/test_vlm_config_alignment.py   |  51 +++++++++
 .../tests/unit/test_vlm_metadata_roundtrip.py |  51 +++++++++
 6 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py
 create mode 100644 dsrag/dsparse/tests/unit/test_vlm_config_alignment.py
 create mode 100644 dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py

diff --git a/README.md b/README.md
index cbf8a65c..3526fc62 100644
--- a/README.md
+++ b/README.md
@@ -258,6 +258,57 @@ file_parsing_config
     - save_path: the path to save intermediate files created during VLM processing
     - exclude_elements: a list of element types to exclude from the parsed text. Default is ["Header", "Footer"].
 
+VLM class-based clients and fallback
+- You can pass a first-class VLM client instance to KnowledgeBase for default usage:
+
+```python
+from dsrag.knowledge_base import KnowledgeBase
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+
+kb = KnowledgeBase(
+    kb_id="my_kb",
+    vlm_client=GeminiVLM(model="gemini-2.0-flash"),  # used by default for VLM parsing
+)
+```
+
+- You can also override the VLM on a per-document basis by passing a serialized client via file_parsing_config["vlm"]. This is useful when you want different models per document:
+
+```python
+vlm_override = GeminiVLM(model="gemini-2.0-flash").to_dict()
+kb.add_document(
+    doc_id="doc1",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": vlm_override,  # per-document VLM client
+        "vlm_config": {"images_already_exist": False},
+    },
+    auto_context_config={
+        "use_generated_title": False,
+        "get_document_summary": False,
+        "get_section_summaries": False,
+    },
+)
+```
+
+- Fallback configuration: you can provide a serialized fallback client via file_parsing_config["vlm_fallback"]. The system will alternate between primary and fallback after the first few retries when needed. Legacy fallback using vlm_config["fallback_provider"/"fallback_model"] is also supported.
+
+```python
+primary = GeminiVLM(model="gemini-2.0-flash").to_dict()
+fallback = GeminiVLM(model="gemini-2.5-flash").to_dict()
+
+kb.add_document(
+    doc_id="doc2",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": primary,
+        "vlm_fallback": fallback,
+        "vlm_config": {"max_pages": 10},
+    },
+)
+```
+
 semantic_sectioning_config
 - llm_provider: the LLM provider to use for semantic sectioning - "openai", "anthropic", and "gemini" are supported
 - model: the LLM model to use for semantic sectioning (e.g., "gpt-4.1-mini", "claude-3-5-haiku-latest", "gemini-2.0-flash")
diff --git a/dsrag/dsparse/file_parsing/vlm_clients.py b/dsrag/dsparse/file_parsing/vlm_clients.py
index e628838a..c41dc9a7 100644
--- a/dsrag/dsparse/file_parsing/vlm_clients.py
+++ b/dsrag/dsparse/file_parsing/vlm_clients.py
@@ -103,8 +103,11 @@ def make_llm_call(
         import PIL.Image  # used only when this method is executed
         import io
 
-        # Create client using lazy loader
-        client = genai_new.Client(api_key=os.environ["GEMINI_API_KEY"])  # type: ignore[attr-defined]
+        # Create client using lazy loader with explicit API key check
+        api_key = os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            raise ValueError("GEMINI_API_KEY not set; required for GeminiVLM")
+        client = genai_new.Client(api_key=api_key)  # type: ignore[attr-defined]
 
         # Base generation config
         config = genai_new.types.GenerateContentConfig(  # type: ignore[attr-defined]
diff --git a/dsrag/dsparse/models/types.py b/dsrag/dsparse/models/types.py
index 74d54796..0a4631b4 100644
--- a/dsrag/dsparse/models/types.py
+++ b/dsrag/dsparse/models/types.py
@@ -42,6 +42,9 @@ class VLMConfig(TypedDict):
     exclude_elements: Optional[list[str]]
     element_types: Optional[list[ElementType]]
     max_workers: Optional[int]
+    # Additional optional configuration used by VLM parsing
+    dpi: Optional[int]
+    vlm_max_concurrent_requests: Optional[int]
 
 
 class SemanticSectioningConfig(TypedDict):
diff --git a/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py b/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py
new file mode 100644
index 00000000..14c3ec68
--- /dev/null
+++ b/dsrag/dsparse/tests/integration/test_add_document_with_serialized_vlm.py
@@ -0,0 +1,102 @@
+import os
+import sys
+import shutil
+import tempfile
+import unittest
+from glob import glob
+
+# Ensure project root is on sys.path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+from dsrag.knowledge_base import KnowledgeBase
+from dsrag.dsparse.file_parsing.file_system import LocalFileSystem
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+from dsrag.reranker import NoReranker
+from dsrag.embedding import Embedding
+import numpy as np
+
+
+class DummyEmbedding(Embedding):
+    """Minimal embedding implementation for tests (no network)."""
+    def __init__(self, dimension: int = 8) -> None:
+        self.dimension = dimension
+
+    def get_embeddings(self, text: list[str], input_type: str = ""):
+        # Return deterministic zero vectors of the configured dimension
+        return [np.zeros(self.dimension, dtype=np.float32) for _ in text]
+
+    def to_dict(self):
+        # Provide a minimal serializable dict (not used for from_dict in this test)
+        return {"subclass_name": "DummyEmbedding", "dimension": self.dimension}
+
+
+class TestAddDocumentWithSerializedVLM(unittest.TestCase):
+    @unittest.skipIf('GEMINI_API_KEY' not in os.environ, "GEMINI_API_KEY not found in environment")
+    def test_add_document_with_serialized_vlm_and_images_already_exist(self):
+        temp_dir = tempfile.mkdtemp(prefix="dsrag_kb_add_doc_")
+        try:
+            kb_id = "kb_serialized_vlm"
+            doc_id = "doc_serialized_vlm"
+
+            # Set up file system and pre-create images under kb/doc
+            fs = LocalFileSystem(base_path=os.path.join(temp_dir, "page_images"))
+            fs.create_directory(kb_id, doc_id)
+
+            # Copy an existing test image into the kb/doc as page_1.jpg
+            src_img = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../tests/data/page_7.jpg"))
+            dst_img = os.path.join(temp_dir, "page_images", kb_id, doc_id, "page_1.jpg")
+            shutil.copyfile(src_img, dst_img)
+
+            # Build KB with dummy embedding and no reranker network
+            kb = KnowledgeBase(
+                kb_id=kb_id,
+                storage_directory=temp_dir,
+                embedding_model=DummyEmbedding(dimension=8),
+                reranker=NoReranker(),
+                file_system=fs,
+            )
+
+            # Serialized VLM client for per-document override
+            serialized_vlm = GeminiVLM(model="gemini-2.0-flash").to_dict()
+
+            # Disable AutoContext LLM calls by not generating titles/summaries
+            auto_context_config = {
+                "use_generated_title": False,
+                "get_document_summary": False,
+                "get_section_summaries": False,
+            }
+
+            # Perform add_document using existing images and serialized VLM
+            kb.add_document(
+                doc_id=doc_id,
+                file_path="dummy.pdf",  # not used when images_already_exist=True
+                document_title="Test Doc",
+                auto_context_config=auto_context_config,
+                file_parsing_config={
+                    "use_vlm": True,
+                    "vlm_config": {"images_already_exist": True},
+                    "vlm": serialized_vlm,
+                },
+                semantic_sectioning_config={
+                    "llm_provider": "openai",  # value not used when auto_context disabled
+                    "model": "gpt-4o-mini",
+                },
+                chunking_config={},
+            )
+
+            # Assert elements.json exists
+            elements_path = os.path.join(temp_dir, "page_images", kb_id, doc_id, "elements.json")
+            self.assertTrue(os.path.exists(elements_path))
+
+            # Assert page content jsons were created (convert_elements_to_page_content)
+            page_content_files = glob(os.path.join(temp_dir, "page_images", kb_id, doc_id, "page_content_*.json"))
+            self.assertTrue(len(page_content_files) >= 1)
+        finally:
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception:
+                pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py b/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py
new file mode 100644
index 00000000..3f695930
--- /dev/null
+++ b/dsrag/dsparse/tests/unit/test_vlm_config_alignment.py
@@ -0,0 +1,51 @@
+import os
+import sys
+import shutil
+import tempfile
+import unittest
+
+# Ensure project root is on sys.path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+from dsrag.dsparse.file_parsing.vlm_file_parsing import parse_file
+from dsrag.dsparse.file_parsing.file_system import LocalFileSystem
+
+
+class TestVLMConfigTypedDictAlignment(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.mkdtemp(prefix="dsparse_vlm_cfg_")
+        self.kb_id = "kb_vlm_cfg"
+        self.doc_id = "doc_vlm_cfg"
+        self.fs = LocalFileSystem(base_path=self.temp_dir)
+        # Ensure directory exists but contains no images
+        self.fs.create_directory(self.kb_id, self.doc_id)
+
+    def tearDown(self):
+        try:
+            shutil.rmtree(self.temp_dir)
+        except Exception:
+            pass
+
+    def test_vlm_config_typedict_alignment(self):
+        # Provide dpi and vlm_max_concurrent_requests; zero images and images_already_exist=True
+        vlm_config = {
+            "images_already_exist": True,
+            "dpi": 120,
+            "vlm_max_concurrent_requests": 2,
+        }
+        # pdf_path can be empty when images_already_exist is True
+        elements = parse_file(
+            pdf_path="",
+            kb_id=self.kb_id,
+            doc_id=self.doc_id,
+            vlm_config=vlm_config,  # type: ignore[arg-type]
+            file_system=self.fs,
+        )
+        # Should return an empty list and write elements.json
+        self.assertIsInstance(elements, list)
+        self.assertEqual(len(elements), 0)
+        self.assertTrue(os.path.exists(os.path.join(self.temp_dir, self.kb_id, self.doc_id, "elements.json")))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py b/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py
new file mode 100644
index 00000000..644c9cd9
--- /dev/null
+++ b/dsrag/dsparse/tests/unit/test_vlm_metadata_roundtrip.py
@@ -0,0 +1,51 @@
+import os
+import sys
+import shutil
+import tempfile
+import unittest
+
+# Ensure project root is on sys.path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+from dsrag.knowledge_base import KnowledgeBase
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+from dsrag.reranker import NoReranker
+
+
+class TestVLMMetaRoundTrip(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.mkdtemp(prefix="dsrag_kb_test_")
+        self.kb_id = "kb_vlm_meta_rt"
+
+    def tearDown(self):
+        try:
+            shutil.rmtree(self.temp_dir)
+        except Exception:
+            pass
+
+    def test_kb_persists_vlm_client_in_metadata(self):
+        # Create KB with a GeminiVLM client set
+        kb = KnowledgeBase(
+            kb_id=self.kb_id,
+            storage_directory=self.temp_dir,
+            vlm_client=GeminiVLM(model="gemini-2.0-flash"),
+            reranker=NoReranker(),
+        )
+        # Force a save and then reload
+        kb._save()
+
+        # Re-initialize KB - should load from metadata
+        kb2 = KnowledgeBase(
+            kb_id=self.kb_id,
+            storage_directory=self.temp_dir,
+            exists_ok=True,
+        )
+        self.assertIsNotNone(kb2.vlm_client)
+        # Ensure correct type + model preserved
+        from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM as _GeminiVLM
+        self.assertIsInstance(kb2.vlm_client, _GeminiVLM)
+        self.assertEqual(kb2.vlm_client.model, "gemini-2.0-flash")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 847308148fa659d1020b72fcc1bf950abaebec6d Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:11:03 -0700
Subject: [PATCH 06/11] README update

---
 dsrag/dsparse/README.md | 2 +-
 dsrag/knowledge_base.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md
index 89bcfb34..75623ae2 100644
--- a/dsrag/dsparse/README.md
+++ b/dsrag/dsparse/README.md
@@ -97,4 +97,4 @@ Semantic sectioning cost calculation (`gpt-4o-mini`)
 - Output: 50 tokens x $0.60/10^6 per token = $0.00003
 - Total: $0.00015/page or **$0.15 per 1000 pages**
 
-Document text is processed in ~5000 token mega-chunks, which is roughly ten pages on average. But these mega-chunks have to be processed sequentially for each document. Processing each mega-chunk only takes a couple seconds, though, so even a large document of a few hundred pages will only take 20-60 seconds. Rate limits for the OpenAI API are heavily dependent on the usage tier you're in.
+Document text is processed in ~5000 token mega-chunks, which is roughly ten pages on average. These mega-chunks are processed in parallel for each document. Processing each mega-chunk only takes a few seconds, though, so even a large document of a few hundred pages should only take 5-10 seconds. Rate limits for the OpenAI API are heavily dependent on the usage tier you're in.
diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py
index 5ea28924..c2f65959 100644
--- a/dsrag/knowledge_base.py
+++ b/dsrag/knowledge_base.py
@@ -459,6 +459,7 @@ def add_document(
             
             # --- Parsing and Chunking Step ---
             step_start_time = time.perf_counter()
+            
             # Resolve VLM clients precedence: config serialized > instance on KB > legacy dict
             resolved_vlm_client = None
             resolved_vlm_fallback_client = None

From d8b82c9a8c764cc8a6391098a54bf1837fa329f7 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:17:05 -0700
Subject: [PATCH 07/11] update VLM pricing calculation in README

---
 dsrag/dsparse/README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md
index 75623ae2..6f062cd5 100644
--- a/dsrag/dsparse/README.md
+++ b/dsrag/dsparse/README.md
@@ -81,13 +81,14 @@ The default model for semantic sectioning is `gpt-4o-mini`, but similar or stron
 An obvious concern with using a VLM to parse documents is the cost. Let's run the numbers:
 
 VLM file parsing cost calculation (`gemini-2.0-flash`)
-- Text input (prompt) + image input: 400 (text) + 258 (image) tokens x $0.10/10^6 per token = $0.000066
-- Text output: 600 tokens x $0.40/10^6 per token = $0.000240
-- Total: $0.000306/page or **$0.31 per 1000 pages**
+- Input tokens for images are calculated based on the number of 768x768 tiles needed. At the standard dpi of 100 (or even up to around 150), this usually means 4 tiles. Each tile is counted as 258 tokens.
+- Text input (prompt) + image input: 500 (text) + 4x258 (image) tokens x $0.10/10^6 per token = $0.0001532
+- Text output: 700 tokens x $0.40/10^6 per token = $0.0002800
+- Total: $0.0004332/page or **$0.43 per 1000 pages**
 
-This is substantially cheaper than commercially available OCR/PDF parsing services. Unstructured and Azure Document Intelligence, for example, both cost $10 per 1000 pages. 
+This is substantially cheaper than commercially available OCR/PDF parsing services. Unstructured and Azure Document Intelligence, for example, both cost $10 per 1000 pages. Reducto is generally $10-20 per 1000 pages.
 
-What about latency and throughput? Since each page is processed independently, this is a highly parallelizable problem. The main limiting factor then is the rate limits imposed by the VLM provider. The current rate limit for `gemini-2.0-flash` is 2000 requests per minute. Since dsParse uses one request per page, that means the limit is 2000 pages per minute. Processing a single page takes around 15-20 seconds, so that's the minimum latency for processing a document.
+What about latency and throughput? Since each page is processed independently, this is a highly parallelizable problem. The main limiting factor then is the rate limits imposed by the VLM provider. The current rate limit for `gemini-2.0-flash` on the highest tier is 30k requests per minute. Since dsParse uses one request per page, that means the limit is 30k pages per minute. Processing a single page takes around 15-20 seconds, so that's the minimum latency for processing a document.
 
 ### Semantic sectioning
 Semantic sectioning produces far fewer output tokens, so it ends up being a bit cheaper than the file parsing step.

From da65ec3665b2ee06dbe28feb088d9e88f4c9254b Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:24:59 -0700
Subject: [PATCH 08/11] Task #429: Add integration test for serialized VLM
 client in VLM file parsing

Add a real-API, env-gated test to validate the new VLM client (serialized) path in parse_and_chunk.
---
 .../integration/test_vlm_file_parsing.py      | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py b/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py
index 81a060b9..eedffb21 100644
--- a/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py
+++ b/dsrag/dsparse/tests/integration/test_vlm_file_parsing.py
@@ -111,6 +111,46 @@ def test_non_vlm_file_parsing(self):
         self.assertTrue(len(sections[0]["title"]) > 0)
         self.assertTrue(len(sections[0]["content"]) > 0)
 
+    def test__parse_and_chunk_vlm_with_serialized_client(self):
+        import os as _os
+        if 'GEMINI_API_KEY' not in _os.environ:
+            self.skipTest("GEMINI_API_KEY not found in environment")
+
+        from dsparse.file_parsing.vlm_clients import GeminiVLM
+
+        file_parsing_config = {
+            "use_vlm": True,
+            "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(),
+            "always_save_page_images": True,
+        }
+        sections, chunks = parse_and_chunk(
+            kb_id=self.kb_id,
+            # use a distinct doc_id to avoid collisions with other tests
+            doc_id=f"{self.doc_id}_vlm_client",
+            file_path=self.test_data_path,
+            file_parsing_config=file_parsing_config,
+            semantic_sectioning_config={
+                "llm_provider": "openai",
+                "model": "gpt-4o-mini",
+                "language": "en",
+            },
+            chunking_config={},
+            file_system=self.file_system,
+        )
+
+        # Assertions mirror existing tests
+        self.assertTrue(len(sections) > 0)
+        self.assertTrue(len(chunks) > 0)
+        for key, expected_type in Section.__annotations__.items():
+            self.assertIsInstance(sections[0][key], expected_type)
+        for key, expected_type in Chunk.__annotations__.items():
+            self.assertIsInstance(chunks[0][key], expected_type)
+
+        # elements.json should be emitted for the document
+        self.assertTrue(os.path.exists(
+            os.path.join(self.save_path, self.kb_id, f"{self.doc_id}_vlm_client", "elements.json")
+        ))
+
 
     @classmethod
     def tearDownClass(self):

From e7feafc21f7ee047bbaf9cd95814bf3d4d3a5e94 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:35:36 -0700
Subject: [PATCH 09/11] Task #430: Add top-level integration test for
 serialized VLM client

Add a real-API, env-gated test in the top-level integration suite to validate the serialized VLM client path.
---
 tests/integration/test_vlm_file_parsing.py | 63 ++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/tests/integration/test_vlm_file_parsing.py b/tests/integration/test_vlm_file_parsing.py
index fd98b3b7..568fe248 100644
--- a/tests/integration/test_vlm_file_parsing.py
+++ b/tests/integration/test_vlm_file_parsing.py
@@ -5,6 +5,8 @@
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 from dsrag.dsparse.file_parsing.file_system import LocalFileSystem
 from dsrag.knowledge_base import KnowledgeBase
+from dsrag.dsparse.main import parse_and_chunk
+from dsrag.dsparse.models.types import Section, Chunk
 
 
 class TestRetrieval(unittest.TestCase):
@@ -271,5 +273,66 @@ def cleanup(self):
         os.system("rm -rf ~/dsrag_test_mck_energy")
 
         
+class TestVLMFileParsing(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Shared fixtures for VLM parse_and_chunk test
+        cls.kb_id = "mck_energy_test"
+        cls.doc_id = "mck_energy_report"
+
+        # Save path and file system
+        cls.save_path = os.path.expanduser("~/dsrag_test_mck_energy")
+        cls.file_system = LocalFileSystem(base_path=cls.save_path)
+
+        # Test data path (PDF)
+        cls.test_data_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), "../data/mck_energy_first_5_pages.pdf")
+        )
+
+    def test__parse_and_chunk_vlm_with_serialized_client(self):
+        import os as _os
+        if 'GEMINI_API_KEY' not in _os.environ:
+            self.skipTest("GEMINI_API_KEY not found in environment")
+
+        # Use serialized VLM client config
+        from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM, VLM
+
+        client_dict = GeminiVLM(model="gemini-2.0-flash").to_dict()
+
+        file_parsing_config = {
+            "use_vlm": True,
+            "vlm": client_dict,
+            "always_save_page_images": True,
+        }
+
+        sections, chunks = parse_and_chunk(
+            kb_id=self.kb_id,
+            # Use a distinct doc_id to avoid collisions with other tests
+            doc_id=f"{self.doc_id}_vlm_client",
+            file_path=self.test_data_path,
+            file_parsing_config=file_parsing_config,
+            semantic_sectioning_config={
+                "use_semantic_sectioning": False,
+            },
+            chunking_config={},
+            file_system=self.file_system,
+            vlm_client=VLM.from_dict(client_dict),
+        )
+
+        # Assertions mirror existing patterns
+        self.assertTrue(len(sections) > 0)
+        self.assertTrue(len(chunks) > 0)
+        # Validate structure of first section and chunk
+        for key, expected_type in Section.__annotations__.items():
+            self.assertIsInstance(sections[0][key], expected_type)
+        for key, expected_type in Chunk.__annotations__.items():
+            self.assertIsInstance(chunks[0][key], expected_type)
+
+        # elements.json should be emitted for the document
+        self.assertTrue(os.path.exists(
+            os.path.join(self.save_path, self.kb_id, f"{self.doc_id}_vlm_client", "elements.json")
+        ))
+
+
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file

From 4446c44cc045dab163be413d45f3016d79780ec4 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:43:19 -0700
Subject: [PATCH 10/11] Task #431: Update READMEs for VLM refactor (class-based
 VLM, fallback, and back-compat)

Revise top-level README.md and dsrag/dsparse/README.md to document the new VLM ABC usage, fallback configuration, and backward compatibility with legacy dict configs.
---
 README.md               | 61 +++++++++++++++++++++++++++++
 dsrag/dsparse/README.md | 85 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+)

diff --git a/README.md b/README.md
index 3526fc62..15e2b927 100644
--- a/README.md
+++ b/README.md
@@ -235,6 +235,67 @@ For the `S3FileSystem`, the following parameters are needed:
 
 The `base_path` is used when downloading files from S3. The files have to be stored locally in order to be used in the retrieval system. 
 
+#### VLM Clients (new)
+Visual Language Models (VLMs) now follow the same class-based abstraction pattern (ABC) as LLM, Embedding, and Reranker components. You can supply a first-class VLM client instance to the KnowledgeBase, or override per document by passing a serialized client. Backward compatibility is maintained: legacy provider/model dictionaries in `file_parsing_config['vlm_config']` still work.
+
+- Class-based usage (KB default):
+```python
+from dsrag.knowledge_base import KnowledgeBase
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+
+kb = KnowledgeBase(
+    kb_id="my_kb",
+    vlm_client=GeminiVLM(model="gemini-2.0-flash"),  # default VLM used for VLM parsing
+)
+```
+
+- Per-document override (serialized client):
+```python
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+
+kb.add_document(
+    doc_id="doc1",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(),  # per-document override
+        "vlm_config": {"max_pages": 10},
+    },
+    auto_context_config={
+        "use_generated_title": False,
+        "get_document_summary": False,
+        "get_section_summaries": False,
+    },
+)
+```
+
+- Fallback configuration (preferred, class-based):
+```python
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+
+primary = GeminiVLM(model="gemini-2.0-flash").to_dict()
+fallback = GeminiVLM(model="gemini-2.5-flash").to_dict()
+
+kb.add_document(
+    doc_id="doc2",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": primary,
+        "vlm_fallback": fallback,
+        "vlm_config": {"max_pages": 10},
+    },
+)
+```
+Legacy dict-based fallback remains supported via `vlm_config["fallback_provider"/"fallback_model"]`.
+
+- Backward compatibility and precedence
+  - The legacy dict path (e.g., `provider`, `model`, `images_already_exist`, etc.) continues to work.
+  - When both a serialized client (`vlm`) and a legacy `provider`/`model` are supplied, the system prefers `vlm`/`vlm_fallback`.
+
+- Environment variables
+  - `GEMINI_API_KEY` is required for `GeminiVLM`; a clear error is raised if missing.
+
 ## Config dictionaries
 Since there are a lot of configuration parameters available, they're organized into a few config dictionaries. There are four config dictionaries that can be passed in to `add_document` (`auto_context_config`, `file_parsing_config`, `semantic_sectioning_config`, and `chunking_config`) and one that can be passed in to `query` (`rse_params`).
 
diff --git a/dsrag/dsparse/README.md b/dsrag/dsparse/README.md
index 6f062cd5..f013ad0b 100644
--- a/dsrag/dsparse/README.md
+++ b/dsrag/dsparse/README.md
@@ -34,6 +34,91 @@ kb.add_document(
 )
 ```
 
+## VLM clients
+VLMs now support a class-based client abstraction (similar to LLM/Embedding/Reranker) that you can pass either at the KB level or per document. Legacy dict-based `vlm_config` remains fully supported.
+
+- Quickstart with class-based client (serialized) and LocalFileSystem
+```python
+from dsrag.dsparse.main import parse_and_chunk
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+from dsrag.dsparse.file_parsing.file_system import LocalFileSystem
+
+sections, chunks = parse_and_chunk(
+    kb_id="sample_kb",
+    doc_id="sample_doc",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(),
+        "vlm_config": {"max_pages": 5, "vlm_max_concurrent_requests": 2},
+    },
+    file_system=LocalFileSystem(base_path="~/dsParse"),
+)
+```
+
+- Fallback (preferred, class-based):
+```python
+from dsrag.dsparse.main import parse_and_chunk
+from dsrag.dsparse.file_parsing.vlm_clients import GeminiVLM
+
+primary = GeminiVLM(model="gemini-2.0-flash").to_dict()
+fallback = GeminiVLM(model="gemini-2.5-flash").to_dict()
+
+sections, chunks = parse_and_chunk(
+    kb_id="kb",
+    doc_id="doc",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": primary,
+        "vlm_fallback": fallback,
+        "vlm_config": {"max_pages": 5},
+    },
+)
+```
+
+- Legacy path (still valid):
+```python
+from dsrag.dsparse.main import parse_and_chunk
+
+sections, chunks = parse_and_chunk(
+    kb_id="kb",
+    doc_id="doc",
+    file_path="/path/to/file.pdf",
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm_config": {
+            "provider": "gemini",
+            "model": "gemini-2.0-flash",
+            "max_pages": 5,
+            # Optional legacy fallback
+            "fallback_provider": "gemini",
+            "fallback_model": "gemini-2.5-flash",
+        },
+    },
+)
+```
+
+- Images already exist
+If you’ve pre-extracted page images into the configured FileSystem directory structure, you can reuse them:
+```python
+sections, chunks = parse_and_chunk(
+    kb_id="kb",
+    doc_id="doc",
+    file_path="/path/to/file.pdf",  # path still required for metadata, but images won’t be regenerated
+    file_parsing_config={
+        "use_vlm": True,
+        "vlm": GeminiVLM(model="gemini-2.0-flash").to_dict(),
+        "vlm_config": {"images_already_exist": True},
+    },
+)
+```
+
+- Notes
+  - Parallelism controls and DPI are in `vlm_config` (e.g., `vlm_max_concurrent_requests`, `dpi`).
+  - Page images and `elements.json` are saved via the configured `FileSystem`.
+  - Environment variable `GEMINI_API_KEY` is required for `GeminiVLM`. Clear errors are raised if missing.
+
 ## Installation
 If you want to use dsParse on its own, without installing the full `dsrag` package, there is a standalone Python package available for dsParse, which can be installed with `pip install dsparse`. If you already have `dsrag` installed, you DO NOT need to separately install `dsparse`.
 

From a1f28905d0db7bd628d3f4b1806276495619dde9 Mon Sep 17 00:00:00 2001
From: Zach <zacharymccormick755@gmail.com>
Date: Wed, 5 Nov 2025 13:49:32 -0700
Subject: [PATCH 11/11] KnowledgeBase docstring updates

---
 dsrag/knowledge_base.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/dsrag/knowledge_base.py b/dsrag/knowledge_base.py
index c2f65959..7ad78136 100644
--- a/dsrag/knowledge_base.py
+++ b/dsrag/knowledge_base.py
@@ -77,6 +77,8 @@ def __init__(
             save_metadata_to_disk (bool, optional): Whether to persist metadata. Defaults to True.
             metadata_storage (Optional[MetadataStorage], optional): Storage for KB metadata. 
                 Defaults to LocalMetadataStorage.
+            vlm_client (Optional[VLM], optional): VLM client for parsing documents. 
+                Defaults to None.
 
         Raises:
             ValueError: If KB exists and exists_ok is False.
@@ -193,7 +195,7 @@ def _load(self, auto_context_model=None, reranker=None, file_system=None, chunk_
             file_system (Optional[FileSystem], optional): Override stored file system.
             chunk_db (Optional[ChunkDB], optional): Override stored chunk database.
             vector_db (Optional[VectorDB], optional): Override stored vector database.
-
+            vlm_client (Optional[VLM], optional): Override stored VLM client.
         Note:
             Only auto_context_model and reranker can safely override stored components.
             Other component overrides may break functionality if not compatible.
@@ -344,20 +346,23 @@ def add_document(
                 {
                     # Whether to use VLM for parsing
                     "use_vlm": False,
+
+                    # VLM client for parsing documents
+                    "vlm": serialized VLM client,
                     
                     # VLM configuration (ignored if use_vlm is False)
                     "vlm_config": {
                         # VLM provider (currently only "gemini" and "vertex_ai" supported)
-                        "provider": "vertex_ai",
+                        "provider": "vertex_ai", - ignored if vlm is provided
                         
                         # The VLM model to use
-                        "model": "model_name",
+                        "model": "model_name", - ignored if vlm is provided
                         
                         # GCP project ID (required for "vertex_ai")
-                        "project_id": "your-project-id",
+                        "project_id": "your-project-id", - ignored if vlm is provided
                         
                         # GCP location (required for "vertex_ai")
-                        "location": "us-central1",
+                        "location": "us-central1", - ignored if vlm is provided
                         
                         # Path to save intermediate files
                         "save_path": "/path/to/save",