|
| 1 | +# |
| 2 | +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) |
| 3 | +# (C) Cloudera, Inc. 2025 |
| 4 | +# All rights reserved. |
| 5 | +# |
| 6 | +# Applicable Open Source License: Apache 2.0 |
| 7 | +# |
| 8 | +# NOTE: Cloudera open source products are modular software products |
| 9 | +# made up of hundreds of individual components, each of which was |
| 10 | +# individually copyrighted. Each Cloudera open source product is a |
| 11 | +# collective work under U.S. Copyright Law. Your license to use the |
| 12 | +# collective work is as provided in your written agreement with |
| 13 | +# Cloudera. Used apart from the collective work, this file is |
| 14 | +# licensed for your use pursuant to the open source license |
| 15 | +# identified above. |
| 16 | +# |
# This code is provided to you pursuant to a written agreement with
| 18 | +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute |
| 19 | +# this code. If you do not have a written agreement with Cloudera nor |
| 20 | +# with an authorized and properly licensed third party, you do not |
| 21 | +# have any rights to access nor to use this code. |
| 22 | +# |
| 23 | +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the |
# contrary, (A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
| 25 | +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED |
| 26 | +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO |
| 27 | +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND |
| 28 | +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, |
| 29 | +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS |
# ARISING FROM OR RELATED TO THE CODE; AND (D) WITH RESPECT TO YOUR EXERCISE
| 31 | +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY |
| 32 | +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR |
| 33 | +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES |
| 34 | +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF |
| 35 | +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF |
| 36 | +# DATA. |
| 37 | +# |
| 38 | + |
| 39 | +import logging |
| 40 | +from pathlib import Path |
| 41 | +from typing import List, Any |
| 42 | + |
| 43 | +from docling.datamodel.document import ConversionResult |
| 44 | +from docling.document_converter import DocumentConverter |
| 45 | +from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker |
| 46 | +from docling_core.transforms.chunker.base import BaseChunk |
| 47 | +from docling_core.transforms.serializer.base import SerializationResult |
| 48 | +from docling_core.transforms.serializer.markdown import MarkdownDocSerializer |
| 49 | +from llama_index.core.schema import Document, TextNode, NodeRelationship |
| 50 | + |
| 51 | +from .base_reader import BaseReader |
| 52 | +from .base_reader import ChunksResult |
| 53 | +from .pdf import MarkdownSerializerProvider |
| 54 | + |
| 55 | +logger = logging.getLogger(__name__) |
| 56 | + |
class DoclingReader(BaseReader):
    """Reader that converts a file with Docling and emits LlamaIndex chunks.

    The document is converted once, split with a ``HierarchicalChunker``,
    and each chunk is serialized to markdown as a ``TextNode`` carrying
    file/document/data-source metadata plus its chunk index and (when
    known) page number.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

    def load_chunks(self, file_path: Path) -> ChunksResult:
        """Convert ``file_path`` with Docling and return its chunks.

        Args:
            file_path: Path of the document to convert.

        Returns:
            ``ChunksResult`` wrapping the list of ``TextNode`` chunks, each
            linked back to the parent ``Document`` via a SOURCE relationship.
        """
        document = Document()
        document.id_ = self.document_id
        self._add_document_metadata(document, file_path)
        parent = document.as_related_node_info()

        converted_chunks: List[TextNode] = []
        # Lazy %-style args so the repr is only built when DEBUG is enabled.
        logger.debug("file_path=%r", file_path)
        conversion: ConversionResult = DocumentConverter().convert(file_path)
        chunker = HierarchicalChunker(
            serializer_provider=MarkdownSerializerProvider()
        )
        serializer = MarkdownDocSerializer(doc=conversion.document)
        chunk: BaseChunk
        for i, chunk in enumerate(chunker.chunk(conversion.document)):
            # Chunks without doc_items carry no serializable content.
            if not hasattr(chunk.meta, "doc_items"):
                logger.warning("Chunk %s is empty, skipping", i)
                continue
            # Accumulate per-item markdown in a list and join once, instead
            # of quadratic ``+=`` string concatenation.
            parts: List[str] = []
            # ``None`` doubles as "page unknown"; the original initialized an
            # ``int`` to 0 and then assigned None, which the annotation belied.
            page_number = None
            for item in chunk.meta.doc_items:
                # NOTE(review): the *last* doc item with provenance decides
                # the page, matching the original behavior — confirm whether
                # the chunk's first page was actually intended.
                page_number = item.prov[0].page_no if item.prov else None
                item_ser: SerializationResult = serializer.serialize(item=item)
                parts.append(item_ser.text)
            node = TextNode(text="".join(parts))
            if page_number:
                node.metadata["page_number"] = page_number
            node.metadata["file_name"] = document.metadata["file_name"]
            node.metadata["document_id"] = document.metadata["document_id"]
            node.metadata["data_source_id"] = document.metadata["data_source_id"]
            node.metadata["chunk_number"] = i
            node.metadata["chunk_format"] = "markdown"
            node.relationships.update(
                {NodeRelationship.SOURCE: parent}
            )
            converted_chunks.append(node)
        return ChunksResult(converted_chunks)
0 commit comments