Skip to content

Commit d04759d

Browse files
authored
Get docling fixes onto main (#258)
* use the hybrid chunker for docling to improve chunk sizes
* update docling to fix concurrency issue
* fix imports
1 parent b2162f8 commit d04759d

File tree

3 files changed

+23
-27
lines changed

3 files changed

+23
-27
lines changed

llm-service/app/ai/indexing/readers/docling_reader.py

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -42,10 +42,8 @@
4242

4343
from docling.datamodel.document import ConversionResult
4444
from docling.document_converter import DocumentConverter
45-
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
4645
from docling_core.transforms.chunker.base import BaseChunk
47-
from docling_core.transforms.serializer.base import SerializationResult
48-
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
46+
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
4947
from llama_index.core.schema import Document, TextNode, NodeRelationship
5048

5149
from .base_reader import BaseReader
@@ -67,20 +65,16 @@ def load_chunks(self, file_path: Path) -> ChunksResult:
6765
converted_chunks: List[TextNode] = []
6866
logger.debug(f"{file_path=}")
6967
docling_doc: ConversionResult = DocumentConverter().convert(file_path)
70-
chunky_chunks = HierarchicalChunker(serializer_provider=MarkdownSerializerProvider()).chunk(docling_doc.document)
68+
chunky_chunks = HybridChunker(serializer_provider=MarkdownSerializerProvider()).chunk(docling_doc.document)
7169
chunky_chunk: BaseChunk
72-
serializer = MarkdownDocSerializer(doc=docling_doc.document)
7370
for i, chunky_chunk in enumerate(chunky_chunks):
74-
text = ""
7571
page_number: int = 0
7672
if not hasattr(chunky_chunk.meta, "doc_items"):
7773
logger.warning(f"Chunk {i} is empty, skipping")
7874
continue
7975
for item in chunky_chunk.meta.doc_items:
8076
page_number= item.prov[0].page_no if item.prov else None
81-
item_ser: SerializationResult = serializer.serialize(item=item)
82-
text += item_ser.text
83-
node = TextNode(text=text)
77+
node = TextNode(text=chunky_chunk.text)
8478
if page_number:
8579
node.metadata["page_number"] = page_number
8680
node.metadata["file_name"] = document.metadata["file_name"]

llm-service/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ dependencies = [
2626
"torch>=2.5.1",
2727
"pillow>=10.4.0",
2828
"transformers>=4.46.3",
29-
"docling>=2.15.0",
29+
"docling>=2.40.0",
3030
"llvmlite==0.43.0",
3131
"llama-index-llms-bedrock-converse>=0.4.10",
3232
"presidio-analyzer>=2.2.355",
@@ -57,6 +57,7 @@ license = {text = "APACHE"}
5757
override-dependencies = [
5858
"boto3-stubs==1.36.1",
5959
"botocore-stubs==1.36.1",
60+
"docling-ibm-models==3.7.0"
6061
]
6162

6263
[dependency-groups]

llm-service/uv.lock

Lines changed: 18 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)