42
42
43
43
from docling .datamodel .document import ConversionResult
44
44
from docling .document_converter import DocumentConverter
45
- from docling_core .transforms .chunker .hierarchical_chunker import HierarchicalChunker
46
45
from docling_core .transforms .chunker .base import BaseChunk
47
- from docling_core .transforms .serializer .base import SerializationResult
48
- from docling_core .transforms .serializer .markdown import MarkdownDocSerializer
46
+ from docling_core .transforms .chunker .hybrid_chunker import HybridChunker
49
47
from llama_index .core .schema import Document , TextNode , NodeRelationship
50
48
51
49
from .base_reader import BaseReader
@@ -67,20 +65,16 @@ def load_chunks(self, file_path: Path) -> ChunksResult:
67
65
converted_chunks : List [TextNode ] = []
68
66
logger .debug (f"{ file_path = } " )
69
67
docling_doc : ConversionResult = DocumentConverter ().convert (file_path )
70
- chunky_chunks = HierarchicalChunker (serializer_provider = MarkdownSerializerProvider ()).chunk (docling_doc .document )
68
+ chunky_chunks = HybridChunker (serializer_provider = MarkdownSerializerProvider ()).chunk (docling_doc .document )
71
69
chunky_chunk : BaseChunk
72
- serializer = MarkdownDocSerializer (doc = docling_doc .document )
73
70
for i , chunky_chunk in enumerate (chunky_chunks ):
74
- text = ""
75
71
page_number : int = 0
76
72
if not hasattr (chunky_chunk .meta , "doc_items" ):
77
73
logger .warning (f"Chunk { i } is empty, skipping" )
78
74
continue
79
75
for item in chunky_chunk .meta .doc_items :
80
76
page_number = item .prov [0 ].page_no if item .prov else None
81
- item_ser : SerializationResult = serializer .serialize (item = item )
82
- text += item_ser .text
83
- node = TextNode (text = text )
77
+ node = TextNode (text = chunky_chunk .text )
84
78
if page_number :
85
79
node .metadata ["page_number" ] = page_number
86
80
node .metadata ["file_name" ] = document .metadata ["file_name" ]
0 commit comments