Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
},
pybind11::arg("key"),
pybind11::arg("filename"),
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Load a document by key and filename.

Expand All @@ -268,6 +269,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
},
pybind11::arg("key"),
pybind11::arg("bytes_io"),
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Load a document by key from a BytesIO-like object.

Expand Down Expand Up @@ -310,6 +312,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
return self.get_annotations(key);
},
pybind11::arg("key"),
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Retrieve annotations for the document identified by its unique key and return them as JSON.

Expand All @@ -324,6 +327,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
return self.get_table_of_contents(key);
},
pybind11::arg("key"),
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Retrieve the table of contents for the document identified by its unique key and return it as JSON.

Expand All @@ -338,6 +342,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
return self.get_meta_xml(key);
},
pybind11::arg("key"),
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Retrieve the meta data in string or None.

Expand All @@ -357,6 +362,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
pybind11::arg("key"),
pybind11::arg("page_boundary") = "crop_box", // media_box
pybind11::arg("do_sanitization") = true, // media_box
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Parse the PDF document identified by its unique key and return a JSON representation.

Expand All @@ -380,6 +386,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
pybind11::arg("page"),
pybind11::arg("page_boundary") = "crop_box", // media_box
pybind11::arg("do_sanitization") = true, // media_box
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Parse a specific page of the PDF document identified by its unique key and return a JSON representation.

Expand Down Expand Up @@ -417,6 +424,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
pybind11::arg("enforce_same_font")=true,
pybind11::arg("space_width_factor_for_merge")=1.5,
pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Sanitize table cells with specified parameters and return the processed JSON.

Expand Down Expand Up @@ -457,6 +465,7 @@ Sanitize table cells with specified parameters and return the processed JSON.
pybind11::arg("enforce_same_font")=true,
pybind11::arg("space_width_factor_for_merge")=1.5,
pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
pybind11::call_guard<pybind11::gil_scoped_release>(),
R"(
Sanitize table cells in a given bounding box with specified parameters and return the processed JSON.

Expand Down
97 changes: 96 additions & 1 deletion docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Parser for PDF files"""

import asyncio
import hashlib
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple, Union
from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union

from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.page import (
Expand Down Expand Up @@ -34,6 +35,12 @@ def iterate_pages(
for page_no in range(self.number_of_pages()):
yield page_no + 1, self.get_page(page_no + 1)

async def iterate_pages_async(
    self,
) -> AsyncIterator[Tuple[int, SegmentedPdfPage]]:
    """Asynchronously yield ``(page_number, page)`` for every page.

    Page numbers are 1-based and run up to ``self.number_of_pages()``.
    Each page is obtained by awaiting ``get_page_async`` right before it
    is yielded, so pages are produced one at a time.
    """
    page_count = self.number_of_pages()
    for number in range(1, page_count + 1):
        segmented = await self.get_page_async(number)
        yield number, segmented

def __init__(
self,
parser: "pdf_parser_v2",
Expand Down Expand Up @@ -149,6 +156,36 @@ def get_page(

return SegmentedPdfPage()

async def get_page_async(
    self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> SegmentedPdfPage:
    """Return the segmented page ``page_no`` (1-based), parsing it if needed.

    A page already present in ``self._pages`` is returned as-is; in that
    case ``create_words`` and ``create_textlines`` are not consulted.
    Otherwise the page is parsed through ``asyncio.to_thread`` so the
    parser call does not run on the event loop thread, converted with
    ``_to_segmented_page``, stored in ``self._pages`` and returned.

    Raises:
        ValueError: if ``page_no`` is outside ``1..number_of_pages()``, or
            if the parser result contains no page entry.
    """
    if page_no in self._pages:
        return self._pages[page_no]

    if 1 <= page_no <= self.number_of_pages():
        parsed = await asyncio.to_thread(
            self._parser.parse_pdf_from_key_on_page,
            key=self._key,
            page=page_no - 1,  # the parser is called with a 0-based index
            page_boundary=self._boundary_type.value,  # Convert enum to string
            do_sanitization=False,
        )

        # Only one page is expected; the first entry is cached and returned.
        for raw_page in parsed["pages"]:
            segmented = self._to_segmented_page(
                page=raw_page["original"],
                create_words=create_words,
                create_textlines=create_textlines,
            )
            self._pages[page_no] = segmented  # put on cache
            return segmented

    raise ValueError(
        f"incorrect page_no: {page_no} for key={self._key} (min:1, max:{self.number_of_pages()})"
    )

def load_all_pages(self, create_words: bool = True, create_lines: bool = True):
doc_dict = self._parser.parse_pdf_from_key(
key=self._key, page_boundary=self._boundary_type, do_sanitization=False
Expand All @@ -163,6 +200,24 @@ def load_all_pages(self, create_words: bool = True, create_lines: bool = True):
create_textlines=create_lines,
) # put on cache

async def load_all_pages_async(
    self, create_words: bool = True, create_lines: bool = True
):
    """Parse the whole document and cache every page in ``self._pages``.

    The parser call is dispatched through ``asyncio.to_thread``. Pages are
    stored under 1-based keys in the order the parser returns them,
    overwriting any entry already cached under the same key.

    Args:
        create_words: forwarded to ``_to_segmented_page``.
        create_lines: forwarded to ``_to_segmented_page`` as
            ``create_textlines``.
    """
    parsed = await asyncio.to_thread(
        self._parser.parse_pdf_from_key,
        key=self._key,
        page_boundary=self._boundary_type.value,  # Convert enum to string
        do_sanitization=False,
    )

    # NOTE: the "original" lookup will need to change once the
    # original/sanitized split is removed from the C++ side.
    for page_no, raw_page in enumerate(parsed["pages"], start=1):
        self._pages[page_no] = self._to_segmented_page(
            raw_page["original"],
            create_words=create_words,
            create_textlines=create_lines,
        )  # put on cache

def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:

boundary_type: PdfPageBoundaryType = PdfPageBoundaryType(
Expand Down Expand Up @@ -515,6 +570,46 @@ def load(
else:
raise RuntimeError(f"Failed to load document with key {key}")

async def load_async(
    self,
    path_or_stream: Union[str, Path, BytesIO],
    lazy: bool = True,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfDocument:
    """Asynchronously load a PDF and return a ``PdfDocument`` handle.

    The blocking load calls are dispatched through ``asyncio.to_thread``.

    Args:
        path_or_stream: a file path (``str`` or ``Path``) or a ``BytesIO``
            holding the PDF bytes.
        lazy: when ``False``, all pages are parsed before returning;
            otherwise pages are parsed on demand.
        boundary_type: page boundary passed on to the ``PdfDocument``.

    Returns:
        The ``PdfDocument`` bound to the internal key of the loaded file.

    Raises:
        TypeError: if ``path_or_stream`` is not a ``str``, ``Path`` or
            ``BytesIO``.
        RuntimeError: if the underlying load reports failure.
    """
    if isinstance(path_or_stream, str):
        path_or_stream = Path(path_or_stream)

    if isinstance(path_or_stream, Path):
        key = f"key={str(path_or_stream)}"  # use filepath as internal handle
        success = await asyncio.to_thread(
            self._load_document, key=key, filename=str(path_or_stream)
        )

    elif isinstance(path_or_stream, BytesIO):
        hasher = hashlib.sha256(usedforsecurity=False)

        # Hash in fixed-size chunks, reading from the stream's current
        # position, then rewind so the loader sees the stream from byte 0.
        while chunk := path_or_stream.read(8192):
            hasher.update(chunk)
        path_or_stream.seek(0)
        digest = hasher.hexdigest()

        key = f"key={digest}"  # use SHA-256 digest as internal handle
        success = await asyncio.to_thread(
            self._load_document_from_bytesio, key=key, data=path_or_stream
        )

    else:
        # Without this branch `key`/`success` would be unbound below and the
        # caller would get an opaque UnboundLocalError.
        raise TypeError(
            "path_or_stream must be a str, Path or BytesIO, "
            f"got {type(path_or_stream).__name__}"
        )

    if not success:
        raise RuntimeError(f"Failed to load document with key {key}")

    result_doc = PdfDocument(
        parser=self.parser, key=key, boundary_type=boundary_type
    )
    if not lazy:  # eagerly parse the pages at init time if desired
        await result_doc.load_all_pages_async()

    return result_doc

def _load_document(self, key: str, filename: str) -> bool:
"""Load a document by key and filename.

Expand Down
Loading
Loading