docling-project · cau-git · Jun 24, 2025 · Jun 24, 2025 · Jun 24, 2025 · Jun 24, 2025
diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp
@@ -252,6 +252,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 },
 	 pybind11::arg("key"),
 	 pybind11::arg("filename"),
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Load a document by key and filename.
 
@@ -268,6 +269,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 },
 	 pybind11::arg("key"),
 	 pybind11::arg("bytes_io"),
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Load a document by key from a BytesIO-like object.
 
@@ -310,6 +312,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	   return self.get_annotations(key);
 	 },
 	 pybind11::arg("key"),
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Retrieve annotations for the document identified by its unique key and return them as JSON.
 
@@ -324,6 +327,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	   return self.get_table_of_contents(key);
 	 },
 	 pybind11::arg("key"),
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Retrieve the table of contents for the document identified by its unique key and return it as JSON.
 
@@ -338,6 +342,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	   return self.get_meta_xml(key);
 	 },
 	 pybind11::arg("key"),
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Retrieve the meta data in string or None.
 
@@ -357,6 +362,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 pybind11::arg("key"),
 	 pybind11::arg("page_boundary") = "crop_box", // media_box
 	 pybind11::arg("do_sanitization") = true, // media_box
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Parse the PDF document identified by its unique key and return a JSON representation.
 
@@ -380,6 +386,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 pybind11::arg("page"),
 	 pybind11::arg("page_boundary") = "crop_box", // media_box
 	 pybind11::arg("do_sanitization") = true, // media_box
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Parse a specific page of the PDF document identified by its unique key and return a JSON representation.
 
@@ -417,6 +424,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 pybind11::arg("enforce_same_font")=true,
 	 pybind11::arg("space_width_factor_for_merge")=1.5,
 	 pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
 Sanitize table cells with specified parameters and return the processed JSON.
 
@@ -457,6 +465,7 @@ Sanitize table cells with specified parameters and return the processed JSON.
 	 pybind11::arg("enforce_same_font")=true,
 	 pybind11::arg("space_width_factor_for_merge")=1.5,
 	 pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
+         pybind11::call_guard<pybind11::gil_scoped_release>(),
 	 R"(
     Sanitize table cells in a given bounding box with specified parameters and return the processed JSON.
 

diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py
@@ -1,9 +1,10 @@
 """Parser for PDF files"""
 
+import asyncio
 import hashlib
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Tuple, Union
+from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
 
 from docling_core.types.doc.base import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import (
@@ -34,6 +35,12 @@ def iterate_pages(
         for page_no in range(self.number_of_pages()):
             yield page_no + 1, self.get_page(page_no + 1)
 
+    async def iterate_pages_async(
+        self,
+    ) -> AsyncIterator[Tuple[int, SegmentedPdfPage]]:
+        for page_no in range(self.number_of_pages()):
+            yield page_no + 1, await self.get_page_async(page_no + 1)
+
     def __init__(
         self,
         parser: "pdf_parser_v2",
@@ -149,6 +156,36 @@ def get_page(
 
         return SegmentedPdfPage()
 
+    async def get_page_async(
+        self, page_no: int, create_words: bool = True, create_textlines: bool = True
+    ) -> SegmentedPdfPage:
+        if page_no in self._pages.keys():
+            return self._pages[page_no]
+        else:
+            if 1 <= page_no <= self.number_of_pages():
+                doc_dict = await asyncio.to_thread(
+                    self._parser.parse_pdf_from_key_on_page,
+                    key=self._key,
+                    page=page_no - 1,
+                    page_boundary=self._boundary_type.value,  # Convert enum to string
+                    do_sanitization=False,
+                )
+
+                for pi, page in enumerate(
+                    doc_dict["pages"]
+                ):  # only one page is expected
+
+                    self._pages[page_no] = self._to_segmented_page(
+                        page=page["original"],
+                        create_words=create_words,
+                        create_textlines=create_textlines,
+                    )  # put on cache
+                    return self._pages[page_no]
+
+        raise ValueError(
+            f"incorrect page_no: {page_no} for key={self._key} (min:1, max:{self.number_of_pages()})"
+        )
+
     def load_all_pages(self, create_words: bool = True, create_lines: bool = True):
         doc_dict = self._parser.parse_pdf_from_key(
             key=self._key, page_boundary=self._boundary_type, do_sanitization=False
@@ -163,6 +200,24 @@ def load_all_pages(self, create_words: bool = True, create_lines: bool = True):
                 create_textlines=create_lines,
             )  # put on cache
 
+    async def load_all_pages_async(
+        self, create_words: bool = True, create_lines: bool = True
+    ):
+        doc_dict = await asyncio.to_thread(
+            self._parser.parse_pdf_from_key,
+            key=self._key,
+            page_boundary=self._boundary_type.value,  # Convert enum to string
+            do_sanitization=False,
+        )
+
+        for pi, page in enumerate(doc_dict["pages"]):
+            # will need to be changed once we remove the original/sanitized from C++
+            self._pages[pi + 1] = self._to_segmented_page(
+                page["original"],
+                create_words=create_words,
+                create_textlines=create_lines,
+            )  # put on cache
+
     def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:
 
         boundary_type: PdfPageBoundaryType = PdfPageBoundaryType(
@@ -515,6 +570,46 @@ def load(
         else:
             raise RuntimeError(f"Failed to load document with key {key}")
 
+    async def load_async(
+        self,
+        path_or_stream: Union[str, Path, BytesIO],
+        lazy: bool = True,
+        boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
+    ) -> PdfDocument:
+
+        if isinstance(path_or_stream, str):
+            path_or_stream = Path(path_or_stream)
+
+        if isinstance(path_or_stream, Path):
+            key = f"key={str(path_or_stream)}"  # use filepath as internal handle
+            success = await asyncio.to_thread(
+                self._load_document, key=key, filename=str(path_or_stream)
+            )
+
+        elif isinstance(path_or_stream, BytesIO):
+            hasher = hashlib.sha256(usedforsecurity=False)
+
+            while chunk := path_or_stream.read(8192):
+                hasher.update(chunk)
+            path_or_stream.seek(0)
+            hash = hasher.hexdigest()
+
+            key = f"key={hash}"  # use md5 hash as internal handle
+            success = await asyncio.to_thread(
+                self._load_document_from_bytesio, key=key, data=path_or_stream
+            )
+
+        if success:
+            result_doc = PdfDocument(
+                parser=self.parser, key=key, boundary_type=boundary_type
+            )
+            if not lazy:  # eagerly parse the pages at init time if desired
+                await result_doc.load_all_pages_async()
+
+            return result_doc
+        else:
+            raise RuntimeError(f"Failed to load document with key {key}")
+
     def _load_document(self, key: str, filename: str) -> bool:
         """Load a document by key and filename.