From 7f010bc2bad773c58f79924667edf0a9ef249664 Mon Sep 17 00:00:00 2001
From: Asksksn
Date: Thu, 26 Mar 2026 16:59:24 +0800
Subject: [PATCH] fix(paddleocr): load all PDF pages for image cropping instead of first 100

The __images__ method defaulted to page_to=100, but the PaddleOCR API
processes all pages of the PDF. For PDFs with more than 100 pages, page
indices beyond 99 were rejected as out of range during crop validation.

Closes #13803

Co-Authored-By: Claude Opus 4.6
---
 deepdoc/parser/paddleocr_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py
index 56ecdfb6078..c2ee805d451 100644
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -422,7 +422,7 @@ def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
         """Convert API response to table tuples."""
         return []
 
-    def __images__(self, fnm, page_from=0, page_to=100, callback=None):
+    def __images__(self, fnm, page_from=0, page_to=10**9, callback=None):
         """Generate page images from PDF for cropping."""
         self.page_from = page_from
         self.page_to = page_to