|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
3 | 3 | import asyncio
|
| 4 | +import copy |
4 | 5 | import io
|
5 | 6 | import json
|
6 | 7 | import logging
|
|
13 | 14 | from pathlib import Path
|
14 | 15 | from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO
|
15 | 16 |
|
| 17 | +from PIL import Image |
| 18 | + |
16 | 19 | import aiofiles
|
17 | 20 | import httpx
|
18 | 21 | import nest_asyncio # type: ignore
|
|
55 | 58 | MAX_PAGES_PER_SPLIT = 20
|
56 | 59 | HI_RES_STRATEGY = 'hi_res'
|
57 | 60 | MAX_PAGE_LENGTH = 4000
|
| 61 | +TALL_PAGE_ASPECT_RATIO_THRESHOLD = 1.5 |
58 | 62 |
|
59 | 63 |
|
60 | 64 | async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
|
@@ -304,6 +308,10 @@ def before_request(
|
304 | 308 | return request
|
305 | 309 |
|
306 | 310 | pdf = pdf_utils.check_pdf(pdf)
|
| 311 | + |
| 312 | + original_page_count = len(pdf.pages) |
| 313 | + pdf = self._split_tall_pages(pdf) |
| 314 | + image_processing_performed = (len(pdf.pages) != original_page_count) |
307 | 315 |
|
308 | 316 | starting_page_number = form_utils.get_starting_page_number(
|
309 | 317 | form_data,
|
@@ -349,10 +357,30 @@ def before_request(
|
349 | 357 | num_pages=page_count, concurrency_level=concurrency_level
|
350 | 358 | )
|
351 | 359 |
|
352 |
| - # If the doc is small enough, and we aren't slicing it with a page range: |
| 360 | + # If the doc is small enough, |
| 361 | + # and we aren't slicing it with a page range, |
| 362 | + # and no image processing (horizontal slicing) was performed: |
353 | 363 | # do not split, just continue with the original request
|
354 |
| - if split_size >= page_count and page_count == len(pdf.pages): |
| 364 | + if split_size >= page_count and page_count == len(pdf.pages) and not image_processing_performed: |
355 | 365 | return request
|
| 366 | + |
| 367 | + # If image processing was performed, we need to send the processed PDF even for single pages |
| 368 | + if image_processing_performed and len(pdf.pages) == 1: |
| 369 | + # Create a single chunk with the processed PDF |
| 370 | + processed_pdf_data = io.BytesIO() |
| 371 | + pdf_writer = PdfWriter() |
| 372 | + pdf_writer.add_page(pdf.pages[0]) |
| 373 | + pdf_writer.write(processed_pdf_data) |
| 374 | + processed_pdf_data.seek(0) |
| 375 | + |
| 376 | + # Create new request with processed PDF |
| 377 | + processed_request = request_utils.create_pdf_chunk_request( |
| 378 | + form_data=form_data, |
| 379 | + pdf_chunk=(processed_pdf_data, 1), |
| 380 | + filename=pdf_file_meta["filename"], |
| 381 | + original_request=request, |
| 382 | + ) |
| 383 | + return processed_request |
356 | 384 |
|
357 | 385 | pdf = self._trim_large_pages(pdf, form_data)
|
358 | 386 |
|
@@ -445,6 +473,184 @@ async def call_api_partial(
|
445 | 473 |
|
446 | 474 | return response
|
447 | 475 |
|
def _split_tall_pages(self, pdf: PdfReader) -> PdfReader:
    """Return *pdf* with any disproportionately tall pages split into strips.

    A page whose height/width ratio exceeds TALL_PAGE_ASPECT_RATIO_THRESHOLD
    is cut into ceil(ratio / threshold) horizontal parts. Splitting is first
    attempted by re-rendering the page's embedded image
    (``_split_page_with_image_processing``); if that returns nothing or
    raises, we fall back to cropping via media-box adjustment
    (``_add_media_box_split_pages``).

    Args:
        pdf: Parsed source document.

    Returns:
        The original ``pdf`` unchanged when no page needed splitting,
        otherwise a new ``PdfReader`` built from the rewritten pages.
    """
    writer = PdfWriter()
    any_page_split = False

    for page in pdf.pages:
        height = float(page.mediabox.height)
        width = float(page.mediabox.width)

        # Degenerate geometry (zero width): pass the page through untouched.
        if width == 0:
            writer.add_page(page)
            continue

        aspect_ratio = height / width
        # Lazy %-style args so formatting is skipped when INFO is disabled.
        logger.info(
            "Page aspect ratio: %.2f (threshold: %s)",
            aspect_ratio,
            TALL_PAGE_ASPECT_RATIO_THRESHOLD,
        )

        if aspect_ratio <= TALL_PAGE_ASPECT_RATIO_THRESHOLD:
            writer.add_page(page)
            continue

        any_page_split = True
        num_splits = math.ceil(aspect_ratio / TALL_PAGE_ASPECT_RATIO_THRESHOLD)
        logger.info("Target splits: %d parts", num_splits)

        try:
            split_pages = self._split_page_with_image_processing(page, num_splits)
            if split_pages and len(split_pages) > 1:
                logger.info("Image processing succeeded: %d parts", len(split_pages))
                for split_page in split_pages:
                    writer.add_page(split_page)
            else:
                logger.warning("Image processing failed - no valid splits returned")
                self._add_media_box_split_pages(writer, page, num_splits, height)
        except Exception:
            # Best-effort: keep the traceback in the log, then fall back to
            # the media-box approach instead of failing the whole request.
            logger.exception("Image processing exception; using media-box split")
            self._add_media_box_split_pages(writer, page, num_splits, height)

    if not any_page_split:
        return pdf

    # Serialize the rewritten document and re-parse it so callers receive
    # the same PdfReader type they passed in.
    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return PdfReader(buffer)
| 522 | + |
def _split_page_with_image_processing(self, page, num_splits):
    """Split *page* into ``num_splits`` stacked pages by slicing its bitmap.

    Finds the first usable image XObject on the page, decodes it with PIL,
    downscales it if needed to stay under the API pixel budget, crops it
    into ``num_splits`` horizontal strips, and wraps each strip in a new
    single-image PDF page.

    Args:
        page: pypdf page object to split.
        num_splits: Number of horizontal strips to produce.

    Returns:
        A list of exactly ``num_splits`` new page objects, or ``None`` when
        the page has no decodable image or any strip fails to convert.
    """
    if "/Resources" not in page or "/XObject" not in page["/Resources"]:
        return None

    xobjects = page["/Resources"]["/XObject"]

    # Only the values are needed; resolve indirect references as we go.
    for obj in xobjects.values():
        if hasattr(obj, "get_object"):
            obj = obj.get_object()

        if obj.get("/Subtype") != "/Image":
            continue

        width = int(obj.get("/Width", 0))
        height = int(obj.get("/Height", 0))
        original_pixels = width * height

        image_data = self._extract_image_data(obj)
        if not image_data:
            continue

        try:
            pil_image = Image.open(io.BytesIO(image_data))
        except Exception:
            # Stream PIL cannot sniff (e.g. raw samples): try the next
            # XObject instead of aborting the whole page.
            continue

        # Keep combined output well under the API's ~179M pixel cap.
        target_pixels_total = 80_000_000
        target_pixels_per_split = target_pixels_total // num_splits

        # Uniform downscale factor when the source exceeds the budget.
        scale_factor = 1.0
        if original_pixels > target_pixels_per_split:
            scale_factor = (target_pixels_per_split / original_pixels) ** 0.5

        new_width = int(pil_image.width * scale_factor)
        new_height = int(pil_image.height * scale_factor)
        pil_image = pil_image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        strip_height = pil_image.height // num_splits
        split_pages = []

        for i in range(num_splits):
            top = i * strip_height
            # The final strip absorbs the integer-division remainder so no
            # pixel rows at the bottom of the image are dropped.
            bottom = pil_image.height if i == num_splits - 1 else (i + 1) * strip_height

            cropped_image = pil_image.crop((0, top, pil_image.width, bottom))

            new_page = self._create_page_with_image(cropped_image, page)
            if new_page is None:
                return None
            split_pages.append(new_page)

        if len(split_pages) == num_splits:
            return split_pages

    return None
| 585 | + |
| 586 | + def _extract_image_data(self, image_obj): |
| 587 | + """Extract raw image data from a PDF image object.""" |
| 588 | + try: |
| 589 | + if "/Filter" in image_obj: |
| 590 | + filter_type = image_obj["/Filter"] |
| 591 | + |
| 592 | + if filter_type in ["/DCTDecode", "/JPXDecode"]: |
| 593 | + # JPEG or JPEG2000 - data is already compressed |
| 594 | + data = image_obj._data |
| 595 | + return data |
| 596 | + elif filter_type == "/FlateDecode": |
| 597 | + # PNG-like compression |
| 598 | + import zlib |
| 599 | + compressed_data = image_obj._data |
| 600 | + data = zlib.decompress(compressed_data) |
| 601 | + return data |
| 602 | + |
| 603 | + # Fallback to raw data |
| 604 | + data = image_obj._data |
| 605 | + return data |
| 606 | + |
| 607 | + except Exception as e: |
| 608 | + return None |
| 609 | + |
def _create_page_with_image(self, pil_image, original_page):
    """Wrap *pil_image* in a fresh single-page PDF and return that page.

    The image is converted to RGB (PIL's PDF writer rejects most other
    modes), saved as a one-page PDF, and re-read with pypdf so the caller
    gets a page object it can add to a writer.

    Args:
        pil_image: PIL image to embed.
        original_page: Source page (currently unused; kept for interface
            stability).

    Returns:
        The new pypdf page object, or ``None`` on any conversion failure.
    """
    try:
        img_buffer = io.BytesIO()

        # PDF export requires RGB for most source modes (RGBA, P, L, ...).
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')

        pil_image.save(img_buffer, format='PDF')
        img_buffer.seek(0)

        img_pdf = PdfReader(img_buffer)
        if not img_pdf.pages:
            return None
        return img_pdf.pages[0]

    except Exception:
        # Best-effort conversion: signal failure via the return value, but
        # leave a trace for debugging instead of swallowing silently.
        logger.debug("Failed to convert image strip to a PDF page", exc_info=True)
        return None
| 633 | + |
def _add_media_box_split_pages(self, writer, page, num_splits, page_height):
    """Fallback splitter: crop *page* into ``num_splits`` strips via media box.

    Each strip is a deep copy of the original page whose media box is
    narrowed to one horizontal band; viewers clip content outside the box.
    Strips are appended to *writer* top-to-bottom.

    Args:
        writer: Destination ``PdfWriter`` receiving the strips.
        page: Source page to crop.
        num_splits: Number of equal-height strips.
        page_height: Height of the source page in PDF units.
    """
    split_height = page_height / num_splits

    # Loop-invariant box edges of the source page.
    left = page.mediabox.left
    right = page.mediabox.right
    top = page.mediabox.top

    for i in range(num_splits):
        # Deep copy so each strip's media box can be edited independently.
        new_page = copy.deepcopy(page)

        top_coord = top - i * split_height
        bottom_coord = top - (i + 1) * split_height

        # Two opposite corners fully determine the rectangle; the original
        # assigned all four corners, writing every coordinate twice.
        new_page.mediabox.lower_left = (left, bottom_coord)
        new_page.mediabox.upper_right = (right, top_coord)

        writer.add_page(new_page)
448 | 654 | def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
|
449 | 655 | if form_data['strategy'] != HI_RES_STRATEGY:
|
450 | 656 | return pdf
|
|
0 commit comments