Skip to content

Commit 1d3ea7d

Browse files
committed
feat: add support for tall pages in pdfs by splitting them horizontally
1 parent c1653da commit 1d3ea7d

File tree

1 file changed

+208
-2
lines changed

1 file changed

+208
-2
lines changed

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 208 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import copy
45
import io
56
import json
67
import logging
@@ -13,6 +14,8 @@
1314
from pathlib import Path
1415
from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO
1516

17+
from PIL import Image
18+
1619
import aiofiles
1720
import httpx
1821
import nest_asyncio # type: ignore
@@ -55,6 +58,7 @@
5558
MAX_PAGES_PER_SPLIT = 20
5659
HI_RES_STRATEGY = 'hi_res'
5760
MAX_PAGE_LENGTH = 4000
61+
TALL_PAGE_ASPECT_RATIO_THRESHOLD = 1.5
5862

5963

6064
async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
@@ -304,6 +308,10 @@ def before_request(
304308
return request
305309

306310
pdf = pdf_utils.check_pdf(pdf)
311+
312+
original_page_count = len(pdf.pages)
313+
pdf = self._split_tall_pages(pdf)
314+
image_processing_performed = (len(pdf.pages) != original_page_count)
307315

308316
starting_page_number = form_utils.get_starting_page_number(
309317
form_data,
@@ -349,10 +357,30 @@ def before_request(
349357
num_pages=page_count, concurrency_level=concurrency_level
350358
)
351359

352-
# If the doc is small enough, and we aren't slicing it with a page range:
360+
# If the doc is small enough,
361+
# and we aren't slicing it with a page range,
362+
# and no image processing (horizontal slicing) was performed:
353363
# do not split, just continue with the original request
354-
if split_size >= page_count and page_count == len(pdf.pages):
364+
if split_size >= page_count and page_count == len(pdf.pages) and not image_processing_performed:
355365
return request
366+
367+
# If image processing was performed, we need to send the processed PDF even for single pages
368+
if image_processing_performed and len(pdf.pages) == 1:
369+
# Create a single chunk with the processed PDF
370+
processed_pdf_data = io.BytesIO()
371+
pdf_writer = PdfWriter()
372+
pdf_writer.add_page(pdf.pages[0])
373+
pdf_writer.write(processed_pdf_data)
374+
processed_pdf_data.seek(0)
375+
376+
# Create new request with processed PDF
377+
processed_request = request_utils.create_pdf_chunk_request(
378+
form_data=form_data,
379+
pdf_chunk=(processed_pdf_data, 1),
380+
filename=pdf_file_meta["filename"],
381+
original_request=request,
382+
)
383+
return processed_request
356384

357385
pdf = self._trim_large_pages(pdf, form_data)
358386

@@ -445,6 +473,184 @@ async def call_api_partial(
445473

446474
return response
447475

476+
def _split_tall_pages(self, pdf: PdfReader) -> PdfReader:
    """Split disproportionately tall pages into several shorter pages.

    A page counts as "tall" when its media-box height divided by its width
    exceeds ``TALL_PAGE_ASPECT_RATIO_THRESHOLD``. Each tall page is first
    split via ``_split_page_with_image_processing``; if that raises or
    yields fewer than two parts, the page is split by cropping the media
    box instead (``_add_media_box_split_pages``).

    Args:
        pdf: The reader whose pages are inspected.

    Returns:
        ``pdf`` itself when no page needed splitting; otherwise a new
        ``PdfReader`` backed by an in-memory buffer that holds the
        rewritten document.
    """
    writer = PdfWriter()
    any_page_split = False

    for page in pdf.pages:
        height = float(page.mediabox.height)
        width = float(page.mediabox.width)

        if width == 0:  # Avoid division by zero for invalid pages
            writer.add_page(page)
            continue

        aspect_ratio = height / width
        logger.info(
            "Page aspect ratio: %.2f (threshold: %s)",
            aspect_ratio,
            TALL_PAGE_ASPECT_RATIO_THRESHOLD,
        )

        if aspect_ratio <= TALL_PAGE_ASPECT_RATIO_THRESHOLD:
            writer.add_page(page)
            continue

        any_page_split = True
        num_splits = math.ceil(aspect_ratio / TALL_PAGE_ASPECT_RATIO_THRESHOLD)
        logger.info("Target splits: %s parts", num_splits)

        # Only the image-processing call is guarded. Keeping the writer
        # mutations outside the try block means a failure while adding
        # pages can no longer trigger the fallback on top of pages that
        # were already written (which would duplicate page content).
        try:
            split_pages = self._split_page_with_image_processing(page, num_splits)
        except Exception as e:  # best-effort: any failure selects the fallback
            logger.error("Image processing exception: %s", e)
            split_pages = None
        else:
            if not split_pages or len(split_pages) <= 1:
                logger.warning("Image processing failed - no valid splits returned")
                split_pages = None

        if split_pages:
            logger.info("Image processing succeeded: %s parts", len(split_pages))
            for split_page in split_pages:
                writer.add_page(split_page)
        else:
            self._add_media_box_split_pages(writer, page, num_splits, height)

    if not any_page_split:
        return pdf

    # At least one page was split: serialize the rewritten document and
    # hand back a reader over it.
    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return PdfReader(buffer)
522+
523+
def _split_page_with_image_processing(self, page, num_splits):
    """Split a page into horizontal strips of its first decodable image.

    The page's ``/XObject`` resources are scanned for an entry whose
    ``/Subtype`` is ``/Image``. The first one that can be extracted and
    opened with PIL is optionally downscaled, cut into ``num_splits``
    strips from top to bottom, and each strip is turned into a page via
    ``_create_page_with_image``. The returned pages contain only the image
    strips; no other content of ``page`` is carried over.

    Args:
        page: The page to split (supports ``in`` / ``[]`` lookups for
            ``/Resources`` and ``/XObject``).
        num_splits: Number of strips to produce.

    Returns:
        A list of exactly ``num_splits`` pages, or ``None`` when the page
        has no usable image, the image is too short to be cut into
        ``num_splits`` strips, or a strip could not be converted to a page.
    """
    if num_splits < 1:
        return None

    if "/Resources" not in page or "/XObject" not in page["/Resources"]:
        return None

    xobjects = page["/Resources"]["/XObject"]

    # Pixel budget, per the original author's note: the API limit is ~179M
    # pixels, and 80M pixels in total is targeted as a safety margin.
    target_pixels_total = 80_000_000
    target_pixels_per_split = target_pixels_total // num_splits

    for obj in xobjects.values():
        if hasattr(obj, 'get_object'):
            obj = obj.get_object()

        if obj.get("/Subtype") != "/Image":
            continue

        image_data = self._extract_image_data(obj)
        if not image_data:
            continue

        try:
            pil_image = Image.open(io.BytesIO(image_data))
        except Exception:
            # Not a format PIL can open; try the next XObject.
            continue

        # Use the decoded dimensions rather than the declared /Width and
        # /Height entries, which may be missing from the dictionary.
        original_pixels = pil_image.width * pil_image.height

        if original_pixels > target_pixels_per_split:
            scale_factor = (target_pixels_per_split / original_pixels) ** 0.5
            # Clamp to one pixel so extreme aspect ratios cannot produce a
            # zero-sized target.
            new_width = max(1, int(pil_image.width * scale_factor))
            new_height = max(1, int(pil_image.height * scale_factor))
            pil_image = pil_image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        strip_height = pil_image.height // num_splits
        if strip_height == 0:
            # Fewer pixel rows than strips: nothing sensible to cut.
            return None

        split_pages = []
        for i in range(num_splits):
            top = i * strip_height
            # The last strip absorbs the remainder rows left over by the
            # integer division so the bottom of the image is not dropped.
            if i == num_splits - 1:
                bottom = pil_image.height
            else:
                bottom = top + strip_height

            cropped_image = pil_image.crop((0, top, pil_image.width, bottom))

            new_page = self._create_page_with_image(cropped_image, page)
            if not new_page:
                return None
            split_pages.append(new_page)

        return split_pages

    return None
585+
586+
def _extract_image_data(self, image_obj):
    """Extract the byte payload of a PDF image stream object.

    The stream's ``/Filter`` entry decides how the stored bytes are
    returned:

    * ``/FlateDecode`` (given as a bare name or as a one-element array):
      the bytes are zlib-decompressed.
    * anything else, including ``/DCTDecode``, ``/JPXDecode``, multi-filter
      arrays, or no filter at all: the stored bytes are returned unchanged.

    Args:
        image_obj: A mapping-like stream object exposing its stored bytes
            as ``_data``.

    Returns:
        The extracted bytes, or ``None`` if anything goes wrong (this is a
        best-effort helper; callers treat a falsy result as "skip").
    """
    try:
        import zlib

        stream_filter = image_obj["/Filter"] if "/Filter" in image_obj else None

        # /Filter may be a single name or an array of names; a one-element
        # array is equivalent to the bare name.
        if isinstance(stream_filter, (list, tuple)) and len(stream_filter) == 1:
            stream_filter = stream_filter[0]

        stored_bytes = image_obj._data

        if stream_filter == "/FlateDecode":
            # NOTE(review): the inflated bytes are presumably raw samples
            # rather than a self-describing image file - confirm callers
            # can decode them.
            return zlib.decompress(stored_bytes)

        return stored_bytes

    except Exception:
        # Deliberately broad: extraction failure only means this image is
        # skipped by the caller.
        return None
609+
610+
def _create_page_with_image(self, pil_image, original_page):
    """Create a new PDF page containing the given PIL image.

    The image is converted to RGB when it is in any other mode, saved as a
    one-page PDF into an in-memory buffer, and the first page of that PDF
    is returned.

    Args:
        pil_image: The PIL image to place on the page.
        original_page: Not used by this method; kept so existing callers
            keep working.

    Returns:
        The first page of the generated PDF, or ``None`` when the PDF has
        no pages or any step fails.
    """
    try:
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')

        img_buffer = io.BytesIO()
        pil_image.save(img_buffer, format='PDF')
        img_buffer.seek(0)

        img_pdf = PdfReader(img_buffer)
        if not img_pdf.pages:
            return None

        return img_pdf.pages[0]

    except Exception as e:
        # Still best-effort (the caller falls back on None), but record why
        # the conversion failed instead of discarding the error silently.
        logger.warning("Failed to create PDF page from image: %s", e)
        return None
633+
634+
def _add_media_box_split_pages(self, writer, page, num_splits, page_height):
    """Fallback split: add ``num_splits`` copies of ``page`` cropped via the media box.

    Each copy keeps the page's left and right edges and shows one
    horizontal band of height ``page_height / num_splits``, starting at the
    top of the media box and moving downwards. ``page`` itself is not
    modified; every band is a deep copy.

    Args:
        writer: Destination object; each band is passed to ``writer.add_page``.
        page: The page to split.
        num_splits: Number of bands. Values below 1 are treated as 1 so the
            page is always kept (previously 0 raised ``ZeroDivisionError``
            and a negative value added nothing).
        page_height: Total height to distribute across the bands.
    """
    num_splits = max(1, int(num_splits))
    split_height = float(page_height) / num_splits

    # Read the source coordinates once, as plain floats, matching how the
    # page dimensions are handled in _split_tall_pages.
    top_edge = float(page.mediabox.top)
    left_edge = float(page.mediabox.left)
    right_edge = float(page.mediabox.right)

    for i in range(num_splits):
        # Create a deep copy to modify the media box independently
        new_page = copy.deepcopy(page)

        # Vertical extent of band i, counted from the top of the page
        top_coord = top_edge - (i * split_height)
        bottom_coord = top_edge - ((i + 1) * split_height)

        # Set the new media box to crop the page
        new_page.mediabox.lower_left = (left_edge, bottom_coord)
        new_page.mediabox.lower_right = (right_edge, bottom_coord)
        new_page.mediabox.upper_left = (left_edge, top_coord)
        new_page.mediabox.upper_right = (right_edge, top_coord)

        writer.add_page(new_page)
653+
448654
def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
449655
if form_data['strategy'] != HI_RES_STRATEGY:
450656
return pdf

0 commit comments

Comments
 (0)