diff --git a/_test_unstructured_client/unit/test_pdf_utils.py b/_test_unstructured_client/unit/test_pdf_utils.py
index 92b3e79f..7f78ffac 100644
--- a/_test_unstructured_client/unit/test_pdf_utils.py
+++ b/_test_unstructured_client/unit/test_pdf_utils.py
@@ -5,7 +5,7 @@
 import pytest
 from pypdf import PdfReader
 
-from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
+from unstructured_client._hooks.custom.pdf_utils import check_pdf, read_pdf, PDFValidationError
 from _test_unstructured_client.unit_utils import sample_docs_path
 
 
@@ -23,6 +23,7 @@ def test_check_pdf_with_valid_pdf():
     assert isinstance(result, PdfReader)
 
 
+# TODO(klaijan) - add pdf file when file is ready
 @pytest.mark.parametrize(
     ("pdf_name", "expected_error_message"),
     [
@@ -51,3 +52,15 @@ def test_check_pdf_raises_pdf_validation_error(
         check_pdf(pdf)
 
     assert exc_info.value.message == expected_error_message
+
+
+# TODO(klaijan) - uncomment when file is ready
+"""
+def test_check_read_pdf():
+    pdf_path = sample_docs_path(".pdf")
+    with open(pdf_path, "rb") as f:
+        pdf_content = f.read()
+    pdf = read_pdf(pdf_content)
+    result = check_pdf(pdf)
+    assert isinstance(result, PdfReader)
+"""
\ No newline at end of file
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
index 8fb5916a..66fe9473 100644
--- a/src/unstructured_client/_hooks/custom/pdf_utils.py
+++ b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -26,6 +26,59 @@ def __init__(self, message: str):
 
 
 def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
+    """Reads the given PDF file, recovering a PDF embedded in other data.
+
+    The input is first parsed as-is. If that fails, the raw bytes are searched
+    for the ``%PDF-`` header and the last ``%%EOF`` trailer, and only that span
+    is parsed.
+
+    Args:
+        pdf_file: The PDF content, as bytes or a binary file-like object.
+
+    Returns:
+        The PdfReader object if a PDF could be read and validated, None otherwise.
+
+    Raises:
+        IOError: If pdf_file is neither bytes nor readable, or reading it fails.
+    """
+    reader = read_pdf_raw(pdf_file=pdf_file)
+    if reader is not None:
+        return reader
+
+    # TODO(klaijan) - remove once debugged
+    pdf_logger.debug("Primary PdfReader parse failed, attempting multipart and raw extraction fallbacks.")
+
+    # Load the raw bytes so the PDF markers can be searched for directly.
+    if isinstance(pdf_file, bytes):
+        raw = pdf_file
+    elif hasattr(pdf_file, "read"):
+        try:
+            pdf_file.seek(0)
+            raw = pdf_file.read()
+        except Exception as e:
+            raise IOError(f"Failed to read file stream: {e}") from e
+    else:
+        raise IOError("Expected bytes or a file-like object with 'read()' method")
+
+    # Slice from the %PDF- header to the last %%EOF trailer. The last trailer is
+    # used because incrementally updated PDFs contain several; when none is
+    # found, keep everything after the header and let the non-strict parser try.
+    eof_marker = b"%%EOF"
+    try:
+        start = raw.find(b"%PDF-")
+        if start == -1:
+            return None
+        eof = raw.rfind(eof_marker, start)
+        end = len(raw) if eof == -1 else eof + len(eof_marker)
+        pdf = PdfReader(io.BytesIO(raw[start:end]), strict=False)
+        return check_pdf(pdf)
+    except Exception as e:
+        pdf_logger.debug("%%PDF- slicing fallback failed: %s", e)
+
+    return None
+
+
+def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
     """Reads the given PDF file.
 
     Args:
@@ -34,13 +87,20 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
     Returns:
         The PdfReader object if the file is a PDF, None otherwise.
     """
-
     try:
         if isinstance(pdf_file, bytes):
             content = cast(bytes, pdf_file)
             pdf_file = io.BytesIO(content)
-        return PdfReader(pdf_file, strict=False)
-    except (PdfReadError, UnicodeDecodeError):
+        reader = PdfReader(pdf_file, strict=False)
+        return check_pdf(reader)
+    except (PdfReadError, UnicodeDecodeError) as e:
+        pdf_logger.debug("Read pdf failed: %s", e)
+        return None
+    except PDFValidationError as e:
+        pdf_logger.debug("Check pdf failed: %s", e)
+        return None
+    except Exception as e:
+        pdf_logger.debug("An unexpected error occurred: %s", e)
         return None
 
 