Limit the declared /Length of PDF streams before the stream data is read.

Summary of the changes in this patch:

- pypdf/filters.py: add the module-level constant
  MAX_DECLARED_STREAM_LENGTH (75_000_000) next to the existing per-filter
  output limits.
- pypdf/generic/_data_structures.py: in read_from_stream(), when the declared
  length is non-negative and larger than MAX_DECLARED_STREAM_LENGTH, raise
  LimitReachedError instead of calling stream.read(length). The constant is
  imported inside the function, so the value is looked up on each call; the
  subprocess tests below rely on this when they assign
  filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize at runtime.
- docs/user/security.md: document the new constant.
- tests/generic/test_data_structures.py:
  - one test expects LimitReachedError for a file that declares a stream
    length of 2147483647;
  - two tests run a child interpreter with RLIMIT_AS set to
    1_000_000 * 1024 bytes and the limit raised to sys.maxsize. Reading from
    an already opened file object is expected to end in MemoryError; reading
    through a path is expected to succeed and print "Hello from pypdf".
    Both are skipped when the 'resource' module is missing and on macOS.

diff --git a/docs/user/security.md b/docs/user/security.md
index 2c33c2963..a3ca2b1d0 100644
--- a/docs/user/security.md
+++ b/docs/user/security.md
@@ -18,6 +18,9 @@ aware of the possible side effects, you can modify the following constants which
 For JBIG2 images, there is a similar parameter to limit the memory usage during decoding: `pypdf.filters.JBIG2_MAX_OUTPUT_LENGTH`
 It defaults to 75 MB as well.
 
+For all streams, the maximum allowed value for the `/Length` field is limited to `pypdf.filters.MAX_DECLARED_STREAM_LENGTH`, which
+defaults to 75 MB as well.
+
 For the *FlateDecode* filter, the number of bytes to attempt recovery with can be set by `pypdf.filters.ZLIB_MAX_RECOVERY_INPUT_LENGTH`.
 It defaults to 5 MB due to the much more complex recovery approach.
 
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 5efa8616a..562f22f8f 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -72,6 +72,8 @@
     is_null_or_none,
 )
 
+MAX_DECLARED_STREAM_LENGTH = 75_000_000
+
 JBIG2_MAX_OUTPUT_LENGTH = 75_000_000
 LZW_MAX_OUTPUT_LENGTH = 75_000_000
 RUN_LENGTH_MAX_OUTPUT_LENGTH = 75_000_000
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 04282ef5e..bf6111c12 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -63,7 +63,7 @@
 from ..constants import StreamAttributes as SA
 from ..constants import TypArguments as TA
 from ..constants import TypFitArguments as TF
-from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
+from ..errors import STREAM_TRUNCATED_PREMATURELY, LimitReachedError, PdfReadError, PdfStreamError
 from ._base import (
     BooleanObject,
     ByteStringObject,
@@ -637,6 +637,10 @@ def read_from_stream(
                 length = -1
             pstart = stream.tell()
             if length >= 0:
+                from ..filters import MAX_DECLARED_STREAM_LENGTH  # noqa: PLC0415
+                if length > MAX_DECLARED_STREAM_LENGTH:
+                    raise LimitReachedError(f"Declared stream length of {length} exceeds maximum allowed length.")
+
                 data["__streamdata__"] = stream.read(length)
             else:
                 data["__streamdata__"] = read_until_regex(
diff --git a/tests/generic/test_data_structures.py b/tests/generic/test_data_structures.py
index 403d513f4..d547a5cd6 100644
--- a/tests/generic/test_data_structures.py
+++ b/tests/generic/test_data_structures.py
@@ -1,12 +1,23 @@
 """Test the pypdf.generic._data_structures module."""
+import os
+import subprocess
+import sys
 from io import BytesIO
+from pathlib import Path
+from typing import Callable
 
 import pytest
 
 from pypdf import PdfReader, PdfWriter
+from pypdf.errors import LimitReachedError
 from pypdf.generic import DictionaryObject, NameObject, RectangleObject, TreeObject
 from tests import RESOURCE_ROOT, get_data_from_url
 
+try:
+    import resource
+except ImportError:
+    resource = None
+
 
 def test_dictionary_object__get_next_object_position():
     reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
@@ -54,3 +65,107 @@ def test_array_object__clone_same_object_multiple_times(caplog):
     page2 = writer.add_page(page)
     assert page2.mediabox == RectangleObject((0, 0, 595, 841))
     assert caplog.messages == []
+
+
+@pytest.mark.enable_socket
+def test_dictionary_object__read_from_stream__limit():
+    name = "read_from_stream__length_2gb.pdf"
+    url = "https://github.com/user-attachments/files/25842437/read_from_stream__length_2gb.pdf"
+
+    reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name)))
+    page = reader.pages[0]
+
+    with pytest.raises(
+            expected_exception=LimitReachedError,
+            match=r"^Declared stream length of 2147483647 exceeds maximum allowed length\.$"
+    ):
+        page.extract_text()
+
+
+def _prepare_test_dictionary_object__read_from_stream__no_limit(
+        path: Path
+) -> tuple[str, dict[str, str], Callable[[], None]]:
+    env = os.environ.copy()
+    env["COVERAGE_PROCESS_START"] = "pyproject.toml"
+
+    name = "read_from_stream__length_2gb.pdf"
+    url = "https://github.com/user-attachments/files/25842437/read_from_stream__length_2gb.pdf"
+    data = get_data_from_url(url=url, name=name)
+    pdf_path = path / name
+    pdf_path.write_bytes(data)
+    pdf_path_str = pdf_path.resolve().as_posix()
+
+    try:
+        env["PYTHONPATH"] = "." + os.pathsep + env["PYTHONPATH"]
+    except KeyError:
+        env["PYTHONPATH"] = "."
+
+    def limit_virtual_memory() -> None:
+        limit_kb = 1_000_000
+        limit_bytes = limit_kb * 1024
+        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
+
+    return pdf_path_str, env, limit_virtual_memory
+
+
+@pytest.mark.enable_socket
+@pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.")
+@pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.")
+def test_dictionary_object__read_from_stream__no_limit(tmp_path):
+    pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path)
+
+    source_file = tmp_path / "script.py"
+    source_file.write_text(
+        f"""
+import sys
+from pypdf import filters, PdfReader
+
+filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize
+
+with open({pdf_path_str!r}, mode="rb") as fd:
+    reader = PdfReader(fd)
+    print(reader.pages[0].extract_text())
+"""
+    )
+
+    result = subprocess.run(  # noqa: S603  # We have the control here.
+        [sys.executable, source_file],
+        capture_output=True,
+        env=env,
+        text=True,
+        preexec_fn=limit_virtual_memory,
+    )
+    assert result.returncode == 1
+    assert result.stdout == ""
+    assert result.stderr.replace("\r", "").endswith("\nMemoryError\n")
+
+
+@pytest.mark.enable_socket
+@pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.")
+@pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.")
+def test_dictionary_object__read_from_stream__no_limit__path(tmp_path):
+    pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path)
+
+    source_file = tmp_path / "script.py"
+    source_file.write_text(
+        f"""
+import sys
+from pypdf import filters, PdfReader
+
+filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize
+
+reader = PdfReader({pdf_path_str!r})
+print(reader.pages[0].extract_text())
+"""
+    )
+
+    result = subprocess.run(  # noqa: S603  # We have the control here.
+        [sys.executable, source_file],
+        capture_output=True,
+        env=env,
+        text=True,
+        preexec_fn=limit_virtual_memory,
+    )
+    assert result.returncode == 0
+    assert result.stdout.replace("\r", "") == "Hello from pypdf\n"
+    assert result.stderr == ""