Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/user/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ aware of the possible side effects, you can modify the following constants which
For JBIG2 images, there is a similar parameter to limit the memory usage during decoding: `pypdf.filters.JBIG2_MAX_OUTPUT_LENGTH`
It defaults to 75 MB as well.

For all streams, the value of the `/Length` field is limited to `pypdf.filters.MAX_DECLARED_STREAM_LENGTH`, which
defaults to 75 MB as well.

For the *FlateDecode* filter, the maximum number of input bytes for which recovery is attempted can be set with `pypdf.filters.ZLIB_MAX_RECOVERY_INPUT_LENGTH`.
It defaults to 5 MB because the recovery approach is much more complex.

Expand Down
2 changes: 2 additions & 0 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
is_null_or_none,
)

# Upper bound for the `/Length` value a stream may declare; a larger declared
# length makes stream reading raise `LimitReachedError`.
MAX_DECLARED_STREAM_LENGTH = 75_000_000

# Limit on the memory used while decoding JBIG2 images.
JBIG2_MAX_OUTPUT_LENGTH = 75_000_000
# NOTE(review): presumably the analogous output limits for the LZW and
# RunLength filters - confirm against the filter implementations.
LZW_MAX_OUTPUT_LENGTH = 75_000_000
RUN_LENGTH_MAX_OUTPUT_LENGTH = 75_000_000
Expand Down
6 changes: 5 additions & 1 deletion pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
from ..constants import StreamAttributes as SA
from ..constants import TypArguments as TA
from ..constants import TypFitArguments as TF
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
from ..errors import STREAM_TRUNCATED_PREMATURELY, LimitReachedError, PdfReadError, PdfStreamError
from ._base import (
BooleanObject,
ByteStringObject,
Expand Down Expand Up @@ -637,6 +637,10 @@ def read_from_stream(
length = -1
pstart = stream.tell()
if length >= 0:
from ..filters import MAX_DECLARED_STREAM_LENGTH # noqa: PLC0415
if length > MAX_DECLARED_STREAM_LENGTH:
raise LimitReachedError(f"Declared stream length of {length} exceeds maximum allowed length.")

data["__streamdata__"] = stream.read(length)
else:
data["__streamdata__"] = read_until_regex(
Expand Down
115 changes: 115 additions & 0 deletions tests/generic/test_data_structures.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
"""Test the pypdf.generic._data_structures module."""
import os
import subprocess
import sys
from io import BytesIO
from pathlib import Path
from typing import Callable

import pytest

from pypdf import PdfReader, PdfWriter
from pypdf.errors import LimitReachedError
from pypdf.generic import DictionaryObject, NameObject, RectangleObject, TreeObject
from tests import RESOURCE_ROOT, get_data_from_url

try:
import resource
except ImportError:
resource = None


def test_dictionary_object__get_next_object_position():
reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
Expand Down Expand Up @@ -54,3 +65,107 @@ def test_array_object__clone_same_object_multiple_times(caplog):
page2 = writer.add_page(page)
assert page2.mediabox == RectangleObject((0, 0, 595, 841))
assert caplog.messages == []


@pytest.mark.enable_socket
def test_dictionary_object__read_from_stream__limit():
    """Text extraction must be rejected when a stream declares an oversized length."""
    pdf_bytes = get_data_from_url(
        url="https://github.com/user-attachments/files/25842437/read_from_stream__length_2gb.pdf",
        name="read_from_stream__length_2gb.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # Anchored on both ends so the complete error message is verified.
    expected_message = r"^Declared stream length of 2147483647 exceeds maximum allowed length\.$"
    with pytest.raises(LimitReachedError, match=expected_message):
        first_page.extract_text()


def _prepare_test_dictionary_object__read_from_stream__no_limit(
    path: Path
) -> tuple[str, dict[str, str], Callable[[], None]]:
    """
    Set up everything the subprocess-based "no limit" tests share.

    Args:
        path: Directory into which the sample PDF is written.

    Returns:
        A tuple of the resolved POSIX-style path of the written PDF, the
        environment mapping to use for the child process and a callable
        which restricts the virtual memory of the process calling it.
    """
    file_name = "read_from_stream__length_2gb.pdf"
    target = path / file_name
    target.write_bytes(
        get_data_from_url(
            url="https://github.com/user-attachments/files/25842437/read_from_stream__length_2gb.pdf",
            name=file_name,
        )
    )

    child_env = os.environ.copy()
    child_env["COVERAGE_PROCESS_START"] = "pyproject.toml"
    # Put the current directory first on the module search path while
    # keeping any entries which are already configured.
    inherited_python_path = child_env.get("PYTHONPATH")
    if inherited_python_path is None:
        child_env["PYTHONPATH"] = "."
    else:
        child_env["PYTHONPATH"] = "." + os.pathsep + inherited_python_path

    def limit_virtual_memory() -> None:
        # 1_000_000 KiB, applied as both the soft and the hard limit.
        max_bytes = 1_000_000 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (max_bytes, max_bytes))

    return target.resolve().as_posix(), child_env, limit_virtual_memory


@pytest.mark.enable_socket
@pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.")
@pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.")
def test_dictionary_object__read_from_stream__no_limit(tmp_path):
    """
    With the length limit lifted, reading from an open file must end in a ``MemoryError``.

    A child interpreter sets ``filters.MAX_DECLARED_STREAM_LENGTH`` to
    ``sys.maxsize`` and extracts text from the sample PDF through an open
    file handle. The child runs with the memory restriction installed by
    ``limit_virtual_memory`` and is expected to exit with code 1, print
    nothing to stdout and finish its stderr output with ``MemoryError``.
    """
    pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path)

    source_file = tmp_path / "script.py"
    # The script text starts at column 0 on purpose. The two statements below
    # the `with` line have to stay indented, otherwise the child fails with an
    # IndentationError instead of the MemoryError which is asserted below.
    source_file.write_text(
        f"""
import sys
from pypdf import filters, PdfReader

filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize

with open({pdf_path_str!r}, mode="rb") as fd:
    reader = PdfReader(fd)
    print(reader.pages[0].extract_text())
"""
    )

    result = subprocess.run(  # noqa: S603  # We have the control here.
        [sys.executable, source_file],
        capture_output=True,
        env=env,
        text=True,
        # Runs in the child before the interpreter starts.
        preexec_fn=limit_virtual_memory,
    )
    assert result.returncode == 1
    assert result.stdout == ""
    # Drop carriage returns so the check does not depend on the line ending style.
    assert result.stderr.replace("\r", "").endswith("\nMemoryError\n")


@pytest.mark.enable_socket
@pytest.mark.skipif(condition=resource is None, reason="Does not have 'resource' module.")
@pytest.mark.skipif(sys.platform == "darwin", reason="RLIMIT_AS is unreliable.")
def test_dictionary_object__read_from_stream__no_limit__path(tmp_path):
    """
    With the length limit lifted, reading via a file path must succeed.

    A child interpreter sets ``filters.MAX_DECLARED_STREAM_LENGTH`` to
    ``sys.maxsize`` and hands the path of the sample PDF directly to
    ``PdfReader``. The child runs with the memory restriction installed by
    ``limit_virtual_memory`` and is expected to exit cleanly, print the
    extracted text and leave stderr empty.
    """
    pdf_path_str, env, limit_virtual_memory = _prepare_test_dictionary_object__read_from_stream__no_limit(tmp_path)

    script_source = f"""
import sys
from pypdf import filters, PdfReader

filters.MAX_DECLARED_STREAM_LENGTH = sys.maxsize

reader = PdfReader({pdf_path_str!r})
print(reader.pages[0].extract_text())
"""
    source_file = tmp_path / "script.py"
    source_file.write_text(script_source)

    command = [sys.executable, source_file]
    result = subprocess.run(  # noqa: S603  # We have the control here.
        command,
        capture_output=True,
        env=env,
        text=True,
        preexec_fn=limit_virtual_memory,
    )
    assert result.returncode == 0
    # Drop carriage returns so the check does not depend on the line ending style.
    normalized_stdout = result.stdout.replace("\r", "")
    assert normalized_stdout == "Hello from pypdf\n"
    assert result.stderr == ""