Skip to content

Commit 2b7d583

Browse files
committed
PI: Fix O(n²) performance in NameObject read/write for pathological names
Three functions had quadratic behavior that caused hangs on PDFs with extremely long Name objects (e.g. repeatedly mis-encoded UTF-8 names): - read_until_regex: searched entire accumulated buffer on each 16-byte chunk instead of only the new chunk, and used bytes concatenation - NameObject.unnumber: rebuilt entire bytes object on each # replacement - NameObject.renumber: used out += concatenation in a loop
1 parent 2cfcd7e commit 2b7d583

File tree

2 files changed

+41
-25
lines changed

2 files changed

+41
-25
lines changed

pypdf/_utils.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
    """
    Read from *stream* until *regex* matches (the match itself is not consumed).

    Parameters:
        stream: A readable, seekable byte stream positioned where reading
            should start.
        regex: A compiled ``bytes`` pattern marking where reading stops.

    Returns:
        The read bytes, up to but not including the match start.  The stream
        is left positioned at the match start.  If the stream is exhausted
        without a match, everything read is returned.

    """
    parts: list[bytes] = []  # accumulated chunks; joined once on return
    total_len = 0            # number of bytes already stored in ``parts``
    tail = b""               # last <= 16 bytes of the previous chunk
    chunk_size = 16          # grows geometrically to bound loop iterations
    while True:
        tok = stream.read(chunk_size)
        if not tok:
            # EOF reached without a match: return everything read.
            return b"".join(parts)
        # Search only the previous tail + the new chunk, so each byte is
        # scanned O(1) times (searching the whole accumulated buffer on
        # every chunk was quadratic on pathological inputs).  The tail
        # overlap catches matches spanning a chunk boundary.
        # NOTE(review): this assumes a boundary-spanning match is at most
        # 17 bytes long; pypdf only passes single-character delimiter
        # patterns here -- confirm before reusing with longer patterns.
        buf = tail + tok
        m = regex.search(buf)
        if m is not None:
            overlap = len(tail)
            # Translate the match position in ``buf`` into an absolute
            # offset from where reading started.
            actual_start = total_len - overlap + m.start()
            # Rewind so the stream is positioned exactly at the match start.
            stream.seek(actual_start - total_len - len(tok), 1)
            parts.append(tok)
            return b"".join(parts)[:actual_start]
        parts.append(tok)
        total_len += len(tok)
        tail = tok[-16:]
        if chunk_size < 65536:
            chunk_size *= 2

270282

271283
def read_block_backwards(stream: StreamType, to_read: int) -> bytes:

pypdf/generic/_base.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import hashlib
3030
import re
3131
import sys
32-
from binascii import unhexlify
3332
from collections.abc import Sequence
3433
from math import log10
3534
from struct import iter_unpack
@@ -840,16 +839,16 @@ def renumber(self) -> bytes:
840839
f"Incorrect first char in NameObject, should start with '/': ({self})",
841840
"5.0.0",
842841
)
842+
parts = [out]
843843
for c in self[1:]:
844844
if c > "~":
845-
for x in c.encode("utf-8"):
846-
out += f"#{x:02X}".encode()
845+
parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
847846
else:
848847
try:
849-
out += self.renumber_table[c]
848+
parts.append(self.renumber_table[c])
850849
except KeyError:
851-
out += c.encode("utf-8")
852-
return out
850+
parts.append(c.encode("utf-8"))
851+
return b"".join(parts)
853852

854853
def _sanitize(self) -> "NameObject":
855854
"""
@@ -873,16 +872,21 @@ def surfix(cls) -> bytes: # noqa: N805
873872

874873
@staticmethod
def unnumber(sin: bytes) -> bytes:
    """
    Decode ``#XX`` escape sequences in an encoded PDF Name.

    Parameters:
        sin: The raw name bytes, possibly containing ``#``-escaped pairs.

    Returns:
        The bytes with every valid two-hex-digit escape replaced by the
        byte it encodes; malformed escapes are kept verbatim.

    """
    result = bytearray()
    i = 0
    length = len(sin)
    while i < length:
        if sin[i:i + 1] == b"#":
            pair = sin[i + 1:i + 3]
            # Decode only when exactly two hex digits follow the "#".
            # A bare int(pair, 16) would also accept "+4", " 4" or a
            # single trailing digit, which binascii.unhexlify (the
            # previous implementation) rejected -- keep that strictness
            # so malformed escapes pass through unchanged.
            if len(pair) == 2 and all(c in b"0123456789abcdefABCDEF" for c in pair):
                result.append(int(pair, 16))
                i += 3
                continue
        # Not a valid escape: copy the byte through unchanged.
        result.append(sin[i])
        i += 1
    return bytes(result)
886890

887891
CHARSETS = ("utf-8", "gbk", "latin1")
888892

0 commit comments

Comments
 (0)