Skip to content

Commit 2b7d583

Browse files
committed
PI: Fix O(n²) performance in NameObject read/write for pathological names
Three functions had quadratic behavior that caused hangs on PDFs with extremely long Name objects (e.g. repeatedly mis-encoded UTF-8 names): - read_until_regex: searched entire accumulated buffer on each 16-byte chunk instead of only the new chunk, and used bytes concatenation - NameObject.unnumber: rebuilt entire bytes object on each # replacement - NameObject.renumber: used out += concatenation in a loop
1 parent 2cfcd7e commit 2b7d583

File tree

2 files changed

+41
-25
lines changed

2 files changed

+41
-25
lines changed

pypdf/_utils.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
    """
    Read from *stream* until *regex* matches (the match itself is not consumed).

    Parameters:
        stream: A readable, seekable byte stream positioned where reading
            should start.
        regex: A compiled ``bytes`` pattern marking where reading stops.

    Returns:
        The read bytes, up to but not including the match start.  The stream
        is left positioned at the match start.  If the stream is exhausted
        without a match, everything read is returned.

    """
    parts: list[bytes] = []  # accumulated chunks; joined once on return
    total_len = 0            # number of bytes already stored in ``parts``
    tail = b""               # last <= 16 bytes of the previous chunk
    chunk_size = 16          # grows geometrically to bound loop iterations
    while True:
        tok = stream.read(chunk_size)
        if not tok:
            # EOF reached without a match: return everything read.
            return b"".join(parts)
        # Search only the previous tail + the new chunk, so each byte is
        # scanned O(1) times (searching the whole accumulated buffer on
        # every chunk was quadratic on pathological inputs).  The tail
        # overlap catches matches spanning a chunk boundary.
        # NOTE(review): this assumes a boundary-spanning match is at most
        # 17 bytes long; pypdf only passes single-character delimiter
        # patterns here -- confirm before reusing with longer patterns.
        buf = tail + tok
        m = regex.search(buf)
        if m is not None:
            overlap = len(tail)
            # Translate the match position in ``buf`` into an absolute
            # offset from where reading started.
            actual_start = total_len - overlap + m.start()
            # Rewind so the stream is positioned exactly at the match start.
            stream.seek(actual_start - total_len - len(tok), 1)
            parts.append(tok)
            return b"".join(parts)[:actual_start]
        parts.append(tok)
        total_len += len(tok)
        tail = tok[-16:]
        if chunk_size < 65536:
            chunk_size *= 2

270282

271283
def read_block_backwards(stream: StreamType, to_read: int) -> bytes:

pypdf/generic/_base.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import hashlib
3030
import re
3131
import sys
32-
from binascii import unhexlify
3332
from collections.abc import Sequence
3433
from math import log10
3534
from struct import iter_unpack
@@ -840,16 +839,16 @@ def renumber(self) -> bytes:
840839
f"Incorrect first char in NameObject, should start with '/': ({self})",
841840
"5.0.0",
842841
)
842+
parts = [out]
843843
for c in self[1:]:
844844
if c > "~":
845-
for x in c.encode("utf-8"):
846-
out += f"#{x:02X}".encode()
845+
parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
847846
else:
848847
try:
849-
out += self.renumber_table[c]
848+
parts.append(self.renumber_table[c])
850849
except KeyError:
851-
out += c.encode("utf-8")
852-
return out
850+
parts.append(c.encode("utf-8"))
851+
return b"".join(parts)
853852

854853
def _sanitize(self) -> "NameObject":
855854
"""
@@ -873,16 +872,21 @@ def surfix(cls) -> bytes: # noqa: N805
873872

874873
@staticmethod
def unnumber(sin: bytes) -> bytes:
    """
    Decode ``#XX`` escape sequences in an encoded PDF Name.

    Parameters:
        sin: The raw name bytes, possibly containing ``#``-escaped pairs.

    Returns:
        The bytes with every valid two-hex-digit escape replaced by the
        byte it encodes; malformed escapes are kept verbatim.

    """
    result = bytearray()
    i = 0
    length = len(sin)
    while i < length:
        if sin[i:i + 1] == b"#":
            pair = sin[i + 1:i + 3]
            # Decode only when exactly two hex digits follow the "#".
            # A bare int(pair, 16) would also accept "+4", " 4" or a
            # single trailing digit, which binascii.unhexlify (the
            # previous implementation) rejected -- keep that strictness
            # so malformed escapes pass through unchanged.
            if len(pair) == 2 and all(c in b"0123456789abcdefABCDEF" for c in pair):
                result.append(int(pair, 16))
                i += 3
                continue
        # Not a valid escape: copy the byte through unchanged.
        result.append(sin[i])
        i += 1
    return bytes(result)
886890

887891
CHARSETS = ("utf-8", "gbk", "latin1")
888892

0 commit comments

Comments
 (0)