From b9c4f77e804f2238519124a744fec32372f1fb62 Mon Sep 17 00:00:00 2001 From: Vedant Ravindra Dhoke <66007382+vedant713@users.noreply.github.com> Date: Sun, 13 Jul 2025 21:24:59 -0400 Subject: [PATCH 1/3] gh-136595: Normalize surrogate pairs in REPL input to fix UnicodeEncodeError on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new REPL implementation (_pyrepl) crashes on Windows when the user inputs Unicode characters outside the Basic Multilingual Plane (≥ U+10000), such as emoji (e.g. 🐍). This happens because the Windows input layer provides surrogate pairs (UTF-16 code units) that _pyrepl attempts to process and tokenize directly, which leaves surrogate code units in the text and makes later encoding fail with UnicodeEncodeError. This commit introduces a module-level `normalize_surrogates()` helper in `Lib/_pyrepl/reader.py` to explicitly normalize surrogate pairs by encoding to UTF-16 with 'surrogatepass' and decoding back. The `Reader.get_unicode()` method is patched to use this normalization so that any code consuming REPL input (e.g. syntax highlighting via tokenize) receives valid Unicode text. This resolves UnicodeEncodeError crashes in the REPL when typing emoji or other non-BMP characters on Windows. Fixes #136595 --- Lib/_pyrepl/reader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py index 0ebd9162eca4bb..511ab5d905f444 100644 --- a/Lib/_pyrepl/reader.py +++ b/Lib/_pyrepl/reader.py @@ -40,6 +40,12 @@ # syntax classes SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3) +def normalize_surrogates(s): + # Encode with surrogatepass, decode to normalize surrogate pairs + try: + return s.encode('utf-16', 'surrogatepass').decode('utf-16') + except UnicodeEncodeError: + return s # fallback if encoding somehow fails def make_default_syntax_table() -> dict[str, int]: # XXX perhaps should use some unicodedata here? 
@@ -759,4 +765,5 @@ def bind(self, spec: KeySpec, command: CommandName) -> None: def get_unicode(self) -> str: """Return the current buffer as a unicode string.""" - return "".join(self.buffer) + text = "".join(self.buffer) + return normalize_surrogates(text) From a567845b100e70e772199fd220fd67b00e0740d9 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 14 Jul 2025 01:27:44 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst diff --git a/Misc/NEWS.d/next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst b/Misc/NEWS.d/next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst new file mode 100644 index 00000000000000..8aad9c6774f7a3 --- /dev/null +++ b/Misc/NEWS.d/next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst @@ -0,0 +1 @@ +Fix a crash in the REPL on Windows when typing Unicode characters outside the Basic Multilingual Plane (≥ U+10000), such as emoji. Surrogate pairs received from the Windows console are now combined into the corresponding single characters before the input is processed. 
From 7a31a1f32f9bb4375005f411cc3600185c2cf9ea Mon Sep 17 00:00:00 2001 From: Vedant Ravindra Dhoke <66007382+vedant713@users.noreply.github.com> Date: Sun, 13 Jul 2025 21:30:43 -0400 Subject: [PATCH 3/3] Add type annotations to normalize_surrogates() --- Lib/_pyrepl/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py index 511ab5d905f444..f1548f1118d919 100644 --- a/Lib/_pyrepl/reader.py +++ b/Lib/_pyrepl/reader.py @@ -40,7 +40,7 @@ # syntax classes SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3) -def normalize_surrogates(s): +def normalize_surrogates(s: str) -> str: # Encode with surrogatepass, decode to normalize surrogate pairs try: return s.encode('utf-16', 'surrogatepass').decode('utf-16')