# Reconstructed from a whitespace-collapsed patch against
# Lib/_pyrepl/reader.py (gh-issue-136595):
#   diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py
#   index 0ebd9162eca4bb..f1548f1118d919 100644
#   hunks: @@ -40,6 +40,12 @@ and @@ -759,4 +765,5 @@
#
# Accompanying NEWS entry
# (Misc/NEWS.d/next/Windows/2025-07-14-01-27-42.gh-issue-136595.964PbL.rst):
#   Fix a crash in the REPL on Windows when typing Unicode characters outside
#   the Basic Multilingual Plane (≥ U+10000), such as emoji. These characters
#   are now properly handled as surrogate pairs.
#
# The unchanged context definitions visible in the patch
# (make_default_syntax_table, bind) are truncated there and are not
# reproduced here.

# syntax classes
SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3)


def normalize_surrogates(s: str) -> str:
    """Return *s* with UTF-16 surrogate pairs merged into single code points.

    A high surrogate immediately followed by a low surrogate (two separate
    ``str`` elements) is replaced by the one supplementary-plane character
    the pair encodes.  Unpaired surrogates are kept exactly as they are, so
    the function never raises on a half-typed pair.

    Args:
        s: Text that may contain surrogate code points (U+D800..U+DFFF).

    Returns:
        The normalized text; *s* itself when it is pure ASCII.
    """
    # ASCII text cannot contain surrogates; skip the encode/decode round trip.
    if s.isascii():
        return s
    try:
        # 'utf-16-le' avoids emitting and then stripping a BOM.
        # 'surrogatepass' is needed in BOTH directions: on encode so that
        # surrogate code points are written as raw 16-bit units, and on
        # decode so that an unpaired unit is passed through instead of
        # raising UnicodeDecodeError (valid pairs are still combined by the
        # decoder itself).
        raw = s.encode("utf-16-le", "surrogatepass")
        return raw.decode("utf-16-le", "surrogatepass")
    except UnicodeError:
        # Best-effort fallback.  UnicodeError is the common base of
        # UnicodeEncodeError and UnicodeDecodeError, so either direction
        # failing returns the input unchanged rather than crashing.
        return s


# NOTE(review): in the patch this is a method of a class in
# Lib/_pyrepl/reader.py whose header lies outside the diff; it is shown
# dedented here for that reason -- re-indent when applying to the real file.
def get_unicode(self) -> str:
    """Return the current buffer as a unicode string."""
    # self.buffer is joined element by element; a surrogate pair may arrive
    # as two separate elements, so normalize after joining.
    joined = "".join(self.buffer)
    return normalize_surrogates(joined)