diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 298177eb8003a7..e7e4ca3358e0f9 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,6 +30,7 @@
 
 import codecs
 import sys
+from _codecs import _normalize_encoding
 from . import aliases
 
 _cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
     if isinstance(encoding, bytes):
         encoding = str(encoding, "ascii")
 
-    chars = []
-    punct = False
-    for c in encoding:
-        if c.isalnum() or c == '.':
-            if punct and chars:
-                chars.append('_')
-            if c.isascii():
-                chars.append(c)
-            punct = False
-        else:
-            punct = True
-    return ''.join(chars)
+    return _normalize_encoding(encoding)
 
 def search_function(encoding):
 
diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
new file mode 100644
index 00000000000000..70e39a4f2c167c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
@@ -0,0 +1,4 @@
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding`` which has been modified to make lowercase
+conversion optional.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 7cf3f152eeecc6..c15e156985f4ba 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -1022,6 +1022,60 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
     return PyCodec_LookupError(name);
 }
 
+/* Implemented in Objects/unicodeobject.c. */
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+    encoding: unicode
+
+Normalize an encoding name *encoding*.
+
+Used for encodings.normalize_encoding. Does not convert to lower case.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
+{
+    /* Return a new str holding the normalized name, or NULL with an
+       exception set. */
+    Py_ssize_t len;
+    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
+    if (cstr == NULL) {
+        return NULL;
+    }
+
+    /* The buffer needs len + 1 bytes.  A Py_ssize_t can never be greater
+       than PY_SSIZE_T_MAX, so test for the single value at which the
+       addition below would overflow. */
+    if (len == PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
+        return NULL;
+    }
+
+    /* NOTE(review): cstr is handed over as a NUL-terminated C string, so an
+       embedded NUL in *encoding* presumably cuts the result short, unlike
+       the Python loop this replaces -- confirm and reject if needed. */
+    char *normalized = PyMem_Malloc(len + 1);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {
+        /* Documented to fail only when the buffer is too small; raise
+           rather than return NULL without an exception set. */
+        PyErr_SetString(PyExc_RuntimeError,
+                        "_Py_normalize_encoding() failed");
+        PyMem_Free(normalized);
+        return NULL;
+    }
+
+    PyObject *result = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return result;
+}
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyMethodDef _codecs_functions[] = {
@@ -1071,6 +1125,7 @@ static PyMethodDef _codecs_functions[] = {
     _CODECS_REGISTER_ERROR_METHODDEF
     _CODECS__UNREGISTER_ERROR_METHODDEF
     _CODECS_LOOKUP_ERROR_METHODDEF
+    _CODECS__NORMALIZE_ENCODING_METHODDEF
     {NULL, NULL} /* sentinel */
 };
 
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
index b0310325759326..9e2a7950ebde64 100644
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -2779,6 +2779,70 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
     return return_value;
 }
 
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
+"_normalize_encoding($module, /, encoding)\n"
+"--\n"
+"\n"
+"Normalize an encoding name *encoding*.\n"
+"\n"
+"Used for encodings.normalize_encoding. 
Does not convert to lower case."); + +#define _CODECS__NORMALIZE_ENCODING_METHODDEF \ + {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding); + +static PyObject * +_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(encoding), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"encoding", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "_normalize_encoding", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject *encoding; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]); + goto exit; + } + encoding = args[0]; + return_value = _codecs__normalize_encoding_impl(module, encoding); + +exit: + return return_value; +} + #ifndef _CODECS_MBCS_DECODE_METHODDEF #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ @@ -2802,4 +2866,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* 
!defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c2308a012142a..ba66e273a208be 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj, return v; } -/* Normalize an encoding name: similar to encodings.normalize_encoding(), but - also convert to lowercase. Return 1 on success, or 0 on error (encoding is - longer than lower_len-1). */ +/* Normalize an encoding name like encodings.normalize_encoding(), + and additionally convert it to lowercase if *to_lower* is true. + Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, char *lower, - size_t lower_len) + size_t lower_len, + int to_lower) { const char *e; char *l; @@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding, if (l == l_end) { return 0; } - *l++ = Py_TOLOWER(c); + *l++ = to_lower ? 
Py_TOLOWER(c) : c; } else { punct = 1; @@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ @@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ diff --git a/Python/codecs.c b/Python/codecs.c index caf8d9d5f3c188..ffcb14928e0a82 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function) return 0; } -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are converted to lower case, spaces and hyphens are replaced with underscores. 
*/ @@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string) } encoding = PyMem_Malloc(len + 1); - if (encoding == NULL) + if (encoding == NULL) { return PyErr_NoMemory(); + } - if (!_Py_normalize_encoding(string, encoding, len + 1)) + if (!_Py_normalize_encoding(string, encoding, len + 1, 1)) { PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(encoding); diff --git a/Python/fileutils.c b/Python/fileutils.c index 2a3f12d4e872f8..aedf8576c7a930 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) #define USE_FORCE_ASCII -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale and POSIX locale. nl_langinfo(CODESET) announces an alias of the @@ -231,7 +231,7 @@ check_force_ascii(void) } char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ - if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) { + if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) { goto error; }