https://github.com/python/cpython/commit/a3ce2f77f0813c214896ec66be3a26121f52361e
commit: a3ce2f77f0813c214896ec66be3a26121f52361e
branch: main
author: Stan Ulbrych <[email protected]>
committer: malemburg <[email protected]>
date: 2025-10-30T15:31:47+01:00
summary:
gh-55531: Implement `normalize_encoding` in C (#136643)
Closes gh-55531
files:
A Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
M Lib/encodings/__init__.py
M Modules/_codecsmodule.c
M Modules/clinic/_codecsmodule.c.h
M Objects/unicodeobject.c
M Python/fileutils.c
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 298177eb8003a7..e7e4ca3358e0f9 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,6 +30,7 @@
import codecs
import sys
+from _codecs import _normalize_encoding
from . import aliases
_cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
if isinstance(encoding, bytes):
encoding = str(encoding, "ascii")
- chars = []
- punct = False
- for c in encoding:
- if c.isalnum() or c == '.':
- if punct and chars:
- chars.append('_')
- if c.isascii():
- chars.append(c)
- punct = False
- else:
- punct = True
- return ''.join(chars)
+ return _normalize_encoding(encoding)
def search_function(encoding):
diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
new file mode 100644
index 00000000000000..70e39a4f2c167c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
@@ -0,0 +1,4 @@
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding`` function, which has been modified to make
+lowercase conversion optional.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index bdffeced7da5a9..2f2edbb05ab5c5 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
return PyCodec_LookupError(name);
}
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+ encoding: unicode
+
+Normalize an encoding name *encoding*.
+
+Used for encodings.normalize_encoding. Does not convert to lower case.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
+{
+    /* Return a new str holding the normalized form of *encoding* (case is
+       preserved because to_lower is passed as 0), or NULL with an
+       exception set on failure. */
+    Py_ssize_t len;
+    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
+    if (cstr == NULL) {
+        return NULL;
+    }
+
+    /* _Py_normalize_encoding() takes a NUL-terminated string with no
+       length argument, so anything after an embedded null would be
+       silently dropped.  Reject such names explicitly. */
+    if (strlen(cstr) != (size_t)len) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        return NULL;
+    }
+
+    /* len is a non-negative Py_ssize_t; widening before the addition keeps
+       the "+ 1" for the terminator free of signed overflow. */
+    size_t size = (size_t)len + 1;
+    char *normalized = PyMem_Malloc(size);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    if (!_Py_normalize_encoding(cstr, normalized, size, 0)) {
+        /* As far as the visible code shows, the helper signals failure
+           only through its 0 return; set an exception so that NULL is
+           never returned without one. */
+        PyMem_Free(normalized);
+        PyErr_SetString(PyExc_RuntimeError,
+                        "failed to normalize encoding");
+        return NULL;
+    }
+
+    PyObject *result = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return result;
+}
+
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
@@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
_CODECS_REGISTER_ERROR_METHODDEF
_CODECS__UNREGISTER_ERROR_METHODDEF
_CODECS_LOOKUP_ERROR_METHODDEF
+ _CODECS__NORMALIZE_ENCODING_METHODDEF
{NULL, NULL} /* sentinel */
};
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
index b0310325759326..9e2a7950ebde64 100644
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -2779,6 +2779,70 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
return return_value;
}
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
+"_normalize_encoding($module, /, encoding)\n"
+"--\n"
+"\n"
+"Normalize an encoding name *encoding*.\n"
+"\n"
+"Used for encodings.normalize_encoding. Does not convert to lower case.");
+
+#define _CODECS__NORMALIZE_ENCODING_METHODDEF \
+ {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
+
+static PyObject *
+_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+ PyObject *return_value = NULL;
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 1
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ Py_hash_t ob_hash;
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_hash = -1,
+ .ob_item = { &_Py_ID(encoding), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"encoding", NULL};
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "_normalize_encoding",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[1];
+ PyObject *encoding;
+
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+ /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+ if (!args) {
+ goto exit;
+ }
+ if (!PyUnicode_Check(args[0])) {
+ _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
+ goto exit;
+ }
+ encoding = args[0];
+ return_value = _codecs__normalize_encoding_impl(module, encoding);
+
+exit:
+ return return_value;
+}
+
#ifndef _CODECS_MBCS_DECODE_METHODDEF
#define _CODECS_MBCS_DECODE_METHODDEF
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
@@ -2802,4 +2866,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1c443e88e05029..4e8c132327b7d0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
return v;
}
-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
- also convert to lowercase. Return 1 on success, or 0 on error (encoding is
- longer than lower_len-1). */
+/* Normalize an encoding name like encodings.normalize_encoding(),
+   additionally converting it to lowercase if *to_lower* is true.
+   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
char *lower,
- size_t lower_len)
+ size_t lower_len,
+ int to_lower)
{
const char *e;
char *l;
@@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
if (l == l_end) {
return 0;
}
- *l++ = Py_TOLOWER(c);
+ *l++ = to_lower ? Py_TOLOWER(c) : c;
}
else {
punct = 1;
@@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
}
/* Shortcuts for common default encodings */
- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */
@@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
}
/* Shortcuts for common default encodings */
- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */
diff --git a/Python/fileutils.c b/Python/fileutils.c
index b808229716fd9c..93abd70a34d420 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
#define USE_FORCE_ASCII
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -229,7 +229,7 @@ check_force_ascii(void)
}
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
- if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+ if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
goto error;
}
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]