https://github.com/python/cpython/commit/0518edc17049a2f474b049b7d7fe3ef4339ceb83
commit: 0518edc17049a2f474b049b7d7fe3ef4339ceb83
branch: main
author: Victor Stinner <[email protected]>
committer: vstinner <[email protected]>
date: 2024-05-28T18:05:20+02:00
summary:
gh-119396: Optimize unicode_repr() (#119617)
Use stringlib to specialize unicode_repr() for each string kind
(UCS1, UCS2, UCS4).
Benchmark:
+-------------------------------------+---------+----------------------+
| Benchmark | ref | change2 |
+=====================================+=========+======================+
| repr('abc') | 100 ns | 103 ns: 1.02x slower |
+-------------------------------------+---------+----------------------+
| repr('a' * 100) | 369 ns | 369 ns: 1.00x slower |
+-------------------------------------+---------+----------------------+
| repr(('a' + squote) * 100) | 1.21 us | 946 ns: 1.27x faster |
+-------------------------------------+---------+----------------------+
| repr(('a' + nl) * 100) | 1.23 us | 907 ns: 1.36x faster |
+-------------------------------------+---------+----------------------+
| repr(dquote + ('a' + squote) * 100) | 1.08 us | 858 ns: 1.25x faster |
+-------------------------------------+---------+----------------------+
| Geometric mean | (ref) | 1.16x faster |
+-------------------------------------+---------+----------------------+
files:
A Objects/stringlib/repr.h
M Makefile.pre.in
M Objects/unicodeobject.c
M Tools/c-analyzer/cpython/_parser.py
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 9e99c95e2af042..a80d9334ba5134 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
$(srcdir)/Objects/stringlib/localeutil.h \
$(srcdir)/Objects/stringlib/partition.h \
$(srcdir)/Objects/stringlib/replace.h \
+ $(srcdir)/Objects/stringlib/repr.h \
$(srcdir)/Objects/stringlib/split.h \
$(srcdir)/Objects/stringlib/ucs1lib.h \
$(srcdir)/Objects/stringlib/ucs2lib.h \
diff --git a/Objects/stringlib/repr.h b/Objects/stringlib/repr.h
new file mode 100644
index 00000000000000..87b1a8ba629dc6
--- /dev/null
+++ b/Objects/stringlib/repr.h
@@ -0,0 +1,95 @@
+/* stringlib: repr() implementation */
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+
+
+static void
+STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
+ STRINGLIB_CHAR *odata)
+{
+ Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
+ const void *idata = PyUnicode_DATA(unicode);
+ int ikind = PyUnicode_KIND(unicode);
+
+ *odata++ = quote;
+ for (Py_ssize_t i = 0; i < isize; i++) {
+ Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
+
+ /* Escape quotes and backslashes */
+ if ((ch == quote) || (ch == '\\')) {
+ *odata++ = '\\';
+ *odata++ = ch;
+ continue;
+ }
+
+ /* Map special whitespace to '\t', \n', '\r' */
+ if (ch == '\t') {
+ *odata++ = '\\';
+ *odata++ = 't';
+ }
+ else if (ch == '\n') {
+ *odata++ = '\\';
+ *odata++ = 'n';
+ }
+ else if (ch == '\r') {
+ *odata++ = '\\';
+ *odata++ = 'r';
+ }
+
+ /* Map non-printable US ASCII to '\xhh' */
+ else if (ch < ' ' || ch == 0x7F) {
+ *odata++ = '\\';
+ *odata++ = 'x';
+ *odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
+ *odata++ = Py_hexdigits[ch & 0x000F];
+ }
+
+ /* Copy ASCII characters as-is */
+ else if (ch < 0x7F) {
+ *odata++ = ch;
+ }
+
+ /* Non-ASCII characters */
+ else {
+ /* Map Unicode whitespace and control characters
+ (categories Z* and C* except ASCII space)
+ */
+ if (!Py_UNICODE_ISPRINTABLE(ch)) {
+ *odata++ = '\\';
+ /* Map 8-bit characters to '\xhh' */
+ if (ch <= 0xff) {
+ *odata++ = 'x';
+ *odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
+ *odata++ = Py_hexdigits[ch & 0x000F];
+ }
+ /* Map 16-bit characters to '\uxxxx' */
+ else if (ch <= 0xffff) {
+ *odata++ = 'u';
+ *odata++ = Py_hexdigits[(ch >> 12) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 8) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 4) & 0xF];
+ *odata++ = Py_hexdigits[ch & 0xF];
+ }
+ /* Map 21-bit characters to '\U00xxxxxx' */
+ else {
+ *odata++ = 'U';
+ *odata++ = Py_hexdigits[(ch >> 28) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 24) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 20) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 16) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 12) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 8) & 0xF];
+ *odata++ = Py_hexdigits[(ch >> 4) & 0xF];
+ *odata++ = Py_hexdigits[ch & 0xF];
+ }
+ }
+ /* Copy characters as-is */
+ else {
+ *odata++ = ch;
+ }
+ }
+ }
+ *odata = quote;
+}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 92db31f1e498f9..eb37b478cc4de1 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
+#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
@@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
+#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
@@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
+#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
@@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *suffix)
static PyObject *
unicode_repr(PyObject *unicode)
{
- PyObject *repr;
- Py_ssize_t isize;
- Py_ssize_t osize, squote, dquote, i, o;
- Py_UCS4 max, quote;
- int ikind, okind, unchanged;
- const void *idata;
- void *odata;
-
- isize = PyUnicode_GET_LENGTH(unicode);
- idata = PyUnicode_DATA(unicode);
+ Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
+ const void *idata = PyUnicode_DATA(unicode);
/* Compute length of output, quote characters, and
maximum character */
- osize = 0;
- max = 127;
- squote = dquote = 0;
- ikind = PyUnicode_KIND(unicode);
- for (i = 0; i < isize; i++) {
+ Py_ssize_t osize = 0;
+ Py_UCS4 maxch = 127;
+ Py_ssize_t squote = 0;
+ Py_ssize_t dquote = 0;
+ int ikind = PyUnicode_KIND(unicode);
+ for (Py_ssize_t i = 0; i < isize; i++) {
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Py_ssize_t incr = 1;
switch (ch) {
@@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
else if (ch < 0x7f)
;
else if (Py_UNICODE_ISPRINTABLE(ch))
- max = ch > max ? ch : max;
+ maxch = (ch > maxch) ? ch : maxch;
else if (ch < 0x100)
incr = 4; /* \xHH */
else if (ch < 0x10000)
@@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
osize += incr;
}
- quote = '\'';
- unchanged = (osize == isize);
+ Py_UCS4 quote = '\'';
+ int changed = (osize != isize);
if (squote) {
- unchanged = 0;
+ changed = 1;
if (dquote)
/* Both squote and dquote present. Use squote,
and escape them */
@@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
}
osize += 2; /* quotes */
- repr = PyUnicode_New(osize, max);
+ PyObject *repr = PyUnicode_New(osize, maxch);
if (repr == NULL)
return NULL;
- okind = PyUnicode_KIND(repr);
- odata = PyUnicode_DATA(repr);
+ int okind = PyUnicode_KIND(repr);
+ void *odata = PyUnicode_DATA(repr);
+
+ if (!changed) {
+ PyUnicode_WRITE(okind, odata, 0, quote);
- PyUnicode_WRITE(okind, odata, 0, quote);
- PyUnicode_WRITE(okind, odata, osize-1, quote);
- if (unchanged) {
_PyUnicode_FastCopyCharacters(repr, 1,
unicode, 0,
isize);
+
+ PyUnicode_WRITE(okind, odata, osize-1, quote);
}
else {
- for (i = 0, o = 1; i < isize; i++) {
- Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
-
- /* Escape quotes and backslashes */
- if ((ch == quote) || (ch == '\\')) {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- PyUnicode_WRITE(okind, odata, o++, ch);
- continue;
- }
-
- /* Map special whitespace to '\t', \n', '\r' */
- if (ch == '\t') {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- PyUnicode_WRITE(okind, odata, o++, 't');
- }
- else if (ch == '\n') {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- PyUnicode_WRITE(okind, odata, o++, 'n');
- }
- else if (ch == '\r') {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- PyUnicode_WRITE(okind, odata, o++, 'r');
- }
-
- /* Map non-printable US ASCII to '\xhh' */
- else if (ch < ' ' || ch == 0x7F) {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- PyUnicode_WRITE(okind, odata, o++, 'x');
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
- }
-
- /* Copy ASCII characters as-is */
- else if (ch < 0x7F) {
- PyUnicode_WRITE(okind, odata, o++, ch);
- }
-
- /* Non-ASCII characters */
- else {
- /* Map Unicode whitespace and control characters
- (categories Z* and C* except ASCII space)
- */
- if (!Py_UNICODE_ISPRINTABLE(ch)) {
- PyUnicode_WRITE(okind, odata, o++, '\\');
- /* Map 8-bit characters to '\xhh' */
- if (ch <= 0xff) {
- PyUnicode_WRITE(okind, odata, o++, 'x');
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
- }
- /* Map 16-bit characters to '\uxxxx' */
- else if (ch <= 0xffff) {
- PyUnicode_WRITE(okind, odata, o++, 'u');
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
- }
- /* Map 21-bit characters to '\U00xxxxxx' */
- else {
- PyUnicode_WRITE(okind, odata, o++, 'U');
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
- PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
- }
- }
- /* Copy characters as-is */
- else {
- PyUnicode_WRITE(okind, odata, o++, ch);
- }
- }
+ switch (okind) {
+ case PyUnicode_1BYTE_KIND:
+ ucs1lib_repr(unicode, quote, odata);
+ break;
+ case PyUnicode_2BYTE_KIND:
+ ucs2lib_repr(unicode, quote, odata);
+ break;
+ default:
+ assert(okind == PyUnicode_4BYTE_KIND);
+ ucs4lib_repr(unicode, quote, odata);
}
}
- /* Closing quote already added at the beginning */
+
assert(_PyUnicode_CheckConsistency(repr, 1));
return repr;
}
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py
index 12010f0e9c0549..4623f2c8d671bd 100644
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -167,6 +167,7 @@ def clean_lines(text):
Objects/stringlib/find.h Objects/stringlib/fastsearch.h
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
+Objects/stringlib/repr.h Objects/stringlib/fastsearch.h
Objects/stringlib/split.h Objects/stringlib/fastsearch.h
# @end=tsv@
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]