Christian Heimes added the comment:
Why should I use (sizeof(lower)-1)? Do you mean
PyMem_Malloc(strlen(encoding) + 1)?
Changes since last patch:
* added #include "bytes_methods.h" in unicodeobject.c
* changed all is* calls to use the macros from bytes_methods.h
* malloc/free the lower-cased copy of the encoding instead of using a fixed buffer
* replace '_' with '-' in encoding
I still think that a fixed buffer of 12 chars (strlen("iso-8859-1") + 1 +
the trailing \0) is sufficient and faster.
* also check for iso-8859-1
Added file: http://bugs.python.org/file8607/py3k_profile_fix3.patch
__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1302>
__________________________________
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 58654)
+++ Objects/unicodeobject.c (working copy)
@@ -41,6 +41,7 @@
#define PY_SSIZE_T_CLEAN
#include "Python.h"
+#include "bytes_methods.h"
#include "unicodeobject.h"
#include "ucnhash.h"
@@ -592,9 +593,9 @@
if (*f == '%') {
const char* p = f;
width = 0;
- while (isdigit(Py_CHARMASK(*f)))
+ while (ISDIGIT(*f))
width = (width*10) + *f++ - '0';
- while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
+ while (*++f && *f != '%' && !ISALPHA(*f))
;
/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
@@ -755,12 +756,12 @@
zeropad = (*f == '0');
/* parse the width.precision part */
width = 0;
- while (isdigit(Py_CHARMASK(*f)))
+ while (ISDIGIT(*f))
width = (width*10) + *f++ - '0';
precision = 0;
if (*f == '.') {
f++;
- while (isdigit(Py_CHARMASK(*f)))
+ while (ISDIGIT(*f))
precision = (precision*10) + *f++ - '0';
}
/* handle the long flag, but only for %ld and %lu.
@@ -1056,22 +1057,51 @@
{
PyObject *buffer = NULL, *unicode;
Py_buffer info;
+ char *lower;
+ char *l, *e;
if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
+ encoding = PyUnicode_GetDefaultEncoding();
+ lower = PyMem_Malloc(strlen(encoding) + 1);
+
+ /* Convert encoding to lower case and replace '_' with '-' in order to
+ catch e.g. UTF_8 */
+ e = (char*)encoding;
+ l = lower;
+ while (*e) {
+ if (ISUPPER(*e)) {
+ *l++ = TOLOWER(*e++);
+ }
+ else if (*e == '_') {
+ *l++ = '-';
+ e++;
+ }
+ else {
+ *l++ = *e++;
+ }
+ }
+ *l = '\0';
+
/* Shortcuts for common default encodings */
- if (strcmp(encoding, "utf-8") == 0)
+ if (strcmp(lower, "utf-8") == 0)
return PyUnicode_DecodeUTF8(s, size, errors);
- else if (strcmp(encoding, "latin-1") == 0)
+ else if ((strcmp(lower, "latin-1") == 0) ||
+ (strcmp(lower, "iso-8859-1") == 0))
return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- else if (strcmp(encoding, "mbcs") == 0)
+ else if (strcmp(lower, "mbcs") == 0)
return PyUnicode_DecodeMBCS(s, size, errors);
#endif
- else if (strcmp(encoding, "ascii") == 0)
+ else if (strcmp(lower, "ascii") == 0)
return PyUnicode_DecodeASCII(s, size, errors);
+ else if (strcmp(lower, "utf-16") == 0)
+ return PyUnicode_DecodeUTF16(s, size, errors, 0);
+ else if (strcmp(lower, "utf-32") == 0)
+ return PyUnicode_DecodeUTF32(s, size, errors, 0);
+ PyMem_Free(lower);
+
/* Decode via the codec registry */
buffer = NULL;
if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
@@ -1470,7 +1500,7 @@
#define B64(n) \
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
- (isalnum(c) || (c) == '+' || (c) == '/')
+ (ISALNUM(c) || (c) == '+' || (c) == '/')
#define UB64(c) \
((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
@@ -2703,7 +2733,7 @@
}
for (i = 0; i < digits; ++i) {
c = (unsigned char) s[i];
- if (!isxdigit(c)) {
+ if (!ISXDIGIT(c)) {
endinpos = (s+i+1)-starts;
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
@@ -3077,7 +3107,7 @@
outpos = p-PyUnicode_AS_UNICODE(v);
for (x = 0, i = 0; i < count; ++i, ++s) {
c = (unsigned char)*s;
- if (!isxdigit(c)) {
+ if (!ISXDIGIT(c)) {
endinpos = s-starts;
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
Index: Lib/test/regrtest.py
===================================================================
--- Lib/test/regrtest.py (revision 58654)
+++ Lib/test/regrtest.py (working copy)
@@ -1119,6 +1119,15 @@
if not os.path.supports_unicode_filenames:
self.expected.add('test_pep277')
+ # doctest, profile and cProfile tests fail when the codec for the fs
+ # encoding isn't built in because PyUnicode_Decode() adds two calls
+ # into Python.
+ encs = ("utf-8", "latin-1", "ascii", "mbcs", "utf-16", "utf-32")
+ if sys.getfilesystemencoding().lower() not in encs:
+ self.expected.add('test_profile')
+ self.expected.add('test_cProfile')
+ self.expected.add('test_doctest')
+
try:
from test import test_socket_ssl
except ImportError:
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com