STINNER Victor added the comment:
Hijacking locale.getpreferredencoding() may be dangerous. I attached a
new patch, force_ascii.patch, which uses a different approach: be more
strict than mbstowcs(), force the ASCII encoding when:
- the LC_CTYPE locale is C
- nl_langinfo(CODESET) is ASCII or an alias of ASCII
- mbstowcs() is able to decode non-ASCII characters
2012/11/12 STINNER Victor <rep...@bugs.python.org>
>
> STINNER Victor added the comment:
>
> Some tests are failing with the patch:
>
> ======================================================================
> FAIL: test_undecodable_env (test.test_subprocess.POSIXProcessTestCase)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
> File "/usr/home/haypo/prog/python/default/Lib/test/test_subprocess.py",
> line 1606, in test_undecodable_env
> self.assertEqual(stdout.decode('ascii'), ascii(value))
> AssertionError: "'abc\\xff'" != "'abc\\udcff'"
> - 'abc\xff'
> ? ^
> + 'abc\udcff'
> ? ^^^
>
> ======================================================================
> FAIL: test_strcoll_with_diacritic (test.test_locale.TestEnUSCollation)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
> File "/usr/home/haypo/prog/python/default/Lib/test/test_locale.py", line
> 364, in test_strcoll_with_diacritic
> self.assertLess(locale.strcoll('\xe0', 'b'), 0)
> AssertionError: 126 not less than 0
>
> ======================================================================
> FAIL: test_strxfrm_with_diacritic (test.test_locale.TestEnUSCollation)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
> File "/usr/home/haypo/prog/python/default/Lib/test/test_locale.py", line
> 367, in test_strxfrm_with_diacritic
> self.assertLess(locale.strxfrm('\xe0'), locale.strxfrm('b'))
> AssertionError: '\xe0' not less than 'b'
>
> ----------
>
> _______________________________________
> Python tracker <rep...@bugs.python.org>
> <http://bugs.python.org/issue16455>
> _______________________________________
>
----------
Added file: http://bugs.python.org/file27970/force_ascii.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16455>
_______________________________________
diff -r 6a6ad09faad2 Python/fileutils.c
--- a/Python/fileutils.c Mon Nov 12 01:23:51 2012 +0100
+++ b/Python/fileutils.c Mon Nov 12 15:33:24 2012 +0100
@@ -4,6 +4,7 @@
#endif
#ifdef HAVE_LANGINFO_H
+#include <locale.h>
#include <langinfo.h>
#endif
@@ -39,6 +40,104 @@ PyObject *
#ifdef HAVE_STAT
+/* Workaround FreeBSD and OpenIndiana locale encoding issue. On these
+ operating systems, nl_langinfo(CODESET) announces an alias of the ASCII
+ encoding, whereas mbstowcs() and wcstombs() functions use the ISO-8859-1
+ encoding. The problem is that os.fsencode() and os.fsdecode() use the
+ Python codec "ASCII". For example, if command line arguments are decoded
+ by mbstowcs() and encoded by os.fsencode(), we get a UnicodeEncodeError
+ instead of retrieving the original byte string.
+
+ The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C" and
+ nl_langinfo(CODESET) returns "ascii". The workaround is not used if
+ setlocale(LC_CTYPE, NULL) failed, or if nl_langinfo() or CODESET is not
+ available.
+
+ Values of locale_force_ascii:
+
+ 1: the workaround is used, the ASCII codec is used instead of mbstowcs()
+ and wcstombs() functions
+ 0: the workaround is not used
+ -1: unknown, need to call check_locale_force_ascii() to know the value
+*/
+static int locale_force_ascii = -1;
+
+extern char* _Py_GetLocaleEncoding(void);
+
+static int
+check_locale_force_ascii(void)
+{
+#ifdef MS_WINDOWS
+ return 0;
+#else
+ char *encoding, *loc;
+ int i;
+ unsigned char ch;
+ wchar_t wch;
+ size_t res;
+
+ return 1;
+
+ loc = setlocale(LC_CTYPE, NULL);
+ if (loc == NULL || strcmp(loc, "C") != 0) {
+ /* Failed to get the LC_CTYPE locale or it is different than C:
+ * don't use the workaround. */
+ return 0;
+ }
+
+ encoding = _Py_GetLocaleEncoding();
+ if (encoding == NULL) {
+ /* unknown encoding: consider that the encoding is not ASCII */
+ PyErr_Clear();
+ return 0;
+ }
+
+ if (strcmp(encoding, "ascii") != 0) {
+ free(encoding);
+ return 0;
+ }
+ free(encoding);
+
+ /* the locale is not set and nl_langinfo(CODESET) returns "ASCII"
+ (or an alias of the ASCII encoding). Check if the locale encoding
+ is really ASCII. */
+ for (i=0x80; i<0xff; i++) {
+ ch = (unsigned char)i;
+ res = mbstowcs(&wch, (char*)&ch, 1);
+ if (res == (size_t)-1) {
+ /* decoding a non-ASCII character from the locale encoding failed:
+ the encoding is really ASCII */
+ return 0;
+ }
+ }
+ return 1;
+#endif
+}
+
+static wchar_t*
+locale_decode_ascii(const char *arg, size_t *size)
+{
+ wchar_t *res;
+ unsigned char *in;
+ wchar_t *out;
+
+ res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
+ if (!res)
+ return NULL;
+
+ in = (unsigned char*)arg;
+ out = res;
+ while(*in)
+ if(*in < 128)
+ *out++ = *in++;
+ else
+ *out++ = 0xdc00 + *in++;
+ *out = 0;
+ if (size != NULL)
+ *size = out - res;
+ return res;
+}
+
/* Decode a byte string from the locale encoding with the
surrogateescape error handler (undecodable bytes are decoded as characters
in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
@@ -60,20 +159,33 @@ wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
wchar_t *res;
+ size_t argsize;
+ size_t count;
+ unsigned char *in;
+ wchar_t *out;
+#ifdef HAVE_MBRTOWC
+ mbstate_t mbs;
+#endif
+
+ if (locale_force_ascii == -1)
+ locale_force_ascii = check_locale_force_ascii();
+
+ if (locale_force_ascii) {
+ /* force ASCII encoding to workaround mbstowcs() issue */
+ res = locale_decode_ascii(arg, size);
+ if (res == NULL)
+ goto oom;
+ return res;
+ }
+
#ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that
* would result from conversion. Use an upper bound.
*/
- size_t argsize = strlen(arg);
+ argsize = strlen(arg);
#else
- size_t argsize = mbstowcs(NULL, arg, 0);
-#endif
- size_t count;
- unsigned char *in;
- wchar_t *out;
-#ifdef HAVE_MBRTOWC
- mbstate_t mbs;
+ argsize = mbstowcs(NULL, arg, 0);
#endif
if (argsize != (size_t)-1) {
res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
@@ -144,24 +256,16 @@ wchar_t*
argsize -= converted;
out++;
}
+ if (size != NULL)
+ *size = out - res;
#else
/* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes > 128. This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */
- res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
- if (!res)
+ res = locale_decode_ascii(arg, size);
+ if (res == NULL)
goto oom;
- in = (unsigned char*)arg;
- out = res;
- while(*in)
- if(*in < 128)
- *out++ = *in++;
- else
- *out++ = 0xdc00 + *in++;
- *out = 0;
#endif
- if (size != NULL)
- *size = out - res;
return res;
oom:
if (size != NULL)
@@ -169,6 +273,45 @@ oom:
return NULL;
}
+static char*
+locale_encode_ascii(const wchar_t *text, size_t *error_pos)
+{
+ char *result = NULL, *out;
+ size_t len, i;
+ wchar_t ch;
+
+ if (error_pos != NULL)
+ *error_pos = (size_t)-1;
+
+ len = wcslen(text);
+
+ result = PyMem_Malloc(len + 1); /* +1 for NUL byte */
+ if (result == NULL)
+ return NULL;
+
+ out = result;
+ for (i=0; i<len; i++) {
+ ch = text[i];
+
+ if (ch <= 0x7f) {
+ /* ASCII character */
+ *out++ = (char)ch;
+ }
+ else if (0xdc80 <= ch && ch <= 0xdcff) {
+ /* UTF-8b surrogate */
+ *out++ = (char)(ch - 0xdc00);
+ }
+ else {
+ if (error_pos != NULL)
+ *error_pos = i;
+ PyMem_Free(result);
+ return NULL;
+ }
+ }
+ *out = '\0';
+ return result;
+}
+
/* Encode a (wide) character string to the locale encoding with the
surrogateescape error handler (characters in range U+DC80..U+DCFF are
converted to bytes 0x80..0xFF).
@@ -191,6 +334,12 @@ char*
if (error_pos != NULL)
*error_pos = (size_t)-1;
+ if (locale_force_ascii == -1)
+ locale_force_ascii = check_locale_force_ascii();
+
+ if (locale_force_ascii)
+ return locale_encode_ascii(text, error_pos);
+
/* The function works in two steps:
1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */
@@ -231,7 +380,7 @@ char*
}
}
if (result != NULL) {
- *bytes = 0;
+ *bytes = '\0';
break;
}
diff -r 6a6ad09faad2 Python/pythonrun.c
--- a/Python/pythonrun.c Mon Nov 12 01:23:51 2012 +0100
+++ b/Python/pythonrun.c Mon Nov 12 15:33:24 2012 +0100
@@ -170,8 +170,8 @@ error:
return NULL;
}
-static char*
-get_locale_encoding(void)
+char*
+_Py_GetLocaleEncoding(void)
{
#ifdef MS_WINDOWS
char codepage[100];
@@ -868,7 +868,7 @@ initfsencoding(PyInterpreterState *inter
if (Py_FileSystemDefaultEncoding == NULL)
{
- Py_FileSystemDefaultEncoding = get_locale_encoding();
+ Py_FileSystemDefaultEncoding = _Py_GetLocaleEncoding();
if (Py_FileSystemDefaultEncoding == NULL)
Py_FatalError("Py_Initialize: Unable to get the locale encoding");
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com