[issue10156] Initialization of globals in unicodeobject.c

Serhiy Storchaka Thu, 24 Jan 2013 12:23:40 -0800

Serhiy Storchaka added the comment:

Here is a set of updated patches.


----------
Added file: http://bugs.python.org/file28815/unicode_globals-2.7_2.patch
Added file: http://bugs.python.org/file28816/unicode_globals-3.2_2.patch
Added file: http://bugs.python.org/file28817/unicode_globals-3.3_2.patch
Added file: http://bugs.python.org/file28818/unicode_globals-3.4_2.patch

_______________________________________
Python tracker <[email protected]>
<http://bugs.python.org/issue10156>
_______________________________________

diff -r 8f2edea69d5d Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu Jan 24 07:28:33 2013 -0800
+++ b/Objects/unicodeobject.c   Thu Jan 24 22:14:14 2013 +0200
@@ -93,15 +93,27 @@
 #endif
 
 /* Free list for Unicode objects */
-static PyUnicodeObject *free_list;
-static int numfree;
+static PyUnicodeObject *free_list = NULL;
+static int numfree = 0;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyUnicodeObject *unicode_empty;
+static PyUnicodeObject *unicode_empty = NULL;
+
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = _PyUnicode_New(0);          \
+            if (unicode_empty != NULL)                  \
+                Py_INCREF(unicode_empty);               \
+        }                                               \
+        return (PyObject *)unicode_empty;               \
+    } while (0)
 
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
-static PyUnicodeObject *unicode_latin1[256];
+static PyUnicodeObject *unicode_latin1[256] = {NULL};
 
 /* Default encoding to use and assume when NULL is passed as encoding
    parameter; it is initialized by _PyUnicode_Init().
@@ -110,7 +122,7 @@
    PyUnicode_GetDefaultEncoding() APIs to access this global.
 
 */
-static char unicode_default_encoding[100];
+static char unicode_default_encoding[100 + 1] = "ascii";
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -204,7 +216,7 @@
 
 #define BLOOM_MASK unsigned long
 
-static BLOOM_MASK bloom_linebreak;
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
 
 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
@@ -448,10 +460,8 @@
     if (u != NULL) {
 
         /* Optimization for empty strings */
-        if (size == 0 && unicode_empty != NULL) {
-            Py_INCREF(unicode_empty);
-            return (PyObject *)unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
 
         /* Single character Unicode objects in the Latin-1 range are
            shared when using this constructor */
@@ -497,10 +507,8 @@
     if (u != NULL) {
 
         /* Optimization for empty strings */
-        if (size == 0 && unicode_empty != NULL) {
-            Py_INCREF(unicode_empty);
-            return (PyObject *)unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
 
         /* Single characters are shared when using this constructor.
            Restrict to ASCII, since the input must be UTF-8. */
@@ -1162,13 +1170,10 @@
     }
 
     /* Convert to Unicode */
-    if (len == 0) {
-        Py_INCREF(unicode_empty);
-        v = (PyObject *)unicode_empty;
-    }
-    else
-        v = PyUnicode_Decode(s, len, encoding, errors);
-
+    if (len == 0)
+        _Py_RETURN_UNICODE_EMPTY();
+
+    v = PyUnicode_Decode(s, len, encoding, errors);
     return v;
 
   onError:
@@ -1381,7 +1386,7 @@
     Py_DECREF(v);
     strncpy(unicode_default_encoding,
             encoding,
-            sizeof(unicode_default_encoding));
+            sizeof(unicode_default_encoding) - 1);
     return 0;
 
   onError:
@@ -8850,8 +8855,6 @@
 
 void _PyUnicode_Init(void)
 {
-    int i;
-
     /* XXX - move this array to unicodectype.c ? */
     Py_UNICODE linebreak[] = {
         0x000A, /* LINE FEED */
@@ -8865,15 +8868,10 @@
     };
 
     /* Init the implementation */
-    free_list = NULL;
-    numfree = 0;
     unicode_empty = _PyUnicode_New(0);
     if (!unicode_empty)
         return;
 
-    strcpy(unicode_default_encoding, "ascii");
-    for (i = 0; i < 256; i++)
-        unicode_latin1[i] = NULL;
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
 
@@ -8918,15 +8916,11 @@
 {
     int i;
 
-    Py_XDECREF(unicode_empty);
-    unicode_empty = NULL;
-
-    for (i = 0; i < 256; i++) {
-        if (unicode_latin1[i]) {
-            Py_DECREF(unicode_latin1[i]);
-            unicode_latin1[i] = NULL;
-        }
-    }
+    Py_CLEAR(unicode_empty);
+
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
+
     (void)PyUnicode_ClearFreeList();
 }

diff -r 5b02d622d625 Lib/re.py
--- a/Lib/re.py Thu Jan 24 07:23:34 2013 -0800
+++ b/Lib/re.py Thu Jan 24 22:14:22 2013 +0200
@@ -233,7 +233,8 @@
         alphanum = _alphanum_str
         s = list(pattern)
         for i, c in enumerate(pattern):
-            if c not in alphanum:
+            if c not in alphanum and not ('\U00010000'[1:] and
+                                          0xdc00 <= ord(c) < 0xe000):
                 if c == "\000":
                     s[i] = "\\000"
                 else:
diff -r 5b02d622d625 Lib/sre_parse.py
--- a/Lib/sre_parse.py  Thu Jan 24 07:23:34 2013 -0800
+++ b/Lib/sre_parse.py  Thu Jan 24 22:14:22 2013 +0200
@@ -177,26 +177,32 @@
 
 class Tokenizer:
     def __init__(self, string):
-        self.string = string
+        if isinstance(string, bytes):
+            self.string = string.decode('latin1')
+        else:
+            self.string = string
         self.index = 0
         self.__next()
     def __next(self):
         if self.index >= len(self.string):
             self.next = None
             return
-        char = self.string[self.index:self.index+1]
-        # Special case for the str8, since indexing returns a integer
-        # XXX This is only needed for test_bug_926075 in test_re.py
-        if char and isinstance(char, bytes):
-            char = chr(char[0])
+        char = self.string[self.index]
         if char == "\\":
             try:
                 c = self.string[self.index + 1]
             except IndexError:
                 raise error("bogus escape (end of line)")
-            if isinstance(self.string, bytes):
-                c = chr(c)
             char = char + c
+        else:
+            c = char
+        if '\U00010000'[1:] and 0xd800 <= ord(c) < 0xdc00:
+            try:
+                c2 = self.string[self.index + len(char)]
+                if 0xdc00 <= ord(c2) < 0xe000:
+                    char += c2
+            except IndexError:
+                pass
         self.index = self.index + len(char)
         self.next = char
     def match(self, char, skip=1):
@@ -238,7 +244,7 @@
     if code and code[0] == IN:
         return code
     try:
-        c = escape[1:2]
+        c = escape[1:]
         if c == "x":
             # hexadecimal escape (exactly two digits)
             while source.next in HEXDIGITS and len(escape) < 4:
@@ -255,8 +261,8 @@
             return LITERAL, int(escape, 8) & 0xff
         elif c in DIGITS:
             raise error("bogus escape: %s" % repr(escape))
-        if len(escape) == 2:
-            return LITERAL, ord(escape[1])
+        if c:
+            return LITERAL, ord(c)
     except ValueError:
         pass
     raise error("bogus escape: %s" % repr(escape))
@@ -270,7 +276,7 @@
     if code:
         return code
     try:
-        c = escape[1:2]
+        c = escape[1:]
         if c == "x":
             # hexadecimal escape
             while source.next in HEXDIGITS and len(escape) < 4:
@@ -299,8 +305,8 @@
                     raise error("cannot refer to open group")
                 return GROUPREF, group
             raise ValueError
-        if len(escape) == 2:
-            return LITERAL, ord(escape[1])
+        if c:
+            return LITERAL, ord(c)
     except ValueError:
         pass
     raise error("bogus escape: %s" % repr(escape))
@@ -458,7 +464,7 @@
                         lo = code1[1]
                         hi = code2[1]
                         if hi < lo:
-                            raise error("bad character range")
+                            raise error("bad character range %x %x" % (lo, hi))
                         setappend((RANGE, (lo, hi)))
                     else:
                         raise error("unexpected end of regular expression")
@@ -704,6 +710,7 @@
     elif tail:
         raise error("bogus characters at end of regular expression")
 
+    #flags |= SRE_FLAG_DEBUG
     if flags & SRE_FLAG_DEBUG:
         p.dump()
 
diff -r 5b02d622d625 Lib/test/test_re.py
--- a/Lib/test/test_re.py       Thu Jan 24 07:23:34 2013 -0800
+++ b/Lib/test/test_re.py       Thu Jan 24 22:14:22 2013 +0200
@@ -522,12 +522,23 @@
         self.assertMatch(re.escape(p), p)
 
     def test_re_escape_non_ascii(self):
-        s = 'xxx\u2620\u2620\u2620xxx'
+        #s = 'xxx\u2620\u2620\u2620xxx'
+        #s_escaped = re.escape(s)
+        #self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
+        #self.assertMatch(s_escaped, s)
+        #self.assertMatch('.%s+.' % re.escape('\u2620'), s,
+                         #'x\u2620\u2620\u2620x', (2, 7), re.search)
+        print('*********')
+        s = 'xxx\U0001d11e\U0001d11e\U0001d11exxx'
         s_escaped = re.escape(s)
-        self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
+        p = re.compile('.%s+.' % re.escape('\U0001d11e'), re.DEBUG)
+        print(p.code)
+        m = re.search('.%s+.' % re.escape('\U0001d11e'), s)
+        self.assertEqual(m.group(), 'x\U0001d11e\U0001d11e\U0001d11ex')
+        self.assertEqual(s_escaped, 'xxx\\\U0001d11e\\\U0001d11e\\\U0001d11exxx')
         self.assertMatch(s_escaped, s)
-        self.assertMatch('.%s+.' % re.escape('\u2620'), s,
-                         'x\u2620\u2620\u2620x', (2, 7), re.search)
+        self.assertMatch('.%s+.' % re.escape('\U0001d11e'), s,
+                         'x\U0001d11e\U0001d11e\U0001d11ex', (2, len(s) - 2), re.search)
 
     def test_re_escape_non_ascii_bytes(self):
         b = 'y\u2620y\u2620y'.encode('utf-8')
diff -r 5b02d622d625 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu Jan 24 07:23:34 2013 -0800
+++ b/Objects/unicodeobject.c   Thu Jan 24 22:14:22 2013 +0200
@@ -98,18 +98,30 @@
    Another way to look at this is that to say that the actual reference
    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
-static PyObject *interned;
+static PyObject *interned = NULL;
 
 /* Free list for Unicode objects */
-static PyUnicodeObject *free_list;
-static int numfree;
+static PyUnicodeObject *free_list = NULL;
+static int numfree = 0;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyUnicodeObject *unicode_empty;
+static PyUnicodeObject *unicode_empty = NULL;
+
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = _PyUnicode_New(0);          \
+            if (unicode_empty != NULL)                  \
+                Py_INCREF(unicode_empty);               \
+        }                                               \
+        return (PyObject *)unicode_empty;               \
+    } while (0)
 
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
-static PyUnicodeObject *unicode_latin1[256];
+static PyUnicodeObject *unicode_latin1[256] = {NULL};
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -214,7 +226,7 @@
 
 #define BLOOM_MASK unsigned long
 
-static BLOOM_MASK bloom_linebreak;
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
 
 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
@@ -479,10 +491,8 @@
     if (u != NULL) {
 
         /* Optimization for empty strings */
-        if (size == 0 && unicode_empty != NULL) {
-            Py_INCREF(unicode_empty);
-            return (PyObject *)unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
 
         /* Single character Unicode objects in the Latin-1 range are
            shared when using this constructor */
@@ -528,10 +538,8 @@
     if (u != NULL) {
 
         /* Optimization for empty strings */
-        if (size == 0 && unicode_empty != NULL) {
-            Py_INCREF(unicode_empty);
-            return (PyObject *)unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
 
         /* Single characters are shared when using this constructor.
            Restrict to ASCII, since the input must be UTF-8. */
@@ -1393,15 +1401,11 @@
 
     /* Decoding bytes objects is the most common case and should be fast */
     if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0) {
-            Py_INCREF(unicode_empty);
-            v = (PyObject *) unicode_empty;
-        }
-        else {
-            v = PyUnicode_Decode(
-                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
-                    encoding, errors);
-        }
+        if (PyBytes_GET_SIZE(obj) == 0)
+            _Py_RETURN_UNICODE_EMPTY();
+        v = PyUnicode_Decode(
+                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
+                encoding, errors);
         return v;
     }
 
@@ -1421,12 +1425,11 @@
     }
 
     if (buffer.len == 0) {
-        Py_INCREF(unicode_empty);
-        v = (PyObject *) unicode_empty;
-    }
-    else
-        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
-
+        PyBuffer_Release(&buffer);
+        _Py_RETURN_UNICODE_EMPTY();
+    }
+
+    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
     PyBuffer_Release(&buffer);
     return v;
 }
@@ -8323,10 +8326,8 @@
     Py_ssize_t nchars;
     size_t nbytes;
 
-    if (len < 1) {
-        Py_INCREF(unicode_empty);
-        return (PyObject *)unicode_empty;
-    }
+    if (len < 1)
+        _Py_RETURN_UNICODE_EMPTY();
 
     if (len == 1 && PyUnicode_CheckExact(str)) {
         /* no repeat, return original string */
@@ -10056,8 +10057,6 @@
 
 void _PyUnicode_Init(void)
 {
-    int i;
-
     /* XXX - move this array to unicodectype.c ? */
     Py_UNICODE linebreak[] = {
         0x000A, /* LINE FEED */
@@ -10071,14 +10070,10 @@
     };
 
     /* Init the implementation */
-    free_list = NULL;
-    numfree = 0;
     unicode_empty = _PyUnicode_New(0);
     if (!unicode_empty)
         return;
 
-    for (i = 0; i < 256; i++)
-        unicode_latin1[i] = NULL;
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
 
@@ -10123,15 +10118,11 @@
 {
     int i;
 
-    Py_XDECREF(unicode_empty);
-    unicode_empty = NULL;
-
-    for (i = 0; i < 256; i++) {
-        if (unicode_latin1[i]) {
-            Py_DECREF(unicode_latin1[i]);
-            unicode_latin1[i] = NULL;
-        }
-    }
+    Py_CLEAR(unicode_empty);
+
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
+
     (void)PyUnicode_ClearFreeList();
 }
 
@@ -10250,8 +10241,7 @@
             "mortal/immortal\n", mortal_size, immortal_size);
     Py_DECREF(keys);
     PyDict_Clear(interned);
-    Py_DECREF(interned);
-    interned = NULL;
+    Py_CLEAR(interned);
 }

diff -r 99db73ce8374 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu Jan 24 20:03:49 2013 +0200
+++ b/Objects/unicodeobject.c   Thu Jan 24 22:16:03 2013 +0200
@@ -179,17 +179,34 @@
    Another way to look at this is that to say that the actual reference
    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
-static PyObject *interned;
+static PyObject *interned = NULL;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty;
+static PyObject *unicode_empty = NULL;
+
+#define _Py_INCREF_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = PyUnicode_New(0, 0);        \
+            if (unicode_empty != NULL)                  \
+                Py_INCREF(unicode_empty);               \
+        }                                               \
+    } while (0)
+
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        _Py_INCREF_UNICODE_EMPTY();                     \
+        return unicode_empty;                           \
+    } while (0)
 
 /* List of static strings. */
-static _Py_Identifier *static_strings;
+static _Py_Identifier *static_strings = NULL;
 
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
-static PyObject *unicode_latin1[256];
+static PyObject *unicode_latin1[256] = {NULL};
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -416,9 +433,8 @@
 
     len = _PyUnicode_WSTR_LENGTH(unicode);
     if (len == 0) {
-        Py_INCREF(unicode_empty);
         Py_DECREF(unicode);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     if (len == 1) {
@@ -450,8 +466,8 @@
     length = PyUnicode_GET_LENGTH(unicode);
     if (length == 0) {
         if (unicode != unicode_empty) {
-            Py_INCREF(unicode_empty);
             Py_DECREF(unicode);
+            _Py_RETURN_UNICODE_EMPTY();
         }
         return unicode_empty;
     }
@@ -528,7 +544,7 @@
 
 #define BLOOM_MASK unsigned long
 
-static BLOOM_MASK bloom_linebreak;
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
 
 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
@@ -1582,9 +1598,11 @@
         return 0;
 
     if (length == 0) {
+        _Py_INCREF_UNICODE_EMPTY();
+        if (!unicode_empty)
+            return -1;
         Py_DECREF(*p_unicode);
         *p_unicode = unicode_empty;
-        Py_INCREF(*p_unicode);
         return 0;
     }
 
@@ -1731,10 +1749,8 @@
        some optimizations which share commonly used objects. */
 
     /* Optimization for empty strings */
-    if (size == 0 && unicode_empty != NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
@@ -1893,10 +1909,8 @@
     PyObject *res;
     unsigned char max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1)
         return get_latin1_char(u[0]);
@@ -1916,10 +1930,8 @@
     PyObject *res;
     Py_UCS2 max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1) {
         Py_UCS4 ch = u[0];
@@ -1954,10 +1966,8 @@
     PyObject *res;
     Py_UCS4 max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1) {
         Py_UCS4 ch = u[0];
@@ -2249,10 +2259,8 @@
 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
 {
     if (w == NULL) {
-        if (size == 0) {
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
         PyErr_BadInternalCall();
         return NULL;
     }
@@ -3007,15 +3015,11 @@
 
     /* Decoding bytes objects is the most common case and should be fast */
     if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0) {
-            Py_INCREF(unicode_empty);
-            v = unicode_empty;
-        }
-        else {
-            v = PyUnicode_Decode(
-                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
-                    encoding, errors);
-        }
+        if (PyBytes_GET_SIZE(obj) == 0)
+            _Py_RETURN_UNICODE_EMPTY();
+        v = PyUnicode_Decode(
+                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
+                encoding, errors);
         return v;
     }
 
@@ -3035,12 +3039,11 @@
     }
 
     if (buffer.len == 0) {
-        Py_INCREF(unicode_empty);
-        v = unicode_empty;
-    }
-    else
-        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
-
+        PyBuffer_Release(&buffer);
+        _Py_RETURN_UNICODE_EMPTY();
+    }
+
+    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
     PyBuffer_Release(&buffer);
     return v;
 }
@@ -4720,8 +4723,7 @@
     if (size == 0) {
         if (consumed)
             *consumed = 0;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
@@ -5232,8 +5234,7 @@
     if (q == e) {
         if (consumed)
             *consumed = size;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -6558,10 +6559,8 @@
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
     if (size == 1 && (unsigned char)s[0] < 128)
@@ -6940,8 +6939,7 @@
         if (chunk_size == 0 && done) {
             if (v != NULL)
                 break;
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
+            _Py_RETURN_UNICODE_EMPTY();
         }
 
 
@@ -9503,9 +9501,7 @@
     /* If empty sequence, return u"". */
     if (seqlen == 0) {
         Py_DECREF(fseq);
-        Py_INCREF(unicode_empty);
-        res = unicode_empty;
-        return res;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* If singleton sequence with an exact Unicode, return that. */
@@ -10205,7 +10201,9 @@
         }
         new_size = slen + n * (len2 - len1);
         if (new_size == 0) {
-            Py_INCREF(unicode_empty);
+            _Py_INCREF_UNICODE_EMPTY();
+            if (!unicode_empty)
+                goto error;
             u = unicode_empty;
             goto done;
         }
@@ -11672,10 +11670,8 @@
         PyErr_SetString(PyExc_IndexError, "string index out of range");
         return NULL;
     }
-    if (start >= length || end < start) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (start >= length || end < start)
+        _Py_RETURN_UNICODE_EMPTY();
 
     length = end - start;
     if (PyUnicode_IS_ASCII(self)) {
@@ -11802,10 +11798,8 @@
     PyObject *u;
     Py_ssize_t nchars, n;
 
-    if (len < 1) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (len < 1)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* no repeat, return original string */
     if (len == 1)
@@ -12924,8 +12918,7 @@
 {
     if (writer->pos == 0) {
         Py_XDECREF(writer->buffer);
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
     if (writer->readonly) {
         assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
@@ -13143,8 +13136,7 @@
         }
 
         if (slicelength <= 0) {
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
+            _Py_RETURN_UNICODE_EMPTY();
         } else if (start == 0 && step == 1 &&
                    slicelength == PyUnicode_GET_LENGTH(self)) {
             return unicode_result_unchanged(self);
@@ -13974,10 +13966,8 @@
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
                                      kwlist, &x, &encoding, &errors))
         return NULL;
-    if (x == NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (x == NULL)
+        _Py_RETURN_UNICODE_EMPTY();
     if (encoding == NULL && errors == NULL)
         return PyObject_Str(x);
     else
@@ -14146,8 +14136,6 @@
 
 int _PyUnicode_Init(void)
 {
-    int i;
-
     /* XXX - move this array to unicodectype.c ? */
     Py_UCS2 linebreak[] = {
         0x000A, /* LINE FEED */
@@ -14161,13 +14149,13 @@
     };
 
     /* Init the implementation */
-    unicode_empty = PyUnicode_New(0, 0);
-    if (!unicode_empty)
-        Py_FatalError("Can't create empty string");
+    if (unicode_empty == NULL) {
+        unicode_empty = PyUnicode_New(0, 0);
+        if (!unicode_empty)
+            Py_FatalError("Can't create empty string");
+    }
     assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
 
-    for (i = 0; i < 256; i++)
-        unicode_latin1[i] = NULL;
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
 
@@ -14207,15 +14195,10 @@
 {
     int i;
 
-    Py_XDECREF(unicode_empty);
-    unicode_empty = NULL;
-
-    for (i = 0; i < 256; i++) {
-        if (unicode_latin1[i]) {
-            Py_DECREF(unicode_latin1[i]);
-            unicode_latin1[i] = NULL;
-        }
-    }
+    Py_CLEAR(unicode_empty);
+
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
     _PyUnicode_ClearStaticStrings();
     (void)PyUnicode_ClearFreeList();
 }
@@ -14344,8 +14327,7 @@
             "mortal/immortal\n", mortal_size, immortal_size);
     Py_DECREF(keys);
     PyDict_Clear(interned);
-    Py_DECREF(interned);
-    interned = NULL;
+    Py_CLEAR(interned);
 }

diff -r ab0ff935126c Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu Jan 24 20:04:37 2013 +0200
+++ b/Objects/unicodeobject.c   Thu Jan 24 22:16:10 2013 +0200
@@ -171,17 +171,32 @@
    Another way to look at this is that to say that the actual reference
    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
-static PyObject *interned;
+static PyObject *interned = NULL;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty;
+static PyObject *unicode_empty = NULL;
+
+#define _Py_INCREF_UNICODE_EMPTY() do {                 \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = PyUnicode_New(0, 0);        \
+            if (unicode_empty != NULL)                  \
+                Py_INCREF(unicode_empty);               \
+        }                                               \
+    } while (0)
+
+#define _Py_RETURN_UNICODE_EMPTY()  do {                \
+        _Py_INCREF_UNICODE_EMPTY();                     \
+        return unicode_empty;                           \
+    } while (0)
 
 /* List of static strings. */
-static _Py_Identifier *static_strings;
+static _Py_Identifier *static_strings = NULL;
 
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
-static PyObject *unicode_latin1[256];
+static PyObject *unicode_latin1[256] = {NULL};
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -406,9 +421,8 @@
 
     len = _PyUnicode_WSTR_LENGTH(unicode);
     if (len == 0) {
-        Py_INCREF(unicode_empty);
         Py_DECREF(unicode);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     if (len == 1) {
@@ -442,8 +456,8 @@
     length = PyUnicode_GET_LENGTH(unicode);
     if (length == 0) {
         if (unicode != unicode_empty) {
-            Py_INCREF(unicode_empty);
             Py_DECREF(unicode);
+            _Py_RETURN_UNICODE_EMPTY();
         }
         return unicode_empty;
     }
@@ -520,7 +534,7 @@
 
 #define BLOOM_MASK unsigned long
 
-static BLOOM_MASK bloom_linebreak;
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
 
 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
@@ -1602,9 +1616,11 @@
         return 0;
 
     if (length == 0) {
+        _Py_INCREF_UNICODE_EMPTY();
+        if (!unicode_empty)
+            return -1;
         Py_DECREF(*p_unicode);
         *p_unicode = unicode_empty;
-        Py_INCREF(*p_unicode);
         return 0;
     }
 
@@ -1727,10 +1743,8 @@
        some optimizations which share commonly used objects. */
 
     /* Optimization for empty strings */
-    if (size == 0 && unicode_empty != NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
@@ -1889,10 +1903,8 @@
     PyObject *res;
     unsigned char max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1)
         return get_latin1_char(u[0]);
@@ -1912,10 +1924,8 @@
     PyObject *res;
     Py_UCS2 max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1) {
         Py_UCS4 ch = u[0];
@@ -1950,10 +1960,8 @@
     PyObject *res;
     Py_UCS4 max_char;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     assert(size > 0);
     if (size == 1) {
         Py_UCS4 ch = u[0];
@@ -2245,10 +2253,8 @@
 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
 {
     if (w == NULL) {
-        if (size == 0) {
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
-        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
         PyErr_BadInternalCall();
         return NULL;
     }
@@ -2825,15 +2831,11 @@
 
     /* Decoding bytes objects is the most common case and should be fast */
     if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0) {
-            Py_INCREF(unicode_empty);
-            v = unicode_empty;
-        }
-        else {
-            v = PyUnicode_Decode(
-                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
-                    encoding, errors);
-        }
+        if (PyBytes_GET_SIZE(obj) == 0)
+            _Py_RETURN_UNICODE_EMPTY();
+        v = PyUnicode_Decode(
+                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
+                encoding, errors);
         return v;
     }
 
@@ -2853,12 +2855,11 @@
     }
 
     if (buffer.len == 0) {
-        Py_INCREF(unicode_empty);
-        v = unicode_empty;
-    }
-    else
-        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
-
+        PyBuffer_Release(&buffer);
+        _Py_RETURN_UNICODE_EMPTY();
+    }
+
+    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
     PyBuffer_Release(&buffer);
     return v;
 }
@@ -4201,8 +4202,7 @@
     if (size == 0) {
         if (consumed)
             *consumed = 0;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* Start off assuming it's all ASCII. Widen later as necessary. */
@@ -4609,8 +4609,7 @@
     if (size == 0) {
         if (consumed)
             *consumed = 0;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
@@ -4868,8 +4867,7 @@
     if (q == e) {
         if (consumed)
             *consumed = size;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
 #ifdef WORDS_BIGENDIAN
@@ -5108,8 +5106,7 @@
     if (q == e) {
         if (consumed)
             *consumed = size;
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
 #if PY_LITTLE_ENDIAN
@@ -5386,10 +5383,8 @@
     Py_ssize_t len;
 
     len = length_of_escaped_ascii_string(s, size);
-    if (len == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (len == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* After length_of_escaped_ascii_string() there are two alternatives,
        either the string is pure ASCII with named escapes like \n, etc.
@@ -5781,10 +5776,8 @@
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* Escaped strings will always be longer than the resulting
        Unicode string, so we start with size here and then reduce the
@@ -5988,10 +5981,8 @@
                      1))
         return NULL;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* XXX overflow detection missing */
     _PyUnicodeWriter_Init(&writer, 0);
@@ -6439,10 +6430,8 @@
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
     if (size == 1 && (unsigned char)s[0] < 128)
@@ -6820,8 +6809,7 @@
         if (chunk_size == 0 && done) {
             if (v != NULL)
                 break;
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
+            _Py_RETURN_UNICODE_EMPTY();
         }
 
 
@@ -7298,10 +7286,8 @@
     if (mapping == NULL)
         return PyUnicode_DecodeLatin1(s, size, errors);
 
-    if (size == 0) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
     _PyUnicodeWriter_Init(&writer, 0);
     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
         goto onError;
@@ -9354,9 +9340,7 @@
     /* If empty sequence, return u"". */
     if (seqlen == 0) {
         Py_DECREF(fseq);
-        Py_INCREF(unicode_empty);
-        res = unicode_empty;
-        return res;
+        _Py_RETURN_UNICODE_EMPTY();
     }
 
     /* If singleton sequence with an exact Unicode, return that. */
@@ -10056,7 +10040,9 @@
         }
         new_size = slen + n * (len2 - len1);
         if (new_size == 0) {
-            Py_INCREF(unicode_empty);
+            _Py_INCREF_UNICODE_EMPTY();
+            if (!unicode_empty)
+                goto error;
             u = unicode_empty;
             goto done;
         }
@@ -11559,10 +11545,8 @@
         PyErr_SetString(PyExc_IndexError, "string index out of range");
         return NULL;
     }
-    if (start >= length || end < start) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (start >= length || end < start)
+        _Py_RETURN_UNICODE_EMPTY();
 
     length = end - start;
     if (PyUnicode_IS_ASCII(self)) {
@@ -11689,10 +11673,8 @@
     PyObject *u;
     Py_ssize_t nchars, n;
 
-    if (len < 1) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (len < 1)
+        _Py_RETURN_UNICODE_EMPTY();
 
     /* no repeat, return original string */
     if (len == 1)
@@ -12832,8 +12814,7 @@
 {
     if (writer->pos == 0) {
         Py_XDECREF(writer->buffer);
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
+        _Py_RETURN_UNICODE_EMPTY();
     }
     if (writer->readonly) {
         assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
@@ -13051,8 +13032,7 @@
         }
 
         if (slicelength <= 0) {
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
+            _Py_RETURN_UNICODE_EMPTY();
         } else if (start == 0 && step == 1 &&
                    slicelength == PyUnicode_GET_LENGTH(self)) {
             return unicode_result_unchanged(self);
@@ -14056,10 +14036,8 @@
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
                                      kwlist, &x, &encoding, &errors))
         return NULL;
-    if (x == NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
+    if (x == NULL)
+        _Py_RETURN_UNICODE_EMPTY();
     if (encoding == NULL && errors == NULL)
         return PyObject_Str(x);
     else
@@ -14228,8 +14206,6 @@
 
 int _PyUnicode_Init(void)
 {
-    int i;
-
     /* XXX - move this array to unicodectype.c ? */
     Py_UCS2 linebreak[] = {
         0x000A, /* LINE FEED */
@@ -14243,13 +14219,13 @@
     };
 
     /* Init the implementation */
-    unicode_empty = PyUnicode_New(0, 0);
-    if (!unicode_empty)
-        Py_FatalError("Can't create empty string");
+    if (unicode_empty == NULL) {
+        unicode_empty = PyUnicode_New(0, 0);
+        if (!unicode_empty)
+            Py_FatalError("Can't create empty string");
+    }
     assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
 
-    for (i = 0; i < 256; i++)
-        unicode_latin1[i] = NULL;
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
 
@@ -14289,15 +14265,10 @@
 {
     int i;
 
-    Py_XDECREF(unicode_empty);
-    unicode_empty = NULL;
-
-    for (i = 0; i < 256; i++) {
-        if (unicode_latin1[i]) {
-            Py_DECREF(unicode_latin1[i]);
-            unicode_latin1[i] = NULL;
-        }
-    }
+    Py_CLEAR(unicode_empty);
+
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
     _PyUnicode_ClearStaticStrings();
     (void)PyUnicode_ClearFreeList();
 }
@@ -14426,8 +14397,7 @@
             "mortal/immortal\n", mortal_size, immortal_size);
     Py_DECREF(keys);
     PyDict_Clear(interned);
-    Py_DECREF(interned);
-    interned = NULL;
+    Py_CLEAR(interned);
 }

_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

[issue10156] Initialization of globals in unicodeobject.c

Reply via email to