Author: Armin Rigo <ar...@tunes.org> Branch: py3k Changeset: r85878:904955c86e02 Date: 2016-07-25 11:15 +0200 http://bitbucket.org/pypy/pypy/changeset/904955c86e02/
Log: Handle the special case of \u03A3 in lower() diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py --- a/pypy/objspace/std/stringmethods.py +++ b/pypy/objspace/std/stringmethods.py @@ -152,7 +152,7 @@ builder = self._builder(len(value)) builder.append(self._upper(value[0])) for i in range(1, len(value)): - builder.append(self._lower(value[i])) + builder.append(self._lower_in_str(value, i)) return self._new(builder.build()) @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) @@ -452,9 +452,13 @@ value = self._val(space) builder = self._builder(len(value)) for i in range(len(value)): - builder.append(self._lower(value[i])) + builder.append(self._lower_in_str(value, i)) return self._new(builder.build()) + def _lower_in_str(self, value, i): + # overridden in unicodeobject.py + return self._lower(value[i]) + def descr_partition(self, space, w_sub): from pypy.objspace.std.bytearrayobject import W_BytearrayObject value = self._val(space) @@ -699,7 +703,7 @@ for i in range(len(selfvalue)): ch = selfvalue[i] if self._isupper(ch): - builder.append(self._lower(ch)) + builder.append(self._lower_in_str(selfvalue, i)) elif self._islower(ch): builder.append(self._upper(ch)) else: @@ -716,11 +720,12 @@ def title(self, value): builder = self._builder(len(value)) previous_is_cased = False - for ch in value: + for i in range(len(value)): + ch = value[i] if not previous_is_cased: builder.append(self._title(ch)) else: - builder.append(self._lower(ch)) + builder.append(self._lower_in_str(value, i)) previous_is_cased = self._iscased(ch) return builder.build() diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -978,10 +978,20 @@ raises(TypeError, 'u"".encode("utf-8", None)') def test_casefold(self): - assert 'hello'.casefold() == 'hello' - assert 'hELlo'.casefold() == 'hello' - assert 
'ß'.casefold() == 'ss' - assert 'ﬁ'.casefold() == 'fi' - assert '\u03a3'.casefold() == '\u03c3' - assert 'A\u0345\u03a3'.casefold() == 'a\u03b9\u03c3' - assert '\u00b5'.casefold() == '\u03bc' + assert u'hello'.casefold() == u'hello' + assert u'hELlo'.casefold() == u'hello' + assert u'ß'.casefold() == u'ss' + assert u'ﬁ'.casefold() == u'fi' + assert u'\u03a3'.casefold() == u'\u03c3' + assert u'A\u0345\u03a3'.casefold() == u'a\u03b9\u03c3' + assert u'\u00b5'.casefold() == u'\u03bc' + + def test_lower_3a3(self): + # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 + assert u'\u03a3'.lower() == u'\u03c3' + assert u'\u0345\u03a3'.lower() == u'\u0345\u03c3' + assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2' + assert u'A\u0345\u03a3a'.lower() == u'a\u0345\u03c3a' + assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2' + assert u'A\u03a3\u0345'.lower() == u'a\u03c2\u0345' + assert u'\u03a3\u0345 '.lower() == u'\u03c3\u0345 ' diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -173,7 +173,11 @@ return u''.join([unichr(x) for x in unicodedb.toupper_full(ord(ch))]) - def _lower(self, ch): + def _lower_in_str(self, value, i): + ch = value[i] + if ord(ch) == 0x3A3: + # Obscure special case. + return self._handle_capital_sigma(value, i) return u''.join([unichr(x) for x in unicodedb.tolower_full(ord(ch))]) @@ -181,6 +185,31 @@ return u''.join([unichr(x) for x in unicodedb.totitle_full(ord(ch))]) + def _handle_capital_sigma(self, value, i): + # U+03A3 is in the Final_Sigma context when it is found like this: + # \p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased}) + # where \p{xxx} is a character with property xxx. 
+ j = i - 1 + while j >= 0: + ch = value[j] + if not unicodedb.iscaseignorable(ord(ch)): + break + j -= 1 + final_sigma = j >= 0 and unicodedb.iscased(ord(ch)) + if final_sigma: + j = i + 1 + length = len(value) + while j < length: + ch = value[j] + if not unicodedb.iscaseignorable(ord(ch)): + break + j += 1 + final_sigma = j == length or not unicodedb.iscased(ord(ch)) + if final_sigma: + return unichr(0x3C2) + else: + return unichr(0x3C3) + def _newlist_unwrapped(self, space, lst): return space.newlist_unicode(lst) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit