Ezio Melotti <ezio.melo...@gmail.com> added the comment:
test attachments
----------
Added file: http://bugs.python.org/file23325/unnamed
Added file: http://bugs.python.org/file23326/issue12753-3.diff
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue2771>
_______________________________________
test attachments<br>
<br><br>
diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -29,6 +29,9 @@
Look up character by name. If a character with the given name is found, return
the corresponding character. If not found, :exc:`KeyError` is raised.
+ .. versionchanged:: 3.3
+ Support for name aliases [#]_ and named sequences [#]_ has been added.
+
.. function:: name(chr[, default])
@@ -160,3 +163,9 @@
>>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
'AN'
+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt
diff --git a/Doc/reference/lexical_analysis.rst
b/Doc/reference/lexical_analysis.rst
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -492,13 +492,13 @@
+-----------------+---------------------------------+-------+
| Escape Sequence | Meaning | Notes |
+=================+=================================+=======+
-| ``\N{name}`` | Character named *name* in the | |
+| ``\N{name}`` | Character named *name* in the | \(4) |
| | Unicode database | |
+-----------------+---------------------------------+-------+
-| ``\uxxxx`` | Character with 16-bit hex value | \(4) |
+| ``\uxxxx`` | Character with 16-bit hex value | \(5) |
| | *xxxx* | |
+-----------------+---------------------------------+-------+
-| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) |
+| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) |
| | *xxxxxxxx* | |
+-----------------+---------------------------------+-------+
@@ -516,10 +516,14 @@
with the given value.
(4)
+ .. versionchanged:: 3.3
+ Support for name aliases [#]_ has been added.
+
+(5)
Individual code units which form parts of a surrogate pair can be encoded using
this escape sequence. Exactly four hex digits are required.
-(5)
+(6)
Any Unicode character can be encoded this way, but characters outside the Basic
Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
compiled to use 16-bit code units (the default). Exactly eight hex digits
@@ -706,3 +710,8 @@
occurrence outside string literals and comments is an unconditional error::
$ ? `
+
+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -8,8 +8,11 @@
"""#"
import unittest
+import unicodedata
from test import support
+from http.client import HTTPException
+from test.test_normalization import check_version
class UnicodeNamesTest(unittest.TestCase):
@@ -59,8 +62,6 @@
)
def test_ascii_letters(self):
- import unicodedata
-
for char in "".join(map(chr, range(ord("a"), ord("z")))):
name = "LATIN SMALL LETTER %s" % char.upper()
code = unicodedata.lookup(name)
@@ -81,7 +82,6 @@
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
- import unicodedata
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
def test_cjk_unified_ideographs(self):
@@ -97,14 +97,11 @@
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
def test_bmp_characters(self):
- import unicodedata
- count = 0
for code in range(0x10000):
char = chr(code)
name = unicodedata.name(char, None)
if name is not None:
self.assertEqual(unicodedata.lookup(name), char)
- count += 1
def test_misc_symbols(self):
self.checkletter("PILCROW SIGN", "\u00b6")
@@ -112,8 +109,65 @@
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
+ def test_aliases(self):
+ # Check that the aliases defined in the NameAliases.txt file work.
+ # This should be updated when new aliases are added or the file
+ # should be downloaded and parsed instead. See #12753.
+ aliases = [
+ ('LATIN CAPITAL LETTER GHA', 0x01A2),
+ ('LATIN SMALL LETTER GHA', 0x01A3),
+ ('KANNADA LETTER LLLA', 0x0CDE),
+ ('LAO LETTER FO FON', 0x0E9D),
+ ('LAO LETTER FO FAY', 0x0E9F),
+ ('LAO LETTER RO', 0x0EA3),
+ ('LAO LETTER LO', 0x0EA5),
+ ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
+ ('YI SYLLABLE ITERATION MARK', 0xA015),
+ ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET',
0xFE18),
+ ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
+ ]
+ for alias, codepoint in aliases:
+ self.checkletter(alias, chr(codepoint))
+ name = unicodedata.name(chr(codepoint))
+ self.assertNotEqual(name, alias)
+ self.assertEqual(unicodedata.lookup(alias),
+ unicodedata.lookup(name))
+
+ def test_named_sequences_sample(self):
+ # Check a few named sequences. See #12753.
+ sequences = [
+ ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
+ ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
+ ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
+ ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
+ ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
+ ]
+ for seqname, codepoints in sequences:
+ self.assertEqual(unicodedata.lookup(seqname), codepoints)
+ with self.assertRaises(SyntaxError):
+ self.checkletter(seqname, None)
+
+ def test_named_sequences_full(self):
+ # Check all the named sequences
+ url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
+ unicodedata.unidata_version)
+ try:
+ testdata = support.open_urlresource(url, encoding="utf-8",
+ check=check_version)
+ except (IOError, HTTPException):
+ self.skipTest("Could not retrieve " + url)
+ self.addCleanup(testdata.close)
+ for line in testdata:
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ seqname, codepoints = line.split(';')
+ codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
+ self.assertEqual(unicodedata.lookup(seqname), codepoints)
+ with self.assertRaises(SyntaxError):
+ self.checkletter(seqname, None)
+
def test_errors(self):
- import unicodedata
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, 'xx')
self.assertRaises(TypeError, unicodedata.lookup)
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1054,7 +1054,7 @@
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
- unsigned int h, v;
+ unsigned int h, v, k;
unsigned int mask = code_size-1;
unsigned int i, incr;
@@ -1100,6 +1100,17 @@
return 1;
}
+ /* check for aliases defined in NameAliases.txt */
+ for (k=0; k<aliases_count; k++) {
+ /* name might not be nul-terminated, so it's necessary to check
+ that the len of the two names is the same before comparing them */
+ if ((name_aliases[k].namelen == namelen) &&
+ (strncmp(name, name_aliases[k].name, namelen) == 0)) {
+ *code = name_aliases[k].codepoint;
+ return 1;
+ }
+ }
+
/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */
@@ -1176,6 +1187,26 @@
return PyUnicode_FromString(name);
}
+/* Look up `name` in the named_sequences table with a binary search.
+   The table is emitted sorted by name by makeunicodedata.py, and `name`
+   is compared with strcmp(), so it must be nul-terminated.
+   Return a new str object holding the sequence if `name` is found.
+   Return NULL if it is not found; this function sets no exception in
+   that case, the caller is expected to raise KeyError. */
+static PyObject *
+_lookup_named_sequences(char* name) {
+ int low, mid, high, cmp;
+ low = 0;
+ /* high is an inclusive bound, so it must be the last valid index;
+ starting at named_sequences_count would make mid reach one past the
+ end of the table for names that sort after the last entry */
+ high = named_sequences_count - 1;
+ while (low <= high) {
+ /* written this way to avoid overflow of low+high */
+ mid = low + (high - low) / 2;
+ cmp = strcmp(name, named_sequences[mid].name);
+ if (cmp < 0)
+ high = mid - 1;
+ else if (cmp > 0)
+ low = mid + 1;
+ else
+ return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
+ named_sequences[mid].seq,
+ named_sequences[mid].seqlen);
+ }
+ return NULL;
+}
+
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
@@ -1187,6 +1218,7 @@
unicodedata_lookup(PyObject* self, PyObject* args)
{
Py_UCS4 code;
+ PyObject *codes; /* for named sequences */
char* name;
int namelen;
@@ -1194,9 +1226,13 @@
return NULL;
if (!_getcode(self, name, namelen, &code)) {
- PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
- name);
- return NULL;
+ /* if the normal lookup fails try with named sequences */
+ codes = _lookup_named_sequences(name);
+ if (codes == NULL) {
+ PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
name);
+ return NULL;
+ }
+ return codes;
}
return PyUnicode_FromOrdinal(code);
diff --git a/Modules/unicodename_db.h b/Modules/unicodename_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
@@ -18811,3 +18811,452 @@
#define code_magic 47
#define code_size 32768
#define code_poly 32771
+
+typedef struct Alias {
+ char *name;
+ int namelen;
+ int codepoint;
+} alias;
+
+static const int aliases_count = 11;
+static const alias name_aliases[] = {
+ {"LATIN CAPITAL LETTER GHA", 24, 0x01A2},
+ {"LATIN SMALL LETTER GHA", 22, 0x01A3},
+ {"KANNADA LETTER LLLA", 19, 0x0CDE},
+ {"LAO LETTER FO FON", 17, 0x0E9D},
+ {"LAO LETTER FO FAY", 17, 0x0E9F},
+ {"LAO LETTER RO", 13, 0x0EA3},
+ {"LAO LETTER LO", 13, 0x0EA5},
+ {"TIBETAN MARK BKA- SHOG GI MGO RGYAN", 35, 0x0FD0},
+ {"YI SYLLABLE ITERATION MARK", 26, 0xA015},
+ {"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 61,
0xFE18},
+ {"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 52, 0x1D0C5},
+};
+
+typedef struct NamedSequence {
+ char *name;
+ int seqlen;
+ Py_UCS2 seq[4];
+} named_sequence;
+
+static const int named_sequences_count = 418;
+static const named_sequence named_sequences[] = {
+ {"BENGALI LETTER KHINYA", 3, {0x0995, 0x09CD, 0x09B7}},
+ {"GEORGIAN LETTER U-BRJGU", 2, {0x10E3, 0x0302}},
+ {"HIRAGANA LETTER BIDAKUON NGA", 2, {0x304B, 0x309A}},
+ {"HIRAGANA LETTER BIDAKUON NGE", 2, {0x3051, 0x309A}},
+ {"HIRAGANA LETTER BIDAKUON NGI", 2, {0x304D, 0x309A}},
+ {"HIRAGANA LETTER BIDAKUON NGO", 2, {0x3053, 0x309A}},
+ {"HIRAGANA LETTER BIDAKUON NGU", 2, {0x304F, 0x309A}},
+ {"KATAKANA LETTER AINU CE", 2, {0x30BB, 0x309A}},
+ {"KATAKANA LETTER AINU P", 2, {0x31F7, 0x309A}},
+ {"KATAKANA LETTER AINU TO", 2, {0x30C8, 0x309A}},
+ {"KATAKANA LETTER AINU TU", 2, {0x30C4, 0x309A}},
+ {"KATAKANA LETTER BIDAKUON NGA", 2, {0x30AB, 0x309A}},
+ {"KATAKANA LETTER BIDAKUON NGE", 2, {0x30B1, 0x309A}},
+ {"KATAKANA LETTER BIDAKUON NGI", 2, {0x30AD, 0x309A}},
+ {"KATAKANA LETTER BIDAKUON NGO", 2, {0x30B3, 0x309A}},
+ {"KATAKANA LETTER BIDAKUON NGU", 2, {0x30AF, 0x309A}},
+ {"KHMER CONSONANT SIGN COENG BA", 2, {0x17D2, 0x1794}},
+ {"KHMER CONSONANT SIGN COENG CA", 2, {0x17D2, 0x1785}},
+ {"KHMER CONSONANT SIGN COENG CHA", 2, {0x17D2, 0x1786}},
+ {"KHMER CONSONANT SIGN COENG CHO", 2, {0x17D2, 0x1788}},
+ {"KHMER CONSONANT SIGN COENG CO", 2, {0x17D2, 0x1787}},
+ {"KHMER CONSONANT SIGN COENG DA", 2, {0x17D2, 0x178A}},
+ {"KHMER CONSONANT SIGN COENG DO", 2, {0x17D2, 0x178C}},
+ {"KHMER CONSONANT SIGN COENG HA", 2, {0x17D2, 0x17A0}},
+ {"KHMER CONSONANT SIGN COENG KA", 2, {0x17D2, 0x1780}},
+ {"KHMER CONSONANT SIGN COENG KHA", 2, {0x17D2, 0x1781}},
+ {"KHMER CONSONANT SIGN COENG KHO", 2, {0x17D2, 0x1783}},
+ {"KHMER CONSONANT SIGN COENG KO", 2, {0x17D2, 0x1782}},
+ {"KHMER CONSONANT SIGN COENG LA", 2, {0x17D2, 0x17A1}},
+ {"KHMER CONSONANT SIGN COENG LO", 2, {0x17D2, 0x179B}},
+ {"KHMER CONSONANT SIGN COENG MO", 2, {0x17D2, 0x1798}},
+ {"KHMER CONSONANT SIGN COENG NA", 2, {0x17D2, 0x178E}},
+ {"KHMER CONSONANT SIGN COENG NGO", 2, {0x17D2, 0x1784}},
+ {"KHMER CONSONANT SIGN COENG NO", 2, {0x17D2, 0x1793}},
+ {"KHMER CONSONANT SIGN COENG NYO", 2, {0x17D2, 0x1789}},
+ {"KHMER CONSONANT SIGN COENG PHA", 2, {0x17D2, 0x1795}},
+ {"KHMER CONSONANT SIGN COENG PHO", 2, {0x17D2, 0x1797}},
+ {"KHMER CONSONANT SIGN COENG PO", 2, {0x17D2, 0x1796}},
+ {"KHMER CONSONANT SIGN COENG RO", 2, {0x17D2, 0x179A}},
+ {"KHMER CONSONANT SIGN COENG SA", 2, {0x17D2, 0x179F}},
+ {"KHMER CONSONANT SIGN COENG SHA", 2, {0x17D2, 0x179D}},
+ {"KHMER CONSONANT SIGN COENG SSA", 2, {0x17D2, 0x179E}},
+ {"KHMER CONSONANT SIGN COENG TA", 2, {0x17D2, 0x178F}},
+ {"KHMER CONSONANT SIGN COENG THA", 2, {0x17D2, 0x1790}},
+ {"KHMER CONSONANT SIGN COENG THO", 2, {0x17D2, 0x1792}},
+ {"KHMER CONSONANT SIGN COENG TO", 2, {0x17D2, 0x1791}},
+ {"KHMER CONSONANT SIGN COENG TTHA", 2, {0x17D2, 0x178B}},
+ {"KHMER CONSONANT SIGN COENG TTHO", 2, {0x17D2, 0x178D}},
+ {"KHMER CONSONANT SIGN COENG VO", 2, {0x17D2, 0x179C}},
+ {"KHMER CONSONANT SIGN COENG YO", 2, {0x17D2, 0x1799}},
+ {"KHMER INDEPENDENT VOWEL SIGN COENG QE", 2, {0x17D2, 0x17AF}},
+ {"KHMER INDEPENDENT VOWEL SIGN COENG QU", 2, {0x17D2, 0x17A7}},
+ {"KHMER INDEPENDENT VOWEL SIGN COENG RY", 2, {0x17D2, 0x17AB}},
+ {"KHMER INDEPENDENT VOWEL SIGN COENG RYY", 2, {0x17D2, 0x17AC}},
+ {"KHMER VOWEL SIGN AAM", 2, {0x17B6, 0x17C6}},
+ {"KHMER VOWEL SIGN COENG QA", 2, {0x17D2, 0x17A2}},
+ {"KHMER VOWEL SIGN OM", 2, {0x17BB, 0x17C6}},
+ {"LATIN CAPITAL LETTER A WITH MACRON AND GRAVE", 2, {0x0100, 0x0300}},
+ {"LATIN CAPITAL LETTER A WITH OGONEK AND ACUTE", 2, {0x0104, 0x0301}},
+ {"LATIN CAPITAL LETTER A WITH OGONEK AND TILDE", 2, {0x0104, 0x0303}},
+ {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00CA, 0x030C}},
+ {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00CA, 0x0304}},
+ {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0116, 0x0301}},
+ {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0116, 0x0303}},
+ {"LATIN CAPITAL LETTER E WITH OGONEK AND ACUTE", 2, {0x0118, 0x0301}},
+ {"LATIN CAPITAL LETTER E WITH OGONEK AND TILDE", 2, {0x0118, 0x0303}},
+ {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0045, 0x0329}},
+ {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00C9,
0x0329}},
+ {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00C8,
0x0329}},
+ {"LATIN CAPITAL LETTER I WITH MACRON AND GRAVE", 2, {0x012A, 0x0300}},
+ {"LATIN CAPITAL LETTER I WITH OGONEK AND ACUTE", 2, {0x012E, 0x0301}},
+ {"LATIN CAPITAL LETTER I WITH OGONEK AND TILDE", 2, {0x012E, 0x0303}},
+ {"LATIN CAPITAL LETTER J WITH TILDE", 2, {0x004A, 0x0303}},
+ {"LATIN CAPITAL LETTER L WITH TILDE", 2, {0x004C, 0x0303}},
+ {"LATIN CAPITAL LETTER M WITH TILDE", 2, {0x004D, 0x0303}},
+ {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW", 2, {0x004F, 0x0329}},
+ {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00D3,
0x0329}},
+ {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00D2,
0x0329}},
+ {"LATIN CAPITAL LETTER R WITH TILDE", 2, {0x0052, 0x0303}},
+ {"LATIN CAPITAL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0053, 0x0329}},
+ {"LATIN CAPITAL LETTER U WITH MACRON AND ACUTE", 2, {0x016A, 0x0301}},
+ {"LATIN CAPITAL LETTER U WITH MACRON AND GRAVE", 2, {0x016A, 0x0300}},
+ {"LATIN CAPITAL LETTER U WITH MACRON AND TILDE", 2, {0x016A, 0x0303}},
+ {"LATIN CAPITAL LETTER U WITH OGONEK AND ACUTE", 2, {0x0172, 0x0301}},
+ {"LATIN CAPITAL LETTER U WITH OGONEK AND TILDE", 2, {0x0172, 0x0303}},
+ {"LATIN SMALL LETTER A WITH MACRON AND GRAVE", 2, {0x0101, 0x0300}},
+ {"LATIN SMALL LETTER A WITH OGONEK AND ACUTE", 2, {0x0105, 0x0301}},
+ {"LATIN SMALL LETTER A WITH OGONEK AND TILDE", 2, {0x0105, 0x0303}},
+ {"LATIN SMALL LETTER AE WITH GRAVE", 2, {0x00E6, 0x0300}},
+ {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00EA, 0x030C}},
+ {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00EA, 0x0304}},
+ {"LATIN SMALL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0117, 0x0301}},
+ {"LATIN SMALL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0117, 0x0303}},
+ {"LATIN SMALL LETTER E WITH OGONEK AND ACUTE", 2, {0x0119, 0x0301}},
+ {"LATIN SMALL LETTER E WITH OGONEK AND TILDE", 2, {0x0119, 0x0303}},
+ {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0065, 0x0329}},
+ {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00E9,
0x0329}},
+ {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00E8,
0x0329}},
+ {"LATIN SMALL LETTER HOOKED SCHWA WITH ACUTE", 2, {0x025A, 0x0301}},
+ {"LATIN SMALL LETTER HOOKED SCHWA WITH GRAVE", 2, {0x025A, 0x0300}},
+ {"LATIN SMALL LETTER I WITH DOT ABOVE AND ACUTE", 3, {0x0069, 0x0307,
0x0301}},
+ {"LATIN SMALL LETTER I WITH DOT ABOVE AND GRAVE", 3, {0x0069, 0x0307,
0x0300}},
+ {"LATIN SMALL LETTER I WITH DOT ABOVE AND TILDE", 3, {0x0069, 0x0307,
0x0303}},
+ {"LATIN SMALL LETTER I WITH MACRON AND GRAVE", 2, {0x012B, 0x0300}},
+ {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND ACUTE", 3, {0x012F,
0x0307, 0x0301}},
+ {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND TILDE", 3, {0x012F,
0x0307, 0x0303}},
+ {"LATIN SMALL LETTER J WITH DOT ABOVE AND TILDE", 3, {0x006A, 0x0307,
0x0303}},
+ {"LATIN SMALL LETTER L WITH TILDE", 2, {0x006C, 0x0303}},
+ {"LATIN SMALL LETTER M WITH TILDE", 2, {0x006D, 0x0303}},
+ {"LATIN SMALL LETTER NG WITH TILDE ABOVE", 3, {0x006E, 0x0360, 0x0067}},
+ {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW", 2, {0x006F, 0x0329}},
+ {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00F3,
0x0329}},
+ {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00F2,
0x0329}},
+ {"LATIN SMALL LETTER OPEN O WITH ACUTE", 2, {0x0254, 0x0301}},
+ {"LATIN SMALL LETTER OPEN O WITH GRAVE", 2, {0x0254, 0x0300}},
+ {"LATIN SMALL LETTER R WITH TILDE", 2, {0x0072, 0x0303}},
+ {"LATIN SMALL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0073, 0x0329}},
+ {"LATIN SMALL LETTER SCHWA WITH ACUTE", 2, {0x0259, 0x0301}},
+ {"LATIN SMALL LETTER SCHWA WITH GRAVE", 2, {0x0259, 0x0300}},
+ {"LATIN SMALL LETTER TURNED V WITH ACUTE", 2, {0x028C, 0x0301}},
+ {"LATIN SMALL LETTER TURNED V WITH GRAVE", 2, {0x028C, 0x0300}},
+ {"LATIN SMALL LETTER U WITH MACRON AND ACUTE", 2, {0x016B, 0x0301}},
+ {"LATIN SMALL LETTER U WITH MACRON AND GRAVE", 2, {0x016B, 0x0300}},
+ {"LATIN SMALL LETTER U WITH MACRON AND TILDE", 2, {0x016B, 0x0303}},
+ {"LATIN SMALL LETTER U WITH OGONEK AND ACUTE", 2, {0x0173, 0x0301}},
+ {"LATIN SMALL LETTER U WITH OGONEK AND TILDE", 2, {0x0173, 0x0303}},
+ {"MODIFIER LETTER EXTRA-HIGH EXTRA-LOW CONTOUR TONE BAR", 2, {0x02E5,
0x02E9}},
+ {"MODIFIER LETTER EXTRA-LOW EXTRA-HIGH CONTOUR TONE BAR", 2, {0x02E9,
0x02E5}},
+ {"TAMIL CONSONANT C", 2, {0x0B9A, 0x0BCD}},
+ {"TAMIL CONSONANT H", 2, {0x0BB9, 0x0BCD}},
+ {"TAMIL CONSONANT J", 2, {0x0B9C, 0x0BCD}},
+ {"TAMIL CONSONANT K", 2, {0x0B95, 0x0BCD}},
+ {"TAMIL CONSONANT KSS", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCD}},
+ {"TAMIL CONSONANT L", 2, {0x0BB2, 0x0BCD}},
+ {"TAMIL CONSONANT LL", 2, {0x0BB3, 0x0BCD}},
+ {"TAMIL CONSONANT LLL", 2, {0x0BB4, 0x0BCD}},
+ {"TAMIL CONSONANT M", 2, {0x0BAE, 0x0BCD}},
+ {"TAMIL CONSONANT N", 2, {0x0BA8, 0x0BCD}},
+ {"TAMIL CONSONANT NG", 2, {0x0B99, 0x0BCD}},
+ {"TAMIL CONSONANT NN", 2, {0x0BA3, 0x0BCD}},
+ {"TAMIL CONSONANT NNN", 2, {0x0BA9, 0x0BCD}},
+ {"TAMIL CONSONANT NY", 2, {0x0B9E, 0x0BCD}},
+ {"TAMIL CONSONANT P", 2, {0x0BAA, 0x0BCD}},
+ {"TAMIL CONSONANT R", 2, {0x0BB0, 0x0BCD}},
+ {"TAMIL CONSONANT RR", 2, {0x0BB1, 0x0BCD}},
+ {"TAMIL CONSONANT S", 2, {0x0BB8, 0x0BCD}},
+ {"TAMIL CONSONANT SH", 2, {0x0BB6, 0x0BCD}},
+ {"TAMIL CONSONANT SS", 2, {0x0BB7, 0x0BCD}},
+ {"TAMIL CONSONANT T", 2, {0x0BA4, 0x0BCD}},
+ {"TAMIL CONSONANT TT", 2, {0x0B9F, 0x0BCD}},
+ {"TAMIL CONSONANT V", 2, {0x0BB5, 0x0BCD}},
+ {"TAMIL CONSONANT Y", 2, {0x0BAF, 0x0BCD}},
+ {"TAMIL SYLLABLE CAA", 2, {0x0B9A, 0x0BBE}},
+ {"TAMIL SYLLABLE CAI", 2, {0x0B9A, 0x0BC8}},
+ {"TAMIL SYLLABLE CAU", 2, {0x0B9A, 0x0BCC}},
+ {"TAMIL SYLLABLE CE", 2, {0x0B9A, 0x0BC6}},
+ {"TAMIL SYLLABLE CEE", 2, {0x0B9A, 0x0BC7}},
+ {"TAMIL SYLLABLE CI", 2, {0x0B9A, 0x0BBF}},
+ {"TAMIL SYLLABLE CII", 2, {0x0B9A, 0x0BC0}},
+ {"TAMIL SYLLABLE CO", 2, {0x0B9A, 0x0BCA}},
+ {"TAMIL SYLLABLE COO", 2, {0x0B9A, 0x0BCB}},
+ {"TAMIL SYLLABLE CU", 2, {0x0B9A, 0x0BC1}},
+ {"TAMIL SYLLABLE CUU", 2, {0x0B9A, 0x0BC2}},
+ {"TAMIL SYLLABLE HAA", 2, {0x0BB9, 0x0BBE}},
+ {"TAMIL SYLLABLE HAI", 2, {0x0BB9, 0x0BC8}},
+ {"TAMIL SYLLABLE HAU", 2, {0x0BB9, 0x0BCC}},
+ {"TAMIL SYLLABLE HE", 2, {0x0BB9, 0x0BC6}},
+ {"TAMIL SYLLABLE HEE", 2, {0x0BB9, 0x0BC7}},
+ {"TAMIL SYLLABLE HI", 2, {0x0BB9, 0x0BBF}},
+ {"TAMIL SYLLABLE HII", 2, {0x0BB9, 0x0BC0}},
+ {"TAMIL SYLLABLE HO", 2, {0x0BB9, 0x0BCA}},
+ {"TAMIL SYLLABLE HOO", 2, {0x0BB9, 0x0BCB}},
+ {"TAMIL SYLLABLE HU", 2, {0x0BB9, 0x0BC1}},
+ {"TAMIL SYLLABLE HUU", 2, {0x0BB9, 0x0BC2}},
+ {"TAMIL SYLLABLE JAA", 2, {0x0B9C, 0x0BBE}},
+ {"TAMIL SYLLABLE JAI", 2, {0x0B9C, 0x0BC8}},
+ {"TAMIL SYLLABLE JAU", 2, {0x0B9C, 0x0BCC}},
+ {"TAMIL SYLLABLE JE", 2, {0x0B9C, 0x0BC6}},
+ {"TAMIL SYLLABLE JEE", 2, {0x0B9C, 0x0BC7}},
+ {"TAMIL SYLLABLE JI", 2, {0x0B9C, 0x0BBF}},
+ {"TAMIL SYLLABLE JII", 2, {0x0B9C, 0x0BC0}},
+ {"TAMIL SYLLABLE JO", 2, {0x0B9C, 0x0BCA}},
+ {"TAMIL SYLLABLE JOO", 2, {0x0B9C, 0x0BCB}},
+ {"TAMIL SYLLABLE JU", 2, {0x0B9C, 0x0BC1}},
+ {"TAMIL SYLLABLE JUU", 2, {0x0B9C, 0x0BC2}},
+ {"TAMIL SYLLABLE KAA", 2, {0x0B95, 0x0BBE}},
+ {"TAMIL SYLLABLE KAI", 2, {0x0B95, 0x0BC8}},
+ {"TAMIL SYLLABLE KAU", 2, {0x0B95, 0x0BCC}},
+ {"TAMIL SYLLABLE KE", 2, {0x0B95, 0x0BC6}},
+ {"TAMIL SYLLABLE KEE", 2, {0x0B95, 0x0BC7}},
+ {"TAMIL SYLLABLE KI", 2, {0x0B95, 0x0BBF}},
+ {"TAMIL SYLLABLE KII", 2, {0x0B95, 0x0BC0}},
+ {"TAMIL SYLLABLE KO", 2, {0x0B95, 0x0BCA}},
+ {"TAMIL SYLLABLE KOO", 2, {0x0B95, 0x0BCB}},
+ {"TAMIL SYLLABLE KSSA", 3, {0x0B95, 0x0BCD, 0x0BB7}},
+ {"TAMIL SYLLABLE KSSAA", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBE}},
+ {"TAMIL SYLLABLE KSSAI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC8}},
+ {"TAMIL SYLLABLE KSSAU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCC}},
+ {"TAMIL SYLLABLE KSSE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC6}},
+ {"TAMIL SYLLABLE KSSEE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC7}},
+ {"TAMIL SYLLABLE KSSI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBF}},
+ {"TAMIL SYLLABLE KSSII", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC0}},
+ {"TAMIL SYLLABLE KSSO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCA}},
+ {"TAMIL SYLLABLE KSSOO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCB}},
+ {"TAMIL SYLLABLE KSSU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC1}},
+ {"TAMIL SYLLABLE KSSUU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC2}},
+ {"TAMIL SYLLABLE KU", 2, {0x0B95, 0x0BC1}},
+ {"TAMIL SYLLABLE KUU", 2, {0x0B95, 0x0BC2}},
+ {"TAMIL SYLLABLE LAA", 2, {0x0BB2, 0x0BBE}},
+ {"TAMIL SYLLABLE LAI", 2, {0x0BB2, 0x0BC8}},
+ {"TAMIL SYLLABLE LAU", 2, {0x0BB2, 0x0BCC}},
+ {"TAMIL SYLLABLE LE", 2, {0x0BB2, 0x0BC6}},
+ {"TAMIL SYLLABLE LEE", 2, {0x0BB2, 0x0BC7}},
+ {"TAMIL SYLLABLE LI", 2, {0x0BB2, 0x0BBF}},
+ {"TAMIL SYLLABLE LII", 2, {0x0BB2, 0x0BC0}},
+ {"TAMIL SYLLABLE LLAA", 2, {0x0BB3, 0x0BBE}},
+ {"TAMIL SYLLABLE LLAI", 2, {0x0BB3, 0x0BC8}},
+ {"TAMIL SYLLABLE LLAU", 2, {0x0BB3, 0x0BCC}},
+ {"TAMIL SYLLABLE LLE", 2, {0x0BB3, 0x0BC6}},
+ {"TAMIL SYLLABLE LLEE", 2, {0x0BB3, 0x0BC7}},
+ {"TAMIL SYLLABLE LLI", 2, {0x0BB3, 0x0BBF}},
+ {"TAMIL SYLLABLE LLII", 2, {0x0BB3, 0x0BC0}},
+ {"TAMIL SYLLABLE LLLAA", 2, {0x0BB4, 0x0BBE}},
+ {"TAMIL SYLLABLE LLLAI", 2, {0x0BB4, 0x0BC8}},
+ {"TAMIL SYLLABLE LLLAU", 2, {0x0BB4, 0x0BCC}},
+ {"TAMIL SYLLABLE LLLE", 2, {0x0BB4, 0x0BC6}},
+ {"TAMIL SYLLABLE LLLEE", 2, {0x0BB4, 0x0BC7}},
+ {"TAMIL SYLLABLE LLLI", 2, {0x0BB4, 0x0BBF}},
+ {"TAMIL SYLLABLE LLLII", 2, {0x0BB4, 0x0BC0}},
+ {"TAMIL SYLLABLE LLLO", 2, {0x0BB4, 0x0BCA}},
+ {"TAMIL SYLLABLE LLLOO", 2, {0x0BB4, 0x0BCB}},
+ {"TAMIL SYLLABLE LLLU", 2, {0x0BB4, 0x0BC1}},
+ {"TAMIL SYLLABLE LLLUU", 2, {0x0BB4, 0x0BC2}},
+ {"TAMIL SYLLABLE LLO", 2, {0x0BB3, 0x0BCA}},
+ {"TAMIL SYLLABLE LLOO", 2, {0x0BB3, 0x0BCB}},
+ {"TAMIL SYLLABLE LLU", 2, {0x0BB3, 0x0BC1}},
+ {"TAMIL SYLLABLE LLUU", 2, {0x0BB3, 0x0BC2}},
+ {"TAMIL SYLLABLE LO", 2, {0x0BB2, 0x0BCA}},
+ {"TAMIL SYLLABLE LOO", 2, {0x0BB2, 0x0BCB}},
+ {"TAMIL SYLLABLE LU", 2, {0x0BB2, 0x0BC1}},
+ {"TAMIL SYLLABLE LUU", 2, {0x0BB2, 0x0BC2}},
+ {"TAMIL SYLLABLE MAA", 2, {0x0BAE, 0x0BBE}},
+ {"TAMIL SYLLABLE MAI", 2, {0x0BAE, 0x0BC8}},
+ {"TAMIL SYLLABLE MAU", 2, {0x0BAE, 0x0BCC}},
+ {"TAMIL SYLLABLE ME", 2, {0x0BAE, 0x0BC6}},
+ {"TAMIL SYLLABLE MEE", 2, {0x0BAE, 0x0BC7}},
+ {"TAMIL SYLLABLE MI", 2, {0x0BAE, 0x0BBF}},
+ {"TAMIL SYLLABLE MII", 2, {0x0BAE, 0x0BC0}},
+ {"TAMIL SYLLABLE MO", 2, {0x0BAE, 0x0BCA}},
+ {"TAMIL SYLLABLE MOO", 2, {0x0BAE, 0x0BCB}},
+ {"TAMIL SYLLABLE MU", 2, {0x0BAE, 0x0BC1}},
+ {"TAMIL SYLLABLE MUU", 2, {0x0BAE, 0x0BC2}},
+ {"TAMIL SYLLABLE NAA", 2, {0x0BA8, 0x0BBE}},
+ {"TAMIL SYLLABLE NAI", 2, {0x0BA8, 0x0BC8}},
+ {"TAMIL SYLLABLE NAU", 2, {0x0BA8, 0x0BCC}},
+ {"TAMIL SYLLABLE NE", 2, {0x0BA8, 0x0BC6}},
+ {"TAMIL SYLLABLE NEE", 2, {0x0BA8, 0x0BC7}},
+ {"TAMIL SYLLABLE NGAA", 2, {0x0B99, 0x0BBE}},
+ {"TAMIL SYLLABLE NGAI", 2, {0x0B99, 0x0BC8}},
+ {"TAMIL SYLLABLE NGAU", 2, {0x0B99, 0x0BCC}},
+ {"TAMIL SYLLABLE NGE", 2, {0x0B99, 0x0BC6}},
+ {"TAMIL SYLLABLE NGEE", 2, {0x0B99, 0x0BC7}},
+ {"TAMIL SYLLABLE NGI", 2, {0x0B99, 0x0BBF}},
+ {"TAMIL SYLLABLE NGII", 2, {0x0B99, 0x0BC0}},
+ {"TAMIL SYLLABLE NGO", 2, {0x0B99, 0x0BCA}},
+ {"TAMIL SYLLABLE NGOO", 2, {0x0B99, 0x0BCB}},
+ {"TAMIL SYLLABLE NGU", 2, {0x0B99, 0x0BC1}},
+ {"TAMIL SYLLABLE NGUU", 2, {0x0B99, 0x0BC2}},
+ {"TAMIL SYLLABLE NI", 2, {0x0BA8, 0x0BBF}},
+ {"TAMIL SYLLABLE NII", 2, {0x0BA8, 0x0BC0}},
+ {"TAMIL SYLLABLE NNAA", 2, {0x0BA3, 0x0BBE}},
+ {"TAMIL SYLLABLE NNAI", 2, {0x0BA3, 0x0BC8}},
+ {"TAMIL SYLLABLE NNAU", 2, {0x0BA3, 0x0BCC}},
+ {"TAMIL SYLLABLE NNE", 2, {0x0BA3, 0x0BC6}},
+ {"TAMIL SYLLABLE NNEE", 2, {0x0BA3, 0x0BC7}},
+ {"TAMIL SYLLABLE NNI", 2, {0x0BA3, 0x0BBF}},
+ {"TAMIL SYLLABLE NNII", 2, {0x0BA3, 0x0BC0}},
+ {"TAMIL SYLLABLE NNNAA", 2, {0x0BA9, 0x0BBE}},
+ {"TAMIL SYLLABLE NNNAI", 2, {0x0BA9, 0x0BC8}},
+ {"TAMIL SYLLABLE NNNAU", 2, {0x0BA9, 0x0BCC}},
+ {"TAMIL SYLLABLE NNNE", 2, {0x0BA9, 0x0BC6}},
+ {"TAMIL SYLLABLE NNNEE", 2, {0x0BA9, 0x0BC7}},
+ {"TAMIL SYLLABLE NNNI", 2, {0x0BA9, 0x0BBF}},
+ {"TAMIL SYLLABLE NNNII", 2, {0x0BA9, 0x0BC0}},
+ {"TAMIL SYLLABLE NNNO", 2, {0x0BA9, 0x0BCA}},
+ {"TAMIL SYLLABLE NNNOO", 2, {0x0BA9, 0x0BCB}},
+ {"TAMIL SYLLABLE NNNU", 2, {0x0BA9, 0x0BC1}},
+ {"TAMIL SYLLABLE NNNUU", 2, {0x0BA9, 0x0BC2}},
+ {"TAMIL SYLLABLE NNO", 2, {0x0BA3, 0x0BCA}},
+ {"TAMIL SYLLABLE NNOO", 2, {0x0BA3, 0x0BCB}},
+ {"TAMIL SYLLABLE NNU", 2, {0x0BA3, 0x0BC1}},
+ {"TAMIL SYLLABLE NNUU", 2, {0x0BA3, 0x0BC2}},
+ {"TAMIL SYLLABLE NO", 2, {0x0BA8, 0x0BCA}},
+ {"TAMIL SYLLABLE NOO", 2, {0x0BA8, 0x0BCB}},
+ {"TAMIL SYLLABLE NU", 2, {0x0BA8, 0x0BC1}},
+ {"TAMIL SYLLABLE NUU", 2, {0x0BA8, 0x0BC2}},
+ {"TAMIL SYLLABLE NYAA", 2, {0x0B9E, 0x0BBE}},
+ {"TAMIL SYLLABLE NYAI", 2, {0x0B9E, 0x0BC8}},
+ {"TAMIL SYLLABLE NYAU", 2, {0x0B9E, 0x0BCC}},
+ {"TAMIL SYLLABLE NYE", 2, {0x0B9E, 0x0BC6}},
+ {"TAMIL SYLLABLE NYEE", 2, {0x0B9E, 0x0BC7}},
+ {"TAMIL SYLLABLE NYI", 2, {0x0B9E, 0x0BBF}},
+ {"TAMIL SYLLABLE NYII", 2, {0x0B9E, 0x0BC0}},
+ {"TAMIL SYLLABLE NYO", 2, {0x0B9E, 0x0BCA}},
+ {"TAMIL SYLLABLE NYOO", 2, {0x0B9E, 0x0BCB}},
+ {"TAMIL SYLLABLE NYU", 2, {0x0B9E, 0x0BC1}},
+ {"TAMIL SYLLABLE NYUU", 2, {0x0B9E, 0x0BC2}},
+ {"TAMIL SYLLABLE PAA", 2, {0x0BAA, 0x0BBE}},
+ {"TAMIL SYLLABLE PAI", 2, {0x0BAA, 0x0BC8}},
+ {"TAMIL SYLLABLE PAU", 2, {0x0BAA, 0x0BCC}},
+ {"TAMIL SYLLABLE PE", 2, {0x0BAA, 0x0BC6}},
+ {"TAMIL SYLLABLE PEE", 2, {0x0BAA, 0x0BC7}},
+ {"TAMIL SYLLABLE PI", 2, {0x0BAA, 0x0BBF}},
+ {"TAMIL SYLLABLE PII", 2, {0x0BAA, 0x0BC0}},
+ {"TAMIL SYLLABLE PO", 2, {0x0BAA, 0x0BCA}},
+ {"TAMIL SYLLABLE POO", 2, {0x0BAA, 0x0BCB}},
+ {"TAMIL SYLLABLE PU", 2, {0x0BAA, 0x0BC1}},
+ {"TAMIL SYLLABLE PUU", 2, {0x0BAA, 0x0BC2}},
+ {"TAMIL SYLLABLE RAA", 2, {0x0BB0, 0x0BBE}},
+ {"TAMIL SYLLABLE RAI", 2, {0x0BB0, 0x0BC8}},
+ {"TAMIL SYLLABLE RAU", 2, {0x0BB0, 0x0BCC}},
+ {"TAMIL SYLLABLE RE", 2, {0x0BB0, 0x0BC6}},
+ {"TAMIL SYLLABLE REE", 2, {0x0BB0, 0x0BC7}},
+ {"TAMIL SYLLABLE RI", 2, {0x0BB0, 0x0BBF}},
+ {"TAMIL SYLLABLE RII", 2, {0x0BB0, 0x0BC0}},
+ {"TAMIL SYLLABLE RO", 2, {0x0BB0, 0x0BCA}},
+ {"TAMIL SYLLABLE ROO", 2, {0x0BB0, 0x0BCB}},
+ {"TAMIL SYLLABLE RRAA", 2, {0x0BB1, 0x0BBE}},
+ {"TAMIL SYLLABLE RRAI", 2, {0x0BB1, 0x0BC8}},
+ {"TAMIL SYLLABLE RRAU", 2, {0x0BB1, 0x0BCC}},
+ {"TAMIL SYLLABLE RRE", 2, {0x0BB1, 0x0BC6}},
+ {"TAMIL SYLLABLE RREE", 2, {0x0BB1, 0x0BC7}},
+ {"TAMIL SYLLABLE RRI", 2, {0x0BB1, 0x0BBF}},
+ {"TAMIL SYLLABLE RRII", 2, {0x0BB1, 0x0BC0}},
+ {"TAMIL SYLLABLE RRO", 2, {0x0BB1, 0x0BCA}},
+ {"TAMIL SYLLABLE RROO", 2, {0x0BB1, 0x0BCB}},
+ {"TAMIL SYLLABLE RRU", 2, {0x0BB1, 0x0BC1}},
+ {"TAMIL SYLLABLE RRUU", 2, {0x0BB1, 0x0BC2}},
+ {"TAMIL SYLLABLE RU", 2, {0x0BB0, 0x0BC1}},
+ {"TAMIL SYLLABLE RUU", 2, {0x0BB0, 0x0BC2}},
+ {"TAMIL SYLLABLE SAA", 2, {0x0BB8, 0x0BBE}},
+ {"TAMIL SYLLABLE SAI", 2, {0x0BB8, 0x0BC8}},
+ {"TAMIL SYLLABLE SAU", 2, {0x0BB8, 0x0BCC}},
+ {"TAMIL SYLLABLE SE", 2, {0x0BB8, 0x0BC6}},
+ {"TAMIL SYLLABLE SEE", 2, {0x0BB8, 0x0BC7}},
+ {"TAMIL SYLLABLE SHAA", 2, {0x0BB6, 0x0BBE}},
+ {"TAMIL SYLLABLE SHAI", 2, {0x0BB6, 0x0BC8}},
+ {"TAMIL SYLLABLE SHAU", 2, {0x0BB6, 0x0BCC}},
+ {"TAMIL SYLLABLE SHE", 2, {0x0BB6, 0x0BC6}},
+ {"TAMIL SYLLABLE SHEE", 2, {0x0BB6, 0x0BC7}},
+ {"TAMIL SYLLABLE SHI", 2, {0x0BB6, 0x0BBF}},
+ {"TAMIL SYLLABLE SHII", 2, {0x0BB6, 0x0BC0}},
+ {"TAMIL SYLLABLE SHO", 2, {0x0BB6, 0x0BCA}},
+ {"TAMIL SYLLABLE SHOO", 2, {0x0BB6, 0x0BCB}},
+ {"TAMIL SYLLABLE SHRII", 4, {0x0BB6, 0x0BCD, 0x0BB0, 0x0BC0}},
+ {"TAMIL SYLLABLE SHU", 2, {0x0BB6, 0x0BC1}},
+ {"TAMIL SYLLABLE SHUU", 2, {0x0BB6, 0x0BC2}},
+ {"TAMIL SYLLABLE SI", 2, {0x0BB8, 0x0BBF}},
+ {"TAMIL SYLLABLE SII", 2, {0x0BB8, 0x0BC0}},
+ {"TAMIL SYLLABLE SO", 2, {0x0BB8, 0x0BCA}},
+ {"TAMIL SYLLABLE SOO", 2, {0x0BB8, 0x0BCB}},
+ {"TAMIL SYLLABLE SSAA", 2, {0x0BB7, 0x0BBE}},
+ {"TAMIL SYLLABLE SSAI", 2, {0x0BB7, 0x0BC8}},
+ {"TAMIL SYLLABLE SSAU", 2, {0x0BB7, 0x0BCC}},
+ {"TAMIL SYLLABLE SSE", 2, {0x0BB7, 0x0BC6}},
+ {"TAMIL SYLLABLE SSEE", 2, {0x0BB7, 0x0BC7}},
+ {"TAMIL SYLLABLE SSI", 2, {0x0BB7, 0x0BBF}},
+ {"TAMIL SYLLABLE SSII", 2, {0x0BB7, 0x0BC0}},
+ {"TAMIL SYLLABLE SSO", 2, {0x0BB7, 0x0BCA}},
+ {"TAMIL SYLLABLE SSOO", 2, {0x0BB7, 0x0BCB}},
+ {"TAMIL SYLLABLE SSU", 2, {0x0BB7, 0x0BC1}},
+ {"TAMIL SYLLABLE SSUU", 2, {0x0BB7, 0x0BC2}},
+ {"TAMIL SYLLABLE SU", 2, {0x0BB8, 0x0BC1}},
+ {"TAMIL SYLLABLE SUU", 2, {0x0BB8, 0x0BC2}},
+ {"TAMIL SYLLABLE TAA", 2, {0x0BA4, 0x0BBE}},
+ {"TAMIL SYLLABLE TAI", 2, {0x0BA4, 0x0BC8}},
+ {"TAMIL SYLLABLE TAU", 2, {0x0BA4, 0x0BCC}},
+ {"TAMIL SYLLABLE TE", 2, {0x0BA4, 0x0BC6}},
+ {"TAMIL SYLLABLE TEE", 2, {0x0BA4, 0x0BC7}},
+ {"TAMIL SYLLABLE TI", 2, {0x0BA4, 0x0BBF}},
+ {"TAMIL SYLLABLE TII", 2, {0x0BA4, 0x0BC0}},
+ {"TAMIL SYLLABLE TO", 2, {0x0BA4, 0x0BCA}},
+ {"TAMIL SYLLABLE TOO", 2, {0x0BA4, 0x0BCB}},
+ {"TAMIL SYLLABLE TTAA", 2, {0x0B9F, 0x0BBE}},
+ {"TAMIL SYLLABLE TTAI", 2, {0x0B9F, 0x0BC8}},
+ {"TAMIL SYLLABLE TTAU", 2, {0x0B9F, 0x0BCC}},
+ {"TAMIL SYLLABLE TTE", 2, {0x0B9F, 0x0BC6}},
+ {"TAMIL SYLLABLE TTEE", 2, {0x0B9F, 0x0BC7}},
+ {"TAMIL SYLLABLE TTI", 2, {0x0B9F, 0x0BBF}},
+ {"TAMIL SYLLABLE TTII", 2, {0x0B9F, 0x0BC0}},
+ {"TAMIL SYLLABLE TTO", 2, {0x0B9F, 0x0BCA}},
+ {"TAMIL SYLLABLE TTOO", 2, {0x0B9F, 0x0BCB}},
+ {"TAMIL SYLLABLE TTU", 2, {0x0B9F, 0x0BC1}},
+ {"TAMIL SYLLABLE TTUU", 2, {0x0B9F, 0x0BC2}},
+ {"TAMIL SYLLABLE TU", 2, {0x0BA4, 0x0BC1}},
+ {"TAMIL SYLLABLE TUU", 2, {0x0BA4, 0x0BC2}},
+ {"TAMIL SYLLABLE VAA", 2, {0x0BB5, 0x0BBE}},
+ {"TAMIL SYLLABLE VAI", 2, {0x0BB5, 0x0BC8}},
+ {"TAMIL SYLLABLE VAU", 2, {0x0BB5, 0x0BCC}},
+ {"TAMIL SYLLABLE VE", 2, {0x0BB5, 0x0BC6}},
+ {"TAMIL SYLLABLE VEE", 2, {0x0BB5, 0x0BC7}},
+ {"TAMIL SYLLABLE VI", 2, {0x0BB5, 0x0BBF}},
+ {"TAMIL SYLLABLE VII", 2, {0x0BB5, 0x0BC0}},
+ {"TAMIL SYLLABLE VO", 2, {0x0BB5, 0x0BCA}},
+ {"TAMIL SYLLABLE VOO", 2, {0x0BB5, 0x0BCB}},
+ {"TAMIL SYLLABLE VU", 2, {0x0BB5, 0x0BC1}},
+ {"TAMIL SYLLABLE VUU", 2, {0x0BB5, 0x0BC2}},
+ {"TAMIL SYLLABLE YAA", 2, {0x0BAF, 0x0BBE}},
+ {"TAMIL SYLLABLE YAI", 2, {0x0BAF, 0x0BC8}},
+ {"TAMIL SYLLABLE YAU", 2, {0x0BAF, 0x0BCC}},
+ {"TAMIL SYLLABLE YE", 2, {0x0BAF, 0x0BC6}},
+ {"TAMIL SYLLABLE YEE", 2, {0x0BAF, 0x0BC7}},
+ {"TAMIL SYLLABLE YI", 2, {0x0BAF, 0x0BBF}},
+ {"TAMIL SYLLABLE YII", 2, {0x0BAF, 0x0BC0}},
+ {"TAMIL SYLLABLE YO", 2, {0x0BAF, 0x0BCA}},
+ {"TAMIL SYLLABLE YOO", 2, {0x0BAF, 0x0BCB}},
+ {"TAMIL SYLLABLE YU", 2, {0x0BAF, 0x0BC1}},
+ {"TAMIL SYLLABLE YUU", 2, {0x0BAF, 0x0BC2}},
+};
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,7 +25,12 @@
# written by Fredrik Lundh (fred...@pythonware.com)
#
-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter
SCRIPT = sys.argv[0]
VERSION = "3.2"
@@ -39,6 +44,8 @@
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
old_versions = ["3.2.0"]
@@ -692,6 +699,40 @@
print("/* name->code dictionary */", file=fp)
codehash.dump(fp, trace)
+ print(dedent("""
+ typedef struct Alias {
+ char *name;
+ int namelen;
+ int codepoint;
+ } alias;
+ """), file=fp)
+
+ print('static const int aliases_count = %d;' % len(unicode.aliases),
file=fp)
+
+ print('static const alias name_aliases[] = {', file=fp)
+ for name, codepoint in unicode.aliases:
+ print(' {"%s", %d, 0x%04X},' % (name, len(name), codepoint),
file=fp)
+ print('};', file=fp)
+
+ # the Py_UCS2 seq[4] should use Py_UCS4 if non-BMP chars are added to the
+ # sequences, and should have a higher number of elements if the sequences
+ # get longer
+ print(dedent("""
+ typedef struct NamedSequence {
+ char *name;
+ int seqlen;
+ Py_UCS2 seq[4];
+ } named_sequence;
+ """), file=fp)
+
+ print('static const int named_sequences_count = %d;' %
len(unicode.named_sequences),
+ file=fp)
+
+ print('static const named_sequence named_sequences[] = {', file=fp)
+ for name, sequence in unicode.named_sequences:
+ seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+ print(' {"%s", %d, {%s}},' % (name, len(sequence), seq_str),
file=fp)
+ print('};', file=fp)
+
fp.close()
@@ -855,6 +896,31 @@
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
+ self.aliases = []
+ with open_data(NAME_ALIASES, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s or s.startswith('#'):
+ continue
+ char, name = s.split(';')
+ char = int(char, 16)
+ self.aliases.append((name, char))
+
+ self.named_sequences = []
+ with open_data(NAMED_SEQUENCES, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s or s.startswith('#'):
+ continue
+ name, chars = s.split(';')
+ chars = tuple(int(char, 16) for char in chars.split())
+ # check that the structure defined in makeunicodename is OK
+ assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+ assert all(c <= 0xFFFF for c in chars), "use Py_UCS4 instead"
+ self.named_sequences.append((name, chars))
+ # sort names to enable binary search
+ self.named_sequences.sort(key=itemgetter(0))
+
self.exclusions = {}
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
for s in file:
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com