It seems I'm bad at communicating in English, so here is the C variant of my proposal to bring \u escaping into extended strings. Reasons:
- More people are familiar with \u escaping, as it's standard in Java/C#/Python, and probably in more languages.
- U& strings will not work when stdstr=off.

Syntax:

  \uXXXX     - 16-bit value
  \UXXXXXXXX - 32-bit value

Additionally, both \u and \U can be used to specify UTF-16 surrogate pairs to encode characters with value > 0xFFFF. This is the exact behaviour used by Java/C#/Python (except that Java does not have \U).

I'm OK with this patch being left for 8.5.

-- marko
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index a559d75..fdb0cc5 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -394,6 +394,14 @@ SELECT 'foo' 'bar'; </entry> <entry>hexadecimal byte value</entry> </row> + <row> + <entry> + <literal>\u<replaceable>xxxx</replaceable></literal>, + <literal>\U<replaceable>xxxxxxxx</replaceable></literal> + (<replaceable>x</replaceable> = 0 - 9, A - F) + </entry> + <entry>16 or 32-bit hexadecimal Unicode character value.</entry> + </row> </tbody> </tgroup> </table> @@ -407,6 +415,14 @@ SELECT 'foo' 'bar'; </para> <para> + The Unicode escape syntax works fully only when the server encoding is UTF8. + When other server encodings are used, only code points in the ASCII range + (up to <literal>\u007F</>) can be specified. Both <literal>\u</> and <literal>\U</> + can also be used to specify UTF-16 surrogate pair to escape characters + with value larger than <literal>\uFFFF</>. + </para> + + <para> It is your responsibility that the byte sequences you create are valid characters in the server character set encoding. 
When the server encoding is UTF-8, then the alternative Unicode escape diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index a070e85..c0695f1 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -98,6 +98,11 @@ static char *scanbuf; static unsigned char unescape_single_char(unsigned char c); +/* first part of unicode surrogate */ +static unsigned long xeu_surrogate1; + +static void addunicode(pg_wchar c); + %} %option 8bit @@ -128,6 +133,7 @@ static unsigned char unescape_single_char(unsigned char c); * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate escape in extended string */ %x xb @@ -139,6 +145,7 @@ static unsigned char unescape_single_char(unsigned char c); %x xdolq %x xui %x xus +%x xeu /* * In order to make the world safe for Windows and Mac clients as well as @@ -217,6 +224,7 @@ xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) /* Extended quote * xqdouble implements embedded quote, '''' @@ -506,6 +514,37 @@ other . <xe>{xeinside} { addlit(yytext, yyleng); } +<xe>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + check_escape_warning(); + + if (c >= 0xD800 && c < 0xDC00) + { + xeu_surrogate1 = c; + BEGIN(xeu); + } + else if (c >= 0xDC00 && c < 0xE000) + yyerror("invalid Unicode escape value"); + + addunicode(c); + } +<xeu>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + if (c < 0xDC00 || c >= 0xE000) + yyerror("invalid Unicode surrogate pair"); + + c = ((xeu_surrogate1 & 0x3FF) << 10) | (c & 0x3FF); + + addunicode(c + 0x10000); + + BEGIN(xe); + } +<xeu>. 
{ + yyerror("invalid Unicode surrogate pair"); + } + <xe>{xeescape} { if (yytext[1] == '\'') { @@ -1153,3 +1192,18 @@ check_escape_warning(void) lexer_errposition())); warn_on_first_escape = false; /* warn only once per string */ } + +static void +addunicode(pg_wchar c) +{ + char buf[8]; + + if (c == 0 || c > 0x10FFFF) + yyerror("invalid Unicode escape value"); + if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8) + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + + unicode_to_utf8(c, (unsigned char *)buf); + addlit(buf, pg_mblen(buf)); +} +
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers