On 4/16/09, Marko Kreen <mark...@gmail.com> wrote: > It's up to UTF8 validator whether to consider non-characters as error.
I checked, and it did not work well, as addunicode() did not set the saw_high_bit variable when outputting UTF8. Attached patch fixes it. Currently it would be a NOP, as pg_verifymbstr() only checks for invalid UTF8 and addunicode() cannot output it, but in the future we may want to reject some codes, so now it can. Btw, is there any good reason why we don't reject \000, \x00 in text strings? Currently I made addunicode() do it, because it seems sensible. -- marko
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index a559d75..fdb0cc5 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -394,6 +394,14 @@ SELECT 'foo' 'bar'; </entry> <entry>hexadecimal byte value</entry> </row> + <row> + <entry> + <literal>\u<replaceable>xxxx</replaceable></literal>, + <literal>\U<replaceable>xxxxxxxx</replaceable></literal> + (<replaceable>x</replaceable> = 0 - 9, A - F) + </entry> + <entry>16 or 32-bit hexadecimal Unicode character value.</entry> + </row> </tbody> </tgroup> </table> @@ -407,6 +415,14 @@ SELECT 'foo' 'bar'; </para> <para> + The Unicode escape syntax works fully only when the server encoding is UTF8. + When other server encodings are used, only code points in the ASCII range + (up to <literal>\u007F</>) can be specified. Both <literal>\u</> and <literal>\U</> + can also be used to specify UTF-16 surrogate pair to escape characters + with value larger than <literal>\uFFFF</>. + </para> + + <para> It is your responsibility that the byte sequences you create are valid characters in the server character set encoding. 
When the server encoding is UTF-8, then the alternative Unicode escape diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index a070e85..992cc9a 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -98,6 +98,11 @@ static char *scanbuf; static unsigned char unescape_single_char(unsigned char c); +/* first part of unicode surrogate */ +static unsigned long xeu_surrogate1; + +static void addunicode(pg_wchar c); + %} %option 8bit @@ -128,6 +133,7 @@ static unsigned char unescape_single_char(unsigned char c); * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate escape in extended string */ %x xb @@ -139,6 +145,7 @@ static unsigned char unescape_single_char(unsigned char c); %x xdolq %x xui %x xus +%x xeu /* * In order to make the world safe for Windows and Mac clients as well as @@ -217,6 +224,7 @@ xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) /* Extended quote * xqdouble implements embedded quote, '''' @@ -506,6 +514,37 @@ other . <xe>{xeinside} { addlit(yytext, yyleng); } +<xe>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + check_escape_warning(); + + if (c >= 0xD800 && c < 0xDC00) + { + xeu_surrogate1 = c; + BEGIN(xeu); + } + else if (c >= 0xDC00 && c < 0xE000) + yyerror("invalid Unicode escape value"); + + addunicode(c); + } +<xeu>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + if (c < 0xDC00 || c >= 0xE000) + yyerror("invalid Unicode surrogate pair"); + + c = ((xeu_surrogate1 & 0x3FF) << 10) | (c & 0x3FF); + + addunicode(c + 0x10000); + + BEGIN(xe); + } +<xeu>. 
{ + yyerror("invalid Unicode surrogate pair"); + } + <xe>{xeescape} { if (yytext[1] == '\'') { @@ -1153,3 +1192,21 @@ check_escape_warning(void) lexer_errposition())); warn_on_first_escape = false; /* warn only once per string */ } + +static void +addunicode(pg_wchar c) +{ + char buf[8]; + + if (c == 0 || c > 0x10FFFF) + yyerror("invalid Unicode escape value"); + if (c > 0x7F) + { + if (GetDatabaseEncoding() != PG_UTF8) + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + saw_high_bit = true; + } + unicode_to_utf8(c, (unsigned char *)buf); + addlit(buf, pg_mblen(buf)); +} +
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers