Hi Sergei, Please review a patch for MDEV-8844.
Thanks!
diff --git a/include/m_ctype.h b/include/m_ctype.h index a552226..b059258 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -582,6 +582,7 @@ struct charset_info_st extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_bin; extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_latin1; extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_filename; +extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_errmsg; extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_utf8_general_ci; extern struct charset_info_st my_charset_big5_bin; diff --git a/mysql-test/r/ctype_latin1.result b/mysql-test/r/ctype_latin1.result index 4847592..fce1a07 100644 --- a/mysql-test/r/ctype_latin1.result +++ b/mysql-test/r/ctype_latin1.result @@ -8181,5 +8181,44 @@ Warnings: Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where ((coalesce(`test`.`t1`.`c`,0) = '3 ') and (coalesce(`test`.`t1`.`d`,0) = '3 ')) DROP TABLE t1; # +# MDEV-8844 Unreadable control characters printed as is in warnings +# +SET NAMES latin1; +SELECT CAST(_latin1 0x610062 AS INT); +CAST(_latin1 0x610062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0000b' +SELECT CAST(_latin1 0x610162 AS INT); +CAST(_latin1 0x610162 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0001b' +SELECT CAST(_latin1 0x611F62 AS INT); +CAST(_latin1 0x611F62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\001Fb' +SELECT CAST(_latin1 0x617F62 AS INT); +CAST(_latin1 0x617F62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\007Fb' +SELECT CAST(_latin1 0x612062 AS INT); +CAST(_latin1 0x612062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a b' +SELECT CAST(_latin1 0x617E62 AS INT); +CAST(_latin1 0x617E62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a~b' +SELECT CAST(_latin1 0x61FF62 
AS INT); +CAST(_latin1 0x61FF62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'aÿb' +# # End of 10.1 tests # diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result index 5617431..82e4784 100644 --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@ -5649,5 +5649,38 @@ CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED) Warnings: Warning 1292 Truncated incorrect INTEGER value: '1IJ3' # +# MDEV-8844 Unreadable control characters printed as is in warnings +# +SELECT CAST(_ucs2 0x006100000062 AS INT); +CAST(_ucs2 0x006100000062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0000b' +SELECT CAST(_ucs2 0x006100010062 AS INT); +CAST(_ucs2 0x006100010062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0001b' +SELECT CAST(_ucs2 0x0061D8000062 AS INT); +CAST(_ucs2 0x0061D8000062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\D800b' +SELECT CAST(_ucs2 0x0061DFFF0062 AS INT); +CAST(_ucs2 0x0061DFFF0062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\DFFFb' +SELECT CAST(_ucs2 0x0061D7000062 AS INT); +CAST(_ucs2 0x0061D7000062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'aíb' +SELECT CAST(_ucs2 0x0061E0030062 AS INT); +CAST(_ucs2 0x0061E0030062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'aîb' +# # End of 10.1 tests # diff --git a/mysql-test/r/ctype_utf16.result b/mysql-test/r/ctype_utf16.result index 3bd3725..303fa48 100644 --- a/mysql-test/r/ctype_utf16.result +++ b/mysql-test/r/ctype_utf16.result @@ -2199,5 +2199,14 @@ CAST(CONVERT('1IJ3' USING utf16) AS SIGNED) Warnings: Warning 1292 Truncated incorrect INTEGER value: '1IJ3' # +# MDEV-8844 Unreadable control characters printed as is in warnings +# +SET NAMES utf8; +SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT); +CAST(_utf16 0x0061D83DDE0E0062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 
'a?b' +# # End of 10.1 tests # diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 66db7df..ab340e0 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -10213,5 +10213,69 @@ Warnings: Note 1003 select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A') DROP TABLE t1; # +# MDEV-8844 Unreadable control characters printed as is in warnings +# +SET NAMES utf8; +SELECT CAST(_utf8 0x610062 AS INT); +CAST(_utf8 0x610062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0000b' +SELECT CAST(_utf8 0x610162 AS INT); +CAST(_utf8 0x610162 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0001b' +SELECT CAST(_utf8 0x611F62 AS INT); +CAST(_utf8 0x611F62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\001Fb' +SELECT CAST(_utf8 0x617F62 AS INT); +CAST(_utf8 0x617F62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\007Fb' +SELECT CAST(_utf8 0x61C28062 AS INT); +CAST(_utf8 0x61C28062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\0080b' +SELECT CAST(_utf8 0x61C29F62 AS INT); +CAST(_utf8 0x61C29F62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a\009Fb' +SELECT CAST(_utf8 0x612062 AS INT); +CAST(_utf8 0x612062 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a b' +SELECT CAST(_utf8 0x617E62 AS INT); +CAST(_utf8 0x617E62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a~b' +SELECT CAST(_utf8 0x61C2BF62 AS INT); +CAST(_utf8 0x61C2BF62 AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'a¿b' +SELECT CAST(_utf8 'ëëë' AS INT); +CAST(_utf8 'ëëë' AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'ëëë' +SELECT CAST(_utf8 'ÅÅÅ' AS INT); +CAST(_utf8 'ÅÅÅ' AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'ÅÅÅ' +SELECT CAST(_utf8 'ÑÑÑ' AS INT); +CAST(_utf8 
'ÑÑÑ' AS INT) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: 'ÑÑÑ' +# # End of 10.1 tests # diff --git a/mysql-test/t/ctype_latin1.test b/mysql-test/t/ctype_latin1.test index a30c7ae..7478ca6 100644 --- a/mysql-test/t/ctype_latin1.test +++ b/mysql-test/t/ctype_latin1.test @@ -374,5 +374,19 @@ SELECT * FROM t1 WHERE COALESCE(c,0)='3 ' AND COALESCE(d,0)=COALESCE(c,0); DROP TABLE t1; --echo # +--echo # MDEV-8844 Unreadable control characters printed as is in warnings +--echo # +SET NAMES latin1; +# control +SELECT CAST(_latin1 0x610062 AS INT); +SELECT CAST(_latin1 0x610162 AS INT); +SELECT CAST(_latin1 0x611F62 AS INT); +SELECT CAST(_latin1 0x617F62 AS INT); +# normal characters +SELECT CAST(_latin1 0x612062 AS INT); +SELECT CAST(_latin1 0x617E62 AS INT); +SELECT CAST(_latin1 0x61FF62 AS INT); + +--echo # --echo # End of 10.1 tests --echo # diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test index 2f48062..d6341fb 100644 --- a/mysql-test/t/ctype_ucs.test +++ b/mysql-test/t/ctype_ucs.test @@ -955,5 +955,18 @@ SET NAMES utf8; SELECT CAST(CONVERT('1IJ3' USING ucs2) AS SIGNED); --echo # +--echo # MDEV-8844 Unreadable control characters printed as is in warnings +--echo # +# control +SELECT CAST(_ucs2 0x006100000062 AS INT); +SELECT CAST(_ucs2 0x006100010062 AS INT); +# surrogate halfs +SELECT CAST(_ucs2 0x0061D8000062 AS INT); +SELECT CAST(_ucs2 0x0061DFFF0062 AS INT); +# normal characters +SELECT CAST(_ucs2 0x0061D7000062 AS INT); +SELECT CAST(_ucs2 0x0061E0030062 AS INT); + +--echo # --echo # End of 10.1 tests --echo # diff --git a/mysql-test/t/ctype_utf16.test b/mysql-test/t/ctype_utf16.test index bb7eb8c..9e15961 100644 --- a/mysql-test/t/ctype_utf16.test +++ b/mysql-test/t/ctype_utf16.test @@ -893,5 +893,14 @@ SELECT CAST(CONVERT('1IJ3' USING utf16) AS SIGNED); --echo # +--echo # MDEV-8844 Unreadable control characters printed as is in warnings +--echo # +SET NAMES utf8; +# Make sure surrogate halfs (when a part of a full utf16 
character) +# are not escaped and the entire utf16 character consisting of two +# surrogate halves is replaced with a single question mark. +SELECT CAST(_utf16 0x0061D83DDE0E0062 AS INT); + +--echo # --echo # End of 10.1 tests --echo # diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 639f6d4..ac7bc95 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1843,6 +1843,26 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE c>=_utf8'a' COLLATE utf8_general_ci AND c='A'; DROP TABLE t1; +--echo # +--echo # MDEV-8844 Unreadable control characters printed as is in warnings +--echo # +SET NAMES utf8; +# control, part1 +SELECT CAST(_utf8 0x610062 AS INT); +SELECT CAST(_utf8 0x610162 AS INT); +SELECT CAST(_utf8 0x611F62 AS INT); +# control, part2: U+007F..U+009F +SELECT CAST(_utf8 0x617F62 AS INT); +SELECT CAST(_utf8 0x61C28062 AS INT); +SELECT CAST(_utf8 0x61C29F62 AS INT); +# normal characters +SELECT CAST(_utf8 0x612062 AS INT); +SELECT CAST(_utf8 0x617E62 AS INT); +SELECT CAST(_utf8 0x61C2BF62 AS INT); +SELECT CAST(_utf8 'ëëë' AS INT); +SELECT CAST(_utf8 'ÅÅÅ' AS INT); +SELECT CAST(_utf8 'ÑÑÑ' AS INT); + --echo # --echo # End of 10.1 tests diff --git a/sql/sql_error.cc b/sql/sql_error.cc index b72d642..1ed3547 100644 --- a/sql/sql_error.cc +++ b/sql/sql_error.cc @@ -931,7 +931,7 @@ char *err_conv(char *buff, uint to_length, const char *from, else { uint errors; - res= copy_and_convert(to, to_length, system_charset_info, + res= copy_and_convert(to, to_length, &my_charset_errmsg, from, from_length, from_cs, &errors); to[res]= 0; } diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 3c2c812..b1a7427 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -7953,3 +7953,163 @@ struct charset_info_st my_charset_utf8mb4_bin= }; #endif /* HAVE_CHARSET_utf8mb4 */ + + +/** + A special version of utf8 for error handling.
+ + In error messages let's use the SQL standard Unicode escape sequence + notation to display non-printable characters, which is: + 1. \hhhh (for BMP) + 2. \+hhhhhh (for non-BMP) + + Note, non-BMP characters are replaced with QUESTION MARK. + Perhaps we need to change the DIAGNOSTICS related code to use utf8mb4. + + As non-BMP characters are not replaced with escape sequences for now, + we need only 5 bytes to display a non-printable character, e.g. "\007F". +*/ +#define MY_CS_ERROR_MB_MAXLEN 5 + + +/** + Detect if a Unicode code point is printable. Returns TRUE if wc can be output as is; FALSE for control characters (other than TAB, NL, CR) and for surrogate halves. +*/ +static inline my_bool +my_is_printable(my_wc_t wc) +{ + /* + Blocks: + U+0000 .. U+001F control + U+0020 .. U+007E printable + U+007F .. U+009F control + U+00A0 .. U+00FF printable + U+0100 .. U+10FFFF As of Unicode-6.1.0, this range does not have any + characters of the "Cc" (Other, control) category. + Should be mostly safe to print. + Except for the surrogate halves, + which are encoding components, not real characters. + */ + if (wc >= 0x20 && wc <= 0x7E) /* Quickly detect ASCII printable */ + return TRUE; + if (wc <= 0x9F) /* The rest of U+0000..U+009F are control characters */ + { + /* NL, CR, TAB are Ok */ + return (wc == '\r' || wc == '\n' || wc == '\t'); + } + /* + Surrogate halves (when alone) print badly in gnome-terminal: + SELECT _ucs2 0xD800; + Let's escape them as well. + */ + if (wc >= 0xD800 && wc <= 0xDFFF) + return FALSE; + return TRUE; /* Everything else above U+009F is treated as printable */ +} + + +/** + Non-printable code points are printed as \hhhh. + Printable code points are encoded as utf8.
+*/ +static int +my_wc_mb_errmsg(CHARSET_INFO *cs, my_wc_t wc, uchar *r, uchar *e) +{ + if (my_is_printable(wc)) + return my_uni_utf8(cs, wc, r, e); /* Printable: encode as regular utf8 */ + + if (r + MY_CS_ERROR_MB_MAXLEN > e) /* No room left for a 5-byte escape sequence */ + return MY_CS_TOOSMALLN(MY_CS_ERROR_MB_MAXLEN); + DBUG_ASSERT(wc < 0x10000); /* Non-printable code points are only controls (<= U+009F) and surrogate halves, so four hex digits suffice */ + *r++= '\\'; + *r++= _dig_vec_upper[(wc >> 12) & 0x0F]; /* Hex digits, most significant nibble first */ + *r++= _dig_vec_upper[(wc >> 8) & 0x0F]; + *r++= _dig_vec_upper[(wc >> 4) & 0x0F]; + *r++= _dig_vec_upper[wc & 0x0F]; + return 5; /* Bytes written: backslash plus four hex digits, equal to MY_CS_ERROR_MB_MAXLEN */ +} + + +/** + A version of utf8 handler for error messages. + It only has a special wc_mb() implementation, + and is equal to my_charset_utf8_handler otherwise. +*/ +static MY_CHARSET_HANDLER my_charset_errmsg_handler= +{ + NULL, /* init */ + my_ismbchar_utf8, + my_mbcharlen_utf8, + my_numchars_mb, + my_charpos_mb, + my_well_formed_len_utf8, + my_lengthsp_8bit, + my_numcells_mb, + my_utf8_uni, + my_wc_mb_errmsg, /* A special wc_mb() implementation */ + my_mb_ctype_mb, + my_caseup_str_utf8, + my_casedn_str_utf8, + my_caseup_utf8, + my_casedn_utf8, + my_snprintf_8bit, + my_long10_to_str_8bit, + my_longlong10_to_str_8bit, + my_fill_8bit, + my_strntol_8bit, + my_strntoul_8bit, + my_strntoll_8bit, + my_strntoull_8bit, + my_strntod_8bit, + my_strtoll10_8bit, + my_strntoull10rnd_8bit, + my_scan_8bit, + my_charlen_utf8, + my_well_formed_char_length_utf8, + my_copy_fix_mb, + my_wc_mb_errmsg, /* NOTE(review): presumably the native_to_mb slot - confirm against MY_CHARSET_HANDLER */ +}; + + +/** + A special version of utf8: + - uses my_charset_errmsg_handler + - defines mbmaxlen as MY_CS_ERROR_MB_MAXLEN + - has the MY_CS_NONASCII to avoid optimization in the conversion routines, + which would go around wc_mb(). + - has unique charset and collation names, for easier debugging purposes. + Otherwise, equal to my_charset_utf8. + Note, as we don't expose it to the SQL level, it's ok to have the same ID=33.
+*/ +struct charset_info_st my_charset_errmsg= +{ + 33,0,0, /* number: same ID as utf8, acceptable as this charset is not exposed to the SQL level */ + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE| + MY_CS_NONASCII, /* state: MY_CS_NONASCII keeps conversion from bypassing wc_mb() */ + "errmsg", /* cs name */ + "errmsg", /* name */ + "", /* comment */ + NULL, /* tailoring */ + ctype_utf8, /* ctype */ + to_lower_utf8, /* to_lower */ + to_upper_utf8, /* to_upper */ + to_upper_utf8, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + MY_CS_ERROR_MB_MAXLEN, /* mbmaxlen: room for one \hhhh escape sequence */ + 0, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + 1, /* levels_for_order */ + &my_charset_errmsg_handler, /* charset handler with the escaping wc_mb() */ + &my_collation_utf8_general_ci_handler /* collation handler */ +};
_______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : maria-developers@lists.launchpad.net Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp