Hello, Seeing that the limit is still in place, attached is a patch against CVS.
Kind Regards, John Hansen
Index: src/backend/utils/mb/wchar.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v retrieving revision 1.38 diff -c -r1.38 wchar.c *** src/backend/utils/mb/wchar.c 17 Sep 2004 21:59:57 -0000 1.38 --- src/backend/utils/mb/wchar.c 16 Nov 2004 04:06:01 -0000 *************** *** 343,348 **** --- 343,373 ---- return (pg_euc_dsplen(s)); } + bool isLegalUTF8(const UTF8 *source, int len) { + if(pg_utf_mblen(source) > len) return false; + UTF8 a; + const UTF8 *srcptr = source + pg_utf_mblen(source); + switch (pg_utf_mblen(source)) { + default: return false; + /* Everything else falls through when "true"... */ + case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + if (*source > 0xFD) return false; + } + return true; + } + /* * convert UTF-8 string to pg_wchar (UCS-2) * caller should allocate enough space for "to" *************** *** 350,404 **** * "from" not necessarily null terminated. */ static int ! pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) { ! unsigned char c1, ! c2, ! c3; ! int cnt = 0; ! ! while (len > 0 && *from) ! { ! if ((*from & 0x80) == 0) ! { ! *to = *from++; ! len--; ! } ! else if ((*from & 0xe0) == 0xc0 && len >= 2) ! { ! c1 = *from++ & 0x1f; ! c2 = *from++ & 0x3f; ! *to = c1 << 6; ! *to |= c2; ! len -= 2; ! } ! else if ((*from & 0xe0) == 0xe0 && len >= 3) ! { ! 
c1 = *from++ & 0x0f; ! c2 = *from++ & 0x3f; ! c3 = *from++ & 0x3f; ! *to = c1 << 12; ! *to |= c2 << 6; ! *to |= c3; ! len -= 3; ! } ! else ! { ! *to = *from++; ! len--; ! } ! to++; ! cnt++; ! } ! *to = 0; ! return (cnt); } /* * returns the byte length of a UTF-8 word pointed to by s */ int ! pg_utf_mblen(const unsigned char *s) { int len = 1; --- 375,437 ---- * "from" not necessarily null terminated. */ static int ! pg_utf2wchar_with_len(const UTF8 *from, pg_wchar *to, int len) { ! const UTF8* fromEnd = from + len; ! const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; ! unsigned int cnt = 0; ! while (from < fromEnd) { ! UTF32 ch = 0; ! unsigned int extraBytesToRead = pg_utf_mblen(from) - 1; ! if (from + extraBytesToRead >= fromEnd) { ! cnt = 0; break; ! } ! /* Do this check whether lenient or strict */ ! if (! isLegalUTF8(from, extraBytesToRead + 1)) { ! cnt = 0; ! break; ! } ! /* ! * The cases all fall through. See "Note A" below. ! */ ! switch (extraBytesToRead) { ! case 5: ch += *from++; ch <<= 6; ! case 4: ch += *from++; ch <<= 6; ! case 3: ch += *from++; ch <<= 6; ! case 2: ch += *from++; ch <<= 6; ! case 1: ch += *from++; ch <<= 6; ! case 0: ch += *from++; ! } ! ch -= offsetsFromUTF8[extraBytesToRead]; ! ! if (ch <= UNI_MAX_BMP) { /* character is <= 0xFFFF */ ! if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { ! from -= (extraBytesToRead+1); /* return to the illegal value itself */ ! cnt = 0; ! break; ! } else { ! *to++ = ch; /* normal case */ ! } ! } else if (ch > UNI_MAX_UTF16) { ! cnt = 0; ! from -= (extraBytesToRead+1); /* return to the start */ ! break; /* Bail out; shouldn't continue */ ! } else { ! /* character is in range 0xFFFF - 0x10FFFF. */ ! ch -= 0x0010000UL; ! *to++ = (ch >> 10) + UNI_SUR_HIGH_START; ! *to++ = (ch & 0x3FFUL) + UNI_SUR_LOW_START; ! } ! cnt++; ! } ! return cnt; } /* * returns the byte length of a UTF-8 word pointed to by s */ int ! 
pg_utf_mblen(const UTF8 *s) { int len = 1; *************** *** 406,418 **** len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; ! else if ((*s & 0xe0) == 0xe0) ! len = 3; return (len); } static int ! pg_utf_dsplen(const unsigned char *s) { return 1; /* XXX fix me! */ } --- 439,457 ---- len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; ! else if ((*s & 0xf0) == 0xe0) ! len = 3; ! else if ((*s & 0xf8) == 0xf0) ! len = 4; ! else if ((*s & 0xfc) == 0xf8) ! len = 5; ! else if ((*s & 0xfe) == 0xfc) ! len = 6; return (len); } static int ! pg_utf_dsplen(const UTF8 *s) { return 1; /* XXX fix me! */ } *************** *** 721,728 **** {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ ! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */ ! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ --- 760,767 ---- {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ ! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */ ! 
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ *************** *** 744,754 **** {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ ! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ ! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ ! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ ! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ ! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ }; /* returns the byte length of a word for mule internal code */ --- 783,793 ---- {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ ! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ ! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ ! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ ! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ ! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ }; /* returns the byte length of a word for mule internal code */ *************** *** 823,837 **** while (len > 0 && *mbstr) { /* special UTF-8 check */ ! if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) ! { ! if (noError) ! return false; ! ereport(ERROR, ! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); } ! 
l = pg_mblen(mbstr); for (i = 1; i < l; i++) --- 862,876 ---- while (len > 0 && *mbstr) { /* special UTF-8 check */ ! if (encoding == PG_UTF8) { ! if(!isLegalUTF8(mbstr,len)) { ! if (noError) return false; ! ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr))); ! } else { ! return true; ! } } ! l = pg_mblen(mbstr); for (i = 1; i < l; i++) Index: src/include/mb/pg_wchar.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v retrieving revision 1.52 diff -c -r1.52 pg_wchar.h *** src/include/mb/pg_wchar.h 17 Sep 2004 21:59:57 -0000 1.52 --- src/include/mb/pg_wchar.h 16 Nov 2004 04:06:02 -0000 *************** *** 16,21 **** --- 16,35 ---- * The pg_wchar */ typedef unsigned int pg_wchar; + typedef unsigned int UTF32; /* at least 32 bits */ + typedef unsigned int UTF16; /* at least 16 bits */ + typedef unsigned char UTF8; /* typically 8 bits */ + + /* Some fundamental constants */ + #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD + #define UNI_MAX_BMP (UTF32)0x0000FFFF + #define UNI_MAX_UTF16 (UTF32)0x0010FFFF + #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF + + #define UNI_SUR_HIGH_START (UTF32)0xD800 + #define UNI_SUR_HIGH_END (UTF32)0xDBFF + #define UNI_SUR_LOW_START (UTF32)0xDC00 + #define UNI_SUR_LOW_END (UTF32)0xDFFF /* * various definitions for EUC *************** *** 339,342 **** --- 353,358 ---- extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); + extern bool isLegalUTF8(const UTF8 *source, int len); + #endif /* PG_WCHAR_H */
---------------------------(end of broadcast)--------------------------- TIP 4: Don't 'kill -9' the postmaster