I have backed out this patch. It is unclear whether it is a bug fix.
It will be saved for 8.1.
---------------------------------------------------------------------------
pgman wrote:
>
> Patch applied. Thanks.
>
> ---------------------------------------------------------------------------
>
>
> John Hansen wrote:
> > 3 times lucky?
> >
> > Last one broke utf8.... Grrrr
> >
> > This one works,.... Too tired, sorry for the inconvenience..
> >
> > ... John
>
> Content-Description: cvs.diff
>
> [ Attachment, skipping... ]
>
> >
> > ---------------------------(end of broadcast)---------------------------
> > TIP 9: the planner will ignore your desire to choose an index scan if your
> > joining column's datatypes do not match
>
> --
> Bruce Momjian | http://candle.pha.pa.us
> [EMAIL PROTECTED] | (610) 359-1001
> + If your life is a hard drive, | 13 Roberts Road
> + Christ can be your backup. | Newtown Square, Pennsylvania 19073
--
Bruce Momjian | http://candle.pha.pa.us
[EMAIL PROTECTED] | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.38
diff -c -r1.38 wchar.c
*** src/backend/utils/mb/wchar.c 17 Sep 2004 21:59:57 -0000 1.38
--- src/backend/utils/mb/wchar.c 21 Nov 2004 09:58:36 -0000
***************
*** 343,348 ****
--- 343,373 ----
return (pg_euc_dsplen(s));
}
+ bool isLegalUTF8(const UTF8 *source, int len) {
+ UTF8 a;
+ const UTF8 *srcptr = source+len;
+ if(!source || (pg_utf_mblen(source) != len)) return false;
+ switch (len) {
+ default: return false;
+ /* Everything else falls through when "true"... */
+ case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+ switch (*source) {
+ /* no fall-through in this inner switch */
+ case 0xE0: if (a < 0xA0) return false; break;
+ case 0xF0: if (a < 0x90) return false; break;
+ case 0xF4: if (a > 0x8F) return false; break;
+ default: if (a < 0x80) return false;
+ }
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+ if (*source > 0xFD) return false;
+ }
+ return true;
+ }
+
/*
* convert UTF-8 string to pg_wchar (UCS-2)
* caller should allocate enough space for "to"
***************
*** 398,404 ****
* returns the byte length of a UTF-8 word pointed to by s
*/
int
! pg_utf_mblen(const unsigned char *s)
{
int len = 1;
--- 423,429 ----
* returns the byte length of a UTF-8 word pointed to by s
*/
int
! pg_utf_mblen(const UTF8 *s)
{
int len = 1;
***************
*** 406,418 ****
len = 1;
else if ((*s & 0xe0) == 0xc0)
len = 2;
! else if ((*s & 0xe0) == 0xe0)
! len = 3;
return (len);
}
static int
! pg_utf_dsplen(const unsigned char *s)
{
return 1; /* XXX fix me! */
}
--- 431,449 ----
len = 1;
else if ((*s & 0xe0) == 0xc0)
len = 2;
! else if ((*s & 0xf0) == 0xe0)
! len = 3;
! else if ((*s & 0xf8) == 0xf0)
! len = 4;
! else if ((*s & 0xfc) == 0xf8)
! len = 5;
! else if ((*s & 0xfe) == 0xfc)
! len = 6;
return (len);
}
static int
! pg_utf_dsplen(const UTF8 *s)
{
return 1; /* XXX fix me! */
}
***************
*** 721,728 ****
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},
/* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},
/* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},
/* 5; PG_JOHAB */
! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6;
PG_UNICODE */
! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7;
PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 9; PG_LATIN2 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 10; PG_LATIN3 */
--- 752,759 ----
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},
/* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},
/* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},
/* 5; PG_JOHAB */
! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6},
/* 6; PG_UNICODE */
! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3},
/* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 9; PG_LATIN2 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 10; PG_LATIN3 */
***************
*** 744,754 ****
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 26; ISO-8859-7 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 27; ISO-8859-8 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 28; PG_WIN1250 */
! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
--- 775,785 ----
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 26; ISO-8859-7 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 27; ISO-8859-8 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},
/* 28; PG_WIN1250 */
! {0, pg_sjis_mblen, pg_sjis_dsplen, 2},
/* 29; PG_SJIS */
! {0, pg_big5_mblen, pg_big5_dsplen, 2},
/* 30; PG_BIG5 */
! {0, pg_gbk_mblen, pg_gbk_dsplen, 2},
/* 31; PG_GBK */
! {0, pg_uhc_mblen, pg_uhc_dsplen, 2},
/* 32; PG_UHC */
! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2}
/* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
***************
*** 822,872 ****
while (len > 0 && *mbstr)
{
- /* special UTF-8 check */
- if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
- {
- if (noError)
- return false;
- ereport(ERROR,
-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("Unicode characters greater
than or equal to 0x10000 are not supported")));
- }
-
l = pg_mblen(mbstr);
! for (i = 1; i < l; i++)
! {
! /*
! * we expect that every multibyte char consists of bytes
! * having the 8th bit set
! */
! if (i >= len || (mbstr[i] & 0x80) == 0)
{
! char buf[8 * 2 + 1];
! char *p = buf;
! int j,
jlimit;
! if (noError)
! return false;
! jlimit = Min(l, len);
! jlimit = Min(jlimit, 8); /*
prevent buffer overrun */
! for (j = 0; j < jlimit; j++)
! p += sprintf(p, "%02x", mbstr[j]);
! ereport(ERROR,
!
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("invalid byte sequence for encoding
\"%s\": 0x%s",
! GetDatabaseEncodingName(), buf)));
}
- }
len -= l;
mbstr += l;
}
-
return true;
}
--- 853,900 ----
while (len > 0 && *mbstr)
{
l = pg_mblen(mbstr);
! /* special UTF-8 check */
! if (encoding == PG_UTF8) {
! if(!isLegalUTF8(mbstr,l)) {
! if (noError) return false;
!
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid
UNICODE byte sequence detected near character %c",*mbstr)));
! }
! } else {
! for (i = 1; i < l; i++)
{
! /*
! * we expect that every multibyte char consists
of bytes
! * having the 8th bit set
! */
! if (i >= len || (mbstr[i] & 0x80) == 0)
! {
! char buf[8 * 2 + 1];
! char *p = buf;
! int j,
jlimit;
! if (noError)
! return false;
! jlimit = Min(l, len);
! jlimit = Min(jlimit, 8);
/* prevent buffer overrun */
! for (j = 0; j < jlimit; j++)
! p += sprintf(p, "%02x",
mbstr[j]);
! ereport(ERROR,
!
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("invalid byte sequence for
encoding \"%s\": 0x%s",
! GetDatabaseEncodingName(),
buf)));
! }
}
+ }
len -= l;
mbstr += l;
}
return true;
}
Index: src/include/mb/pg_wchar.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.52
diff -c -r1.52 pg_wchar.h
*** src/include/mb/pg_wchar.h 17 Sep 2004 21:59:57 -0000 1.52
--- src/include/mb/pg_wchar.h 21 Nov 2004 09:58:36 -0000
***************
*** 17,22 ****
--- 17,30 ----
*/
typedef unsigned int pg_wchar;
+
+ /*
+ * The UTF types
+ */
+ typedef unsigned int UTF32; /* at least 32 bits */
+ typedef unsigned short UTF16; /* at least 16 bits */
+ typedef unsigned char UTF8; /* typically 8 bits */
+
/*
* various definitions for EUC
*/
***************
*** 339,342 ****
--- 347,352 ----
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len,
int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int
len, int lc, unsigned char *tab);
+ extern bool isLegalUTF8(const UTF8 *source, int len);
+
#endif /* PG_WCHAR_H */
---------------------------(end of broadcast)---------------------------
TIP 3: if posting/reading through Usenet, please send an appropriate
subscribe-nomail command to [EMAIL PROTECTED] so that your
message can get through to the mailing list cleanly