Re: [PATCHES] Simplify formatting.c
Euler Taveira de Oliveira wrote: > Tom Lane wrote: > > > Also, it seems a bit inconsistent to be relying on > > oracle_compat.c for upper/lower but not initcap. > > > I saw this inconsistence while I'm doing the patch. What about moving > that upper/lower/initcap and wcs* code to another file. pg_locale.c? > BTW, formatting.c and oracle_compat.c already include pg_locale.h. I researched this idea but is seems pg_locale.c contains only locale-specific stuff, while these functions have locale and non-locale versions; I ended up moving the common stuff into formatting.c. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Bruce Momjian wrote: > > I am starting to think that the simplest case is to keep the single-copy > > version in there for single-byte encodings and not worry about the > > overhead of the multi-byte case. > > My new idea is if we pass the length to str_initcap, we can eliminate > the string copy from text to char *. That leaves us with just one extra > string copy from char * to text, which seems acceptable. We still have > the wide char copy but I don't see any easy way to eliminate that > because the multi-byte code is complex and not something we want to > duplicate. I ended up going in this direction, and did the same for upper and lower. Patch attached and applied. I don't see any other cleanups in this area. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/utils/adt/formatting.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v retrieving revision 1.142 diff -c -c -r1.142 formatting.c *** src/backend/utils/adt/formatting.c 17 Jun 2008 16:09:06 - 1.142 --- src/backend/utils/adt/formatting.c 23 Jun 2008 19:24:35 - *** *** 925,933 static char *str_numth(char *dest, char *num, int type); static int strspace_len(char *str); static int strdigits_len(char *str); - static char *str_toupper(char *buff); - static char *str_tolower(char *buff); - static char *str_initcap(char *buff); static int seq_search(char *name, char **array, int type, int max, int *len); static void do_to_timestamp(text *date_txt, text *fmt, --- 925,930 *** *** 1424,1435 return dest; } /* -- ! * Convert string to upper case. It is designed to be multibyte-aware. * -- */ ! static char * ! str_toupper(char *buff) { char *result; --- 1421,1444 return dest; } + /* + * If the system provides the needed functions for wide-character manipulation + * (which are all standardized by C99), then we implement upper/lower/initcap + * using wide-character functions, if necessary. Otherwise we use the + * traditional functions, which of course will not work as desired + * in multibyte character sets. Note that in either case we are effectively + * assuming that the database character encoding matches the encoding implied + * by LC_CTYPE. + */ + /* -- ! * wide-character-aware lower function ! * We pass the number of bytes so we can pass varlena and char* ! * to this function. * -- */ ! char * ! str_tolower(char *buff, size_t nbytes) { char *result; *** *** 1438,1464 #ifdef USE_WIDE_UPPER_LOWER if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) ! result = wstring_upper(buff); else #endif /* USE_WIDE_UPPER_LOWER */ { char *p; ! result = pstrdup(buff); for (p = result; *p; p++) ! *p = pg_toupper((unsigned char) *p); } return result; } /* -- ! * Convert string to lower case. It is designed to be multibyte-aware. * -- */ ! static char * ! str_tolower(char *buff) { char *result; --- 1447,1492 #ifdef USE_WIDE_UPPER_LOWER if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) ! { ! wchar_t *workspace; ! int curr_char = 0; ! ! /* Output workspace cannot have more codes than input bytes */ ! workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); ! ! char2wchar(workspace, nbytes + 1, buff, nbytes + 1); ! ! for (curr_char = 0; workspace[curr_char] != 0; curr_char++) ! workspace[curr_char] = towlower(workspace[curr_char]); ! ! /* Make result large enough; case change might change number of bytes */ ! result = palloc(curr_char * MB_CUR_MAX + 1); ! ! wchar2char(result, workspace, curr_char * MB_CUR_MAX + 1); ! pfree(workspace); ! } else #endif /* USE_WIDE_UPPER_LOWER */ { char *p; ! result = pnstrdup(buff, nbytes); for (p = result; *p; p++) ! *p = pg_tolower((unsigned char) *p); } return result; } /* -- ! * wide-character-aware upper function ! * We pass the number of bytes so we can pass varlena and char* ! * to this function. * -- */ ! char * ! str_toupper(char *buff, size_t nbytes) { char *result; *** *** 1467,1493 #ifdef USE_WIDE_UPPER_LOWER if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) ! result = wstring_lower(buff); else #endif /* USE_WIDE_UPPER_LOWER */ { char *p; ! result = pstrdup(buff); for (p = result; *p; p++) ! *p = pg_tolower((unsigned char) *p); } return result; } ! /* -- * wide-character-aware initcap function * -- */ ! static char * ! str_initcap(char *buff) { char *result;
Re: [PATCHES] Simplify formatting.c
Bruce Momjian wrote: > > Actually it seems like the hard part is not so much the input > > representation as the output representation --- what should the > > base-level initcap routine return, to be reasonably efficient for > > both cases? > > I hadn't gotten to trying it out yet, but I can see the output being a > problem. You can't even really pre-allocate the storage before passing > it because you don't know the length after case change. You could pass > back a char* and repalloc to get the varlena header in there but that is > very messy. > > Add to that that the multi-byte case also has to be converted to wide > characters, so you have text -> char * -> wide chars -> char * -> text > for the most complex case. > > I am starting to think that the simplest case is to keep the single-copy > version in there for single-byte encodings and not worry about the > overhead of the multi-byte case. My new idea is if we pass the length to str_initcap, we can eliminate the string copy from text to char *. That leaves us with just one extra string copy from char * to text, which seems acceptable. We still have the wide char copy but I don't see any easy way to eliminate that because the multi-byte code is complex and not something we want to duplicate. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Tom Lane wrote: > Bruce Momjian <[EMAIL PROTECTED]> writes: > > Tom Lane wrote: > >> I'd say not. Can't we do some more refactoring and avoid so many > >> useless conversions? Seems like str_initcap is the wrong primitive API > >> --- the work ought to be done by a function that takes a char pointer > >> and a length. That would be a suitable basis for functions operating > >> on both text datums and C strings. > > > Yea, I thought about that idea too but it is going to add a strlen() > > calls in some places, but not in critical ones. > > Sure, but the cost-per-byte of the strlen should be a good bit less than > the cost-per-byte of the actual conversion, so that doesn't bother me > too much. > > Actually it seems like the hard part is not so much the input > representation as the output representation --- what should the > base-level initcap routine return, to be reasonably efficient for > both cases? I hadn't gotten to trying it out yet, but I can see the output being a problem. You can't even really pre-allocate the storage before passing it because you don't know the length after case change. You could pass back a char* and repalloc to get the varlena header in there but that is very messy. Add to that that the multi-byte case also has to be converted to wide characters, so you have text -> char * -> wide chars -> char * -> text for the most complex case. I am starto to think that the simplest case is to keep the single-copy version in there for single-byte encodings and not worry about the overhead of the multi-byte case. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Bruce Momjian <[EMAIL PROTECTED]> writes: > Tom Lane wrote: >> I'd say not. Can't we do some more refactoring and avoid so many >> useless conversions? Seems like str_initcap is the wrong primitive API >> --- the work ought to be done by a function that takes a char pointer >> and a length. That would be a suitable basis for functions operating >> on both text datums and C strings. > Yea, I thought about that idea too but it is going to add a strlen() > calls in some places, but not in critical ones. Sure, but the cost-per-byte of the strlen should be a good bit less than the cost-per-byte of the actual conversion, so that doesn't bother me too much. Actually it seems like the hard part is not so much the input representation as the output representation --- what should the base-level initcap routine return, to be reasonably efficient for both cases? regards, tom lane -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Tom Lane wrote: > Bruce Momjian <[EMAIL PROTECTED]> writes: > > The third step is for oracle_compat.c::initcap() to use > > formatting.c::str_initcap(). You can see the result; patch attached > > (not applied). > > > This greatly reduces the size of initcap(), with the downside that we > > are making two extra copies of the string to convert it to/from char*. > > > Is this acceptable? > > I'd say not. Can't we do some more refactoring and avoid so many > useless conversions? Seems like str_initcap is the wrong primitive API > --- the work ought to be done by a function that takes a char pointer > and a length. That would be a suitable basis for functions operating > on both text datums and C strings. Yea, I thought about that idea too but it is going to add a strlen() calls in some places, but not in critical ones. > (Perhaps what I should be asking is whether the performance of upper() > and lower() is equally bad. Certainly all three should have comparable > code, so maybe they all need refactoring.) Yes, they do. I will work on the length idea and see how that goes. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Bruce Momjian <[EMAIL PROTECTED]> writes: > The third step is for oracle_compat.c::initcap() to use > formatting.c::str_initcap(). You can see the result; patch attached > (not applied). > This greatly reduces the size of initcap(), with the downside that we > are making two extra copies of the string to convert it to/from char*. > Is this acceptable? I'd say not. Can't we do some more refactoring and avoid so many useless conversions? Seems like str_initcap is the wrong primitive API --- the work ought to be done by a function that takes a char pointer and a length. That would be a suitable basis for functions operating on both text datums and C strings. (Perhaps what I should be asking is whether the performance of upper() and lower() is equally bad. Certainly all three should have comparable code, so maybe they all need refactoring.) regards, tom lane -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Bruce Momjian wrote: > Bruce Momjian wrote: > > Alvaro Herrera wrote: > > > Bruce Momjian wrote: > > > > > > > I moved str_initcap() over into oracle_compat.c and then had initcap() > > > > convert to/from TEXT to call it. The code is a little weird because > > > > str_initcap() needs to convert to text to use texttowcs(), so in > > > > multibyte encodings initcap converts the string to text, then to char, > > > > then to text to call texttowcs(). I didn't see a cleaner way to do > > > > this. > > > > > > Why not use wchar2char? It seems there's room for extra cleanup here. > > > > > > Also, the prototype of str_initcap in builtins.h looks out of place. > > > > I talked to Alvaro on IM, and there is certainly much more cleanup to do > > in this area. I will work from the bottom up. First, is moving the > > USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using > > USE_WIDE_UPPER_LOWER instead. Patch attached and applied. > > The second step is to move wchar2char() and char2wchar() from tsearch > into /mb to be easier to use for other modules; also move pnstrdup(). The third step is for oracle_compat.c::initcap() to use formatting.c::str_initcap(). You can see the result; patch attached (not applied). This greatly reduces the size of initcap(), with the downside that we are making two extra copies of the string to convert it to/from char*. Is this acceptable? If it is I will do the same for uppper()/lower() with similar code size reduction and modularity. If not perhaps I should keep the non-multibyte code in initcap() and have only the multi-byte use str_initcap(). -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/utils/adt/formatting.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v retrieving revision 1.142 diff -c -c -r1.142 formatting.c *** src/backend/utils/adt/formatting.c 17 Jun 2008 16:09:06 - 1.142 --- src/backend/utils/adt/formatting.c 21 Jun 2008 20:00:45 - *** *** 1499,1526 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) { wchar_t *workspace; ! text *in_text; ! text *out_text; ! int i; ! in_text = cstring_to_text(buff); ! workspace = texttowcs(in_text); ! for (i = 0; workspace[i] != 0; i++) { if (wasalnum) ! workspace[i] = towlower(workspace[i]); else ! workspace[i] = towupper(workspace[i]); ! wasalnum = iswalnum(workspace[i]); } ! out_text = wcstotext(workspace, i); ! result = text_to_cstring(out_text); pfree(workspace); - pfree(in_text); - pfree(out_text); } else #endif /* USE_WIDE_UPPER_LOWER */ --- 1499,1525 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) { wchar_t *workspace; ! int curr_char = 0; ! /* Output workspace cannot have more codes than input bytes */ ! workspace = (wchar_t *) palloc((strlen(buff) + 1) * sizeof(wchar_t)); ! char2wchar(workspace, strlen(buff) + 1, buff, strlen(buff) + 1); ! ! for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { if (wasalnum) ! workspace[curr_char] = towlower(workspace[curr_char]); else ! workspace[curr_char] = towupper(workspace[curr_char]); ! wasalnum = iswalnum(workspace[curr_char]); } ! /* Make result large enough; case change might change number of bytes */ ! result = palloc(curr_char * MB_CUR_MAX + 1); + wchar2char(result, workspace, curr_char * MB_CUR_MAX + 1); pfree(workspace); } else #endif /* USE_WIDE_UPPER_LOWER */ Index: src/backend/utils/adt/oracle_compat.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v retrieving revision 1.80 diff -c -c -r1.80 oracle_compat.c *** src/backend/utils/adt/oracle_compat.c 17 Jun 2008 16:09:06 - 1.80 --- src/backend/utils/adt/oracle_compat.c 21 Jun 2008 20:00:45 - *** *** 467,530 Datum initcap(PG_FUNCTION_ARGS) { ! #ifdef USE_WIDE_UPPER_LOWER ! /* ! * Use wide char code only when max encoding length > 1 and ctype != C. ! * Some operating systems fail with multi-byte encodings and a C locale. ! * Also, for a C locale there is no need to process as multibyte. ! */ ! if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) ! { ! text *string = PG_GETARG_TEXT_PP(0); ! text *result; ! wchar_t*workspace; ! int wasalnum = 0; ! int i; ! ! workspace = texttowcs(string); ! ! for (i = 0; workspace[i] != 0; i++) ! { ! if (wasalnum) ! workspace[i] = towlower(workspace[i]); ! else ! workspace[i] = towupper(workspace[i]); ! wasalnum = iswalnum(workspace[i]); ! } ! ! result = wcstotext(workspace, i); ! ! pfree(worksp
Re: [PATCHES] Simplify formatting.c
Bruce Momjian wrote: > Alvaro Herrera wrote: > > Bruce Momjian wrote: > > > > > I moved str_initcap() over into oracle_compat.c and then had initcap() > > > convert to/from TEXT to call it. The code is a little weird because > > > str_initcap() needs to convert to text to use texttowcs(), so in > > > multibyte encodings initcap converts the string to text, then to char, > > > then to text to call texttowcs(). I didn't see a cleaner way to do > > > this. > > > > Why not use wchar2char? It seems there's room for extra cleanup here. > > > > Also, the prototype of str_initcap in builtins.h looks out of place. > > I talked to Alvaro on IM, and there is certainly much more cleanup to do > in this area. I will work from the bottom up. First, is moving the > USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using > USE_WIDE_UPPER_LOWER instead. Patch attached and applied. The second step is to move wchar2char() and char2wchar() from tsearch into /mb to be easier to use for other modules; also move pnstrdup(). Patch attached and applied. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/tsearch/ts_locale.c === RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_locale.c,v retrieving revision 1.8 diff -c -c -r1.8 ts_locale.c *** src/backend/tsearch/ts_locale.c 17 Jun 2008 16:09:06 - 1.8 --- src/backend/tsearch/ts_locale.c 18 Jun 2008 18:37:02 - *** *** 16,140 #include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" - #ifdef USE_WIDE_UPPER_LOWER - /* - * wchar2char --- convert wide characters to multibyte format - * - * This has the same API as the standard wcstombs() function; in particular, - * tolen is the maximum number of bytes to store at *to, and *from must be - * zero-terminated. The output will be zero-terminated iff there is room. - */ - size_t - wchar2char(char *to, const wchar_t *from, size_t tolen) - { - if (tolen == 0) - return 0; - - #ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) - { - int r; - - r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen, - NULL, NULL); - - if (r <= 0) - return (size_t) -1; - - Assert(r <= tolen); - - /* Microsoft counts the zero terminator in the result */ - return r - 1; - } - #endif /* WIN32 */ - - return wcstombs(to, from, tolen); - } - - /* - * char2wchar --- convert multibyte characters to wide characters - * - * This has almost the API of mbstowcs(), except that *from need not be - * null-terminated; instead, the number of input bytes is specified as - * fromlen. Also, we ereport() rather than returning -1 for invalid - * input encoding. tolen is the maximum number of wchar_t's to store at *to. - * The output will be zero-terminated iff there is room. - */ - size_t - char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen) - { - if (tolen == 0) - return 0; - - #ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) - { - int r; - - /* stupid Microsloth API does not work for zero-length input */ - if (fromlen == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1); - - if (r <= 0) - { - /* see notes in oracle_compat.c about error reporting */ - pg_verifymbstr(from, fromlen, false); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid multibyte character for locale"), - errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); - } - } - - Assert(r < tolen); - to[r] = 0; - - return r; - } - #endif /* WIN32 */ - - if (lc_ctype_is_c()) - { - /* - * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be - * allocated with sufficient space - */ - return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen); - } - else - { - /* - * mbstowcs requires ending '\0' - */ - char *str = pnstrdup(from, fromlen); - size_t result; - - result = mbstowcs(to, str, tolen); - - pfree(str); - - if (result == (size_t) -1) - { - pg_verifymbstr(from, fromlen, false); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid multibyte character for locale"), - errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); - } - - if (result < tolen) - to[result] = 0; - - return result; - } - } - - int t_isdigit(const char *ptr) { --- 16,23 Index: src/backend/tsearch/ts_utils.c === RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_utils.c,v retrieving revision 1.9 diff -c -c -r1.9 ts_utils.c *** src/backend/tsearch/ts_utils
Re: [PATCHES] Simplify formatting.c
Alvaro Herrera wrote: > Bruce Momjian wrote: > > > I moved str_initcap() over into oracle_compat.c and then had initcap() > > convert to/from TEXT to call it. The code is a little weird because > > str_initcap() needs to convert to text to use texttowcs(), so in > > multibyte encodings initcap converts the string to text, then to char, > > then to text to call texttowcs(). I didn't see a cleaner way to do > > this. > > Why not use wchar2char? It seems there's room for extra cleanup here. > > Also, the prototype of str_initcap in builtins.h looks out of place. I talked to Alvaro on IM, and there is certainly much more cleanup to do in this area. I will work from the bottom up. First, is moving the USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using USE_WIDE_UPPER_LOWER instead. Patch attached and applied. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/tsearch/regis.c === RCS file: /cvsroot/pgsql/src/backend/tsearch/regis.c,v retrieving revision 1.4 diff -c -c -r1.4 regis.c *** src/backend/tsearch/regis.c 21 Jan 2008 02:46:10 - 1.4 --- src/backend/tsearch/regis.c 17 Jun 2008 16:06:54 - *** *** 178,184 r->node = NULL; } ! #ifdef TS_USE_WIDE static bool mb_strchr(char *str, char *c) { --- 178,184 r->node = NULL; } ! #ifdef USE_WIDE_UPPER_LOWER static bool mb_strchr(char *str, char *c) { Index: src/backend/tsearch/ts_locale.c === RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_locale.c,v retrieving revision 1.7 diff -c -c -r1.7 ts_locale.c *** src/backend/tsearch/ts_locale.c 1 Jan 2008 19:45:52 - 1.7 --- src/backend/tsearch/ts_locale.c 17 Jun 2008 16:06:54 - *** *** 17,23 #include "tsearch/ts_public.h" ! #ifdef TS_USE_WIDE /* * wchar2char --- convert wide characters to multibyte format --- 17,23 #include "tsearch/ts_public.h" ! #ifdef USE_WIDE_UPPER_LOWER /* * wchar2char --- convert wide characters to multibyte format *** *** 190,196 return iswprint((wint_t) character[0]); } ! #endif /* TS_USE_WIDE */ /* --- 190,196 return iswprint((wint_t) character[0]); } ! #endif /* USE_WIDE_UPPER_LOWER */ /* *** *** 260,266 if (len == 0) return pstrdup(""); ! #ifdef TS_USE_WIDE /* * Use wide char code only when max encoding length > 1 and ctype != C. --- 260,266 if (len == 0) return pstrdup(""); ! #ifdef USE_WIDE_UPPER_LOWER /* * Use wide char code only when max encoding length > 1 and ctype != C. *** *** 307,313 Assert(wlen < len); } else ! #endif /* TS_USE_WIDE */ { const char *ptr = str; char *outptr; --- 307,313 Assert(wlen < len); } else ! #endif /* USE_WIDE_UPPER_LOWER */ { const char *ptr = str; char *outptr; Index: src/backend/tsearch/wparser_def.c === RCS file: /cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v retrieving revision 1.14 diff -c -c -r1.14 wparser_def.c *** src/backend/tsearch/wparser_def.c 1 Jan 2008 19:45:52 - 1.14 --- src/backend/tsearch/wparser_def.c 17 Jun 2008 16:06:54 - *** *** 238,244 /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ ! #ifdef TS_USE_WIDE wchar_t*wstr; /* wide character string */ int lenwstr; /* length of wsting */ #endif --- 238,244 /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ ! #ifdef USE_WIDE_UPPER_LOWER wchar_t*wstr; /* wide character string */ int lenwstr; /* length of wsting */ #endif *** *** 291,297 prs->str = str; prs->lenstr = len; ! #ifdef TS_USE_WIDE /* * Use wide char code only when max encoding length > 1. --- 291,297 prs->str = str; prs->lenstr = len; ! #ifdef USE_WIDE_UPPER_LOWER /* * Use wide char code only when max encoding length > 1. *** *** 328,334 prs->state = ptr; } ! #ifdef TS_USE_WIDE if (prs->wstr) pfree(prs->wstr); #endif --- 328,334 prs->state = ptr; } ! #ifdef USE_WIDE_UPPER_LOWER if (prs->wstr) pfree(prs->wstr); #endif *** *** 344,350 * often are used for Asian languages */ ! #ifdef TS_USE_WIDE #define p_iswhat(type) \ static int \ --- 344,350 * often are used for Asian languages */ ! #ifdef USE_WIDE_UPPER_LOWER #define p_iswhat(type)
Re: [PATCHES] Simplify formatting.c
Bruce Momjian wrote: > I moved str_initcap() over into oracle_compat.c and then had initcap() > convert to/from TEXT to call it. The code is a little weird because > str_initcap() needs to convert to text to use texttowcs(), so in > multibyte encodings initcap converts the string to text, then to char, > then to text to call texttowcs(). I didn't see a cleaner way to do > this. Why not use wchar2char? It seems there's room for extra cleanup here. Also, the prototype of str_initcap in builtins.h looks out of place. -- Alvaro Herrerahttp://www.CommandPrompt.com/ PostgreSQL Replication, Consulting, Custom Development, 24x7 support -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Tom Lane wrote: > Euler Taveira de Oliveira <[EMAIL PROTECTED]> writes: > > Tom Lane wrote: > >> Also, it seems a bit inconsistent to be relying on > >> oracle_compat.c for upper/lower but not initcap. > >> > > I saw this inconsistence while I'm doing the patch. What about moving > > that upper/lower/initcap and wcs* code to another file. pg_locale.c? > > That doesn't seem a particularly appropriate place for them. pg_locale > is about dealing with the locale state, not about doing actual > operations based on the locale data. > > I was just thinking of having oracle_compat expose an initcap routine. You mean like the attached? I moved str_initcap() over into oracle_compat.c and then had initcap() convert to/from TEXT to call it. The code is a little weird because str_initcap() needs to convert to text to use texttowcs(), so in multibyte encodings initcap converts the string to text, then to char, then to text to call texttowcs(). I didn't see a cleaner way to do this. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/utils/adt/formatting.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v retrieving revision 1.141 diff -c -c -r1.141 formatting.c *** src/backend/utils/adt/formatting.c 20 May 2008 01:41:02 - 1.141 --- src/backend/utils/adt/formatting.c 13 Jun 2008 22:01:18 - *** *** 927,933 static int strdigits_len(char *str); static char *str_toupper(char *buff); static char *str_tolower(char *buff); - static char *str_initcap(char *buff); static int seq_search(char *name, char **array, int type, int max, int *len); static void do_to_timestamp(text *date_txt, text *fmt, --- 927,932 *** *** 1484,1549 } /* -- - * wide-character-aware initcap function - * -- - */ - static char * - str_initcap(char *buff) - { - char *result; - bool wasalnum = false; - - if (!buff) - return NULL; - - #ifdef USE_WIDE_UPPER_LOWER - if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) - { - wchar_t *workspace; - text *in_text; - text *out_text; - int i; - - in_text = cstring_to_text(buff); - workspace = texttowcs(in_text); - - for (i = 0; workspace[i] != 0; i++) - { - if (wasalnum) - workspace[i] = towlower(workspace[i]); - else - workspace[i] = towupper(workspace[i]); - wasalnum = iswalnum(workspace[i]); - } - - out_text = wcstotext(workspace, i); - result = text_to_cstring(out_text); - - pfree(workspace); - pfree(in_text); - pfree(out_text); - } - else - #endif /* USE_WIDE_UPPER_LOWER */ - { - char *p; - - result = pstrdup(buff); - - for (p = result; *p; p++) - { - if (wasalnum) - *p = pg_tolower((unsigned char) *p); - else - *p = pg_toupper((unsigned char) *p); - wasalnum = isalnum((unsigned char) *p); - } - } - - return result; - } - - /* -- * Sequential search with to upper/lower conversion * -- */ --- 1483,1488 Index: src/backend/utils/adt/oracle_compat.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v retrieving revision 1.79 diff -c -c -r1.79 oracle_compat.c *** src/backend/utils/adt/oracle_compat.c 19 May 2008 18:08:16 - 1.79 --- src/backend/utils/adt/oracle_compat.c 13 Jun 2008 22:01:18 - *** *** 471,478 Datum initcap(PG_FUNCTION_ARGS) { ! #ifdef USE_WIDE_UPPER_LOWER /* * Use wide char code only when max encoding length > 1 and ctype != C. * Some operating systems fail with multi-byte encodings and a C locale. --- 471,496 Datum initcap(PG_FUNCTION_ARGS) { ! text *string = PG_GETARG_TEXT_PP(0); ! char *str2; ! ! str2 = str_initcap(DatumGetCString(string)); ! string = cstring_to_text(str2); ! pfree(str2); ! PG_RETURN_TEXT_P(string); ! } ! + char * + str_initcap(char *str) + { + char *result; + int wasalnum = 0; + + if (!str) + return NULL; + + #ifdef USE_WIDE_UPPER_LOWER /* * Use wide char code only when max encoding length > 1 and ctype != C. * Some operating systems fail with multi-byte encodings and a C locale. *** *** 480,492 */ if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) { - text *string = PG_GETARG_TEXT_PP(0); - text *result; wchar_t*workspace; ! int wasalnum = 0; int i; ! workspace = texttowcs(string); for (i = 0; workspace[i] != 0; i++) { --- 498,510 */ if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) { wchar_t*workspace; ! text *in_text; ! text *out_text; int i; ! in_text = cstring_to_text(str); ! workspace = texttowc
Re: [PATCHES] Simplify formatting.c
Euler Taveira de Oliveira <[EMAIL PROTECTED]> writes: > Tom Lane wrote: >> Also, it seems a bit inconsistent to be relying on >> oracle_compat.c for upper/lower but not initcap. >> > I saw this inconsistence while I'm doing the patch. What about moving > that upper/lower/initcap and wcs* code to another file. pg_locale.c? That doesn't seem a particularly appropriate place for them. pg_locale is about dealing with the locale state, not about doing actual operations based on the locale data. I was just thinking of having oracle_compat expose an initcap routine. regards, tom lane -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Tom Lane wrote: Also, it seems a bit inconsistent to be relying on oracle_compat.c for upper/lower but not initcap. I saw this inconsistence while I'm doing the patch. What about moving that upper/lower/initcap and wcs* code to another file. pg_locale.c? BTW, formatting.c and oracle_compat.c already include pg_locale.h. -- Euler Taveira de Oliveira http://www.timbira.com/ -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
Re: [PATCHES] Simplify formatting.c
Bruce Momjian <[EMAIL PROTECTED]> writes: > Now that upper/lower/initcase do not modify the passed string, I have > simplified formatting.c with the attached patch. I was thinking the same thing while reading the patch. But please, make str_toupper() and friends declare their argument "const" if you're going to do this. Another issue in this area is that these functions could do with some refactoring to eliminate useless text/cstring conversions; I'm pretty sure some code paths are doing cstring->text->cstring in direct succession. Also, it seems a bit inconsistent to be relying on oracle_compat.c for upper/lower but not initcap. regards, tom lane -- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
[PATCHES] Simplify formatting.c
Now that upper/lower/initcase do not modify the passed string, I have simplified formatting.c with the attached patch. -- Bruce Momjian <[EMAIL PROTECTED]>http://momjian.us EnterpriseDB http://enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/utils/adt/formatting.c === RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v retrieving revision 1.140 diff -c -c -r1.140 formatting.c *** src/backend/utils/adt/formatting.c 19 May 2008 18:08:15 - 1.140 --- src/backend/utils/adt/formatting.c 20 May 2008 01:37:23 - *** *** 1894,1903 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_full_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_toupper(workbuff)); ! } else { strcpy(workbuff, months_full[tm->tm_mon - 1]); --- 1894,1900 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_toupper(localized_full_months[tm->tm_mon - 1])); else { strcpy(workbuff, months_full[tm->tm_mon - 1]); *** *** 1910,1923 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_full_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_initcap(workbuff)); ! } else - { sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]); - } s += strlen(s); break; case DCH_month: --- 1907,1915 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_initcap(localized_full_months[tm->tm_mon - 1])); else sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]); s += strlen(s); break; case DCH_month: *** *** 1925,1934 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_full_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_tolower(workbuff)); ! } else { sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]); --- 1917,1923 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_tolower(localized_full_months[tm->tm_mon - 1])); else { sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]); *** *** 1941,1955 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_toupper(workbuff)); ! } else ! { ! strcpy(workbuff, months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_toupper(workbuff)); ! } s += strlen(s); break; case DCH_Mon: --- 1930,1938 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_toupper(localized_abbrev_months[tm->tm_mon - 1])); else ! strcpy(s, str_toupper(months[tm->tm_mon - 1])); s += strlen(s); break; case DCH_Mon: *** *** 1957,1970 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_initcap(workbuff)); ! } else - { strcpy(s, months[tm->tm_mon - 1]); - } s += strlen(s); break; case DCH_mon: --- 1940,1948 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_initcap(localized_abbrev_months[tm->tm_mon - 1])); else strcpy(s, months[tm->tm_mon - 1]); s += strlen(s); break; case DCH_mon: *** *** 1972,1981 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]); ! sprintf(s, "%*s", 0, str_tolower(workbuff)); ! } else { strcpy(s, months[tm->tm_mon - 1]); --- 1950,1956 if (!tm->tm_mon) break; if (S_TM(n->suffix)) ! strcpy(s, str_tolower(localized_abbrev_months[tm->tm_mon - 1])); else { strcpy(s, months[tm->tm_mon - 1]); *** *** 1992,2001 case DCH_DAY: INVALID_FOR_INTERVAL; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_full_days[tm->tm_wday]); ! sprintf(s, "%*s", 0, str_toupper(workbuff)); ! } else { strcpy(workbuff, days[tm->tm_wday]); --- 1967,1973 case DCH_DAY: INVALID_FOR_INTERVAL; if (S_TM(n->suffix)) ! strcpy(s, str_toupper(localized_full_days[tm->tm_wday])); else { strcpy(workbuff, days[tm->tm_wday]); *** *** 2006,2028 case DCH_Day: INVALID_FOR_INTERVAL; if (S_TM(n->suffix)) ! { ! strcpy(workbuff, localized_full_days[tm->t