Re: [PATCHES] Simplify formatting.c

2008-06-23 Thread Bruce Momjian
Euler Taveira de Oliveira wrote:
> Tom Lane wrote:
> 
> > Also, it seems a bit inconsistent to be relying on
> > oracle_compat.c for upper/lower but not initcap.
> > 
> I saw this inconsistence while I'm doing the patch. What about moving 
> that upper/lower/initcap and wcs* code to another file. pg_locale.c? 
> BTW, formatting.c and oracle_compat.c already include pg_locale.h.

I researched this idea but is seems pg_locale.c contains only
locale-specific stuff, while these functions have locale and non-locale
versions;  I ended up moving the common stuff into formatting.c.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-23 Thread Bruce Momjian
Bruce Momjian wrote:
> > I am starting to think that the simplest case is to keep the single-copy
> > version in there for single-byte encodings and not worry about the
> > overhead of the multi-byte case.
> 
> My new idea is if we pass the length to str_initcap, we can eliminate
> the string copy from text to char *.  That leaves us with just one extra
> string copy from char * to text, which seems acceptable.  We still have
> the wide char copy but I don't see any easy way to eliminate that
> because the multi-byte code is complex and not something we want to
> duplicate.

I ended up going in this direction, and did the same for upper and
lower.  Patch attached and applied.   I don't see any other cleanups in
this area.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/utils/adt/formatting.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v
retrieving revision 1.142
diff -c -c -r1.142 formatting.c
*** src/backend/utils/adt/formatting.c	17 Jun 2008 16:09:06 -	1.142
--- src/backend/utils/adt/formatting.c	23 Jun 2008 19:24:35 -
***
*** 925,933 
  static char *str_numth(char *dest, char *num, int type);
  static int	strspace_len(char *str);
  static int	strdigits_len(char *str);
- static char *str_toupper(char *buff);
- static char *str_tolower(char *buff);
- static char *str_initcap(char *buff);
  
  static int	seq_search(char *name, char **array, int type, int max, int *len);
  static void do_to_timestamp(text *date_txt, text *fmt,
--- 925,930 
***
*** 1424,1435 
  	return dest;
  }
  
  /* --
!  * Convert string to upper case. It is designed to be multibyte-aware.
   * --
   */
! static char *
! str_toupper(char *buff)
  {
  	char		*result;
  
--- 1421,1444 
  	return dest;
  }
  
+ /*
+  * If the system provides the needed functions for wide-character manipulation
+  * (which are all standardized by C99), then we implement upper/lower/initcap
+  * using wide-character functions, if necessary.  Otherwise we use the
+  * traditional  functions, which of course will not work as desired
+  * in multibyte character sets.  Note that in either case we are effectively
+  * assuming that the database character encoding matches the encoding implied
+  * by LC_CTYPE.
+  */
+ 
  /* --
!  * wide-character-aware lower function
!  * We pass the number of bytes so we can pass varlena and char*
!  * to this function.
   * --
   */
! char *
! str_tolower(char *buff, size_t nbytes)
  {
  	char		*result;
  
***
*** 1438,1464 
  
  #ifdef USE_WIDE_UPPER_LOWER
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
! 		result = wstring_upper(buff);
  	else
  #endif		/* USE_WIDE_UPPER_LOWER */
  	{
  		char *p;
  
! 		result = pstrdup(buff);
  
  		for (p = result; *p; p++)
! 			*p = pg_toupper((unsigned char) *p);
  	}
  
  	return result;
  }
  
  /* --
!  * Convert string to lower case. It is designed to be multibyte-aware.
   * --
   */
! static char *
! str_tolower(char *buff)
  {
  	char		*result;
  
--- 1447,1492 
  
  #ifdef USE_WIDE_UPPER_LOWER
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
! 	{
! 		wchar_t		*workspace;
! 		int			curr_char = 0;
! 
! 		/* Output workspace cannot have more codes than input bytes */
! 		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
! 
! 		char2wchar(workspace, nbytes + 1, buff, nbytes + 1);
! 
! 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
! 			workspace[curr_char] = towlower(workspace[curr_char]);
! 
! 		/* Make result large enough; case change might change number of bytes */
! 		result = palloc(curr_char * MB_CUR_MAX + 1);
! 
! 		wchar2char(result, workspace, curr_char * MB_CUR_MAX + 1);
! 		pfree(workspace);
! 	}
  	else
  #endif		/* USE_WIDE_UPPER_LOWER */
  	{
  		char *p;
  
! 		result = pnstrdup(buff, nbytes);
  
  		for (p = result; *p; p++)
! 			*p = pg_tolower((unsigned char) *p);
  	}
  
  	return result;
  }
  
  /* --
!  * wide-character-aware upper function
!  * We pass the number of bytes so we can pass varlena and char*
!  * to this function.
   * --
   */
! char *
! str_toupper(char *buff, size_t nbytes)
  {
  	char		*result;
  
***
*** 1467,1493 
  
  #ifdef USE_WIDE_UPPER_LOWER
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
! 		result = wstring_lower(buff);
  	else
  #endif		/* USE_WIDE_UPPER_LOWER */
  	{
  		char *p;
  
! 		result = pstrdup(buff);
  
  		for (p = result; *p; p++)
! 			*p = pg_tolower((unsigned char) *p);
  	}
  
  	return result;
  }
!   
  /* --
   * wide-character-aware initcap function
   * --
   */
! static char *
! str_initcap(char *buff)
  {
  	char		*result;

Re: [PATCHES] Simplify formatting.c

2008-06-22 Thread Bruce Momjian
Bruce Momjian wrote:
> > Actually it seems like the hard part is not so much the input
> > representation as the output representation --- what should the
> > base-level initcap routine return, to be reasonably efficient for
> > both cases?
> 
> I hadn't gotten to trying it out yet, but I can see the output being a
> problem.  You can't even really pre-allocate the storage before passing
> it because you don't know the length after case change.  You could pass
> back a char* and repalloc to get the varlena header in there but that is
> very messy.
> 
> Add to that that the multi-byte case also has to be converted to wide
> characters, so you have text -> char * -> wide chars -> char * -> text
> for the most complex case.
> 
> I am starting to think that the simplest case is to keep the single-copy
> version in there for single-byte encodings and not worry about the
> overhead of the multi-byte case.

My new idea is if we pass the length to str_initcap, we can eliminate
the string copy from text to char *.  That leaves us with just one extra
string copy from char * to text, which seems acceptable.  We still have
the wide char copy but I don't see any easy way to eliminate that
because the multi-byte code is complex and not something we want to
duplicate.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-21 Thread Bruce Momjian
Tom Lane wrote:
> Bruce Momjian <[EMAIL PROTECTED]> writes:
> > Tom Lane wrote:
> >> I'd say not.  Can't we do some more refactoring and avoid so many
> >> useless conversions?  Seems like str_initcap is the wrong primitive API
> >> --- the work ought to be done by a function that takes a char pointer
> >> and a length.  That would be a suitable basis for functions operating
> >> on both text datums and C strings.
> 
> > Yea, I thought about that idea too but it is going to add a strlen()
> > calls in some places, but not in critical ones.
> 
> Sure, but the cost-per-byte of the strlen should be a good bit less than
> the cost-per-byte of the actual conversion, so that doesn't bother me
> too much.
> 
> Actually it seems like the hard part is not so much the input
> representation as the output representation --- what should the
> base-level initcap routine return, to be reasonably efficient for
> both cases?

I hadn't gotten to trying it out yet, but I can see the output being a
problem.  You can't even really pre-allocate the storage before passing
it because you don't know the length after case change.  You could pass
back a char* and repalloc to get the varlena header in there but that is
very messy.

Add to that that the multi-byte case also has to be converted to wide
characters, so you have text -> char * -> wide chars -> char * -> text
for the most complex case.

I am starto to think that the simplest case is to keep the single-copy
version in there for single-byte encodings and not worry about the
overhead of the multi-byte case.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-21 Thread Tom Lane
Bruce Momjian <[EMAIL PROTECTED]> writes:
> Tom Lane wrote:
>> I'd say not.  Can't we do some more refactoring and avoid so many
>> useless conversions?  Seems like str_initcap is the wrong primitive API
>> --- the work ought to be done by a function that takes a char pointer
>> and a length.  That would be a suitable basis for functions operating
>> on both text datums and C strings.

> Yea, I thought about that idea too but it is going to add a strlen()
> calls in some places, but not in critical ones.

Sure, but the cost-per-byte of the strlen should be a good bit less than
the cost-per-byte of the actual conversion, so that doesn't bother me
too much.

Actually it seems like the hard part is not so much the input
representation as the output representation --- what should the
base-level initcap routine return, to be reasonably efficient for
both cases?

regards, tom lane

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-21 Thread Bruce Momjian
Tom Lane wrote:
> Bruce Momjian <[EMAIL PROTECTED]> writes:
> > The third step is for oracle_compat.c::initcap() to use
> > formatting.c::str_initcap().  You can see the result;  patch attached
> > (not applied).
> 
> > This greatly reduces the size of initcap(), with the downside that we
> > are making two extra copies of the string to convert it to/from char*.
> 
> > Is this acceptable?
> 
> I'd say not.  Can't we do some more refactoring and avoid so many
> useless conversions?  Seems like str_initcap is the wrong primitive API
> --- the work ought to be done by a function that takes a char pointer
> and a length.  That would be a suitable basis for functions operating
> on both text datums and C strings.

Yea, I thought about that idea too but it is going to add a strlen()
calls in some places, but not in critical ones.

> (Perhaps what I should be asking is whether the performance of upper()
> and lower() is equally bad.  Certainly all three should have comparable
> code, so maybe they all need refactoring.)

Yes, they do.  I will work on the length idea and see how that goes.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-21 Thread Tom Lane
Bruce Momjian <[EMAIL PROTECTED]> writes:
> The third step is for oracle_compat.c::initcap() to use
> formatting.c::str_initcap().  You can see the result;  patch attached
> (not applied).

> This greatly reduces the size of initcap(), with the downside that we
> are making two extra copies of the string to convert it to/from char*.

> Is this acceptable?

I'd say not.  Can't we do some more refactoring and avoid so many
useless conversions?  Seems like str_initcap is the wrong primitive API
--- the work ought to be done by a function that takes a char pointer
and a length.  That would be a suitable basis for functions operating
on both text datums and C strings.

(Perhaps what I should be asking is whether the performance of upper()
and lower() is equally bad.  Certainly all three should have comparable
code, so maybe they all need refactoring.)

regards, tom lane

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-21 Thread Bruce Momjian
Bruce Momjian wrote:
> Bruce Momjian wrote:
> > Alvaro Herrera wrote:
> > > Bruce Momjian wrote:
> > > 
> > > > I moved str_initcap() over into oracle_compat.c and then had initcap()
> > > > convert to/from TEXT to call it.  The code is a little weird because
> > > > str_initcap() needs to convert to text to use texttowcs(), so in
> > > > multibyte encodings initcap converts the string to text, then to char,
> > > > then to text to call texttowcs().  I didn't see a cleaner way to do
> > > > this.
> > > 
> > > Why not use wchar2char?  It seems there's room for extra cleanup here.
> > > 
> > > Also, the prototype of str_initcap in builtins.h looks out of place.
> > 
> > I talked to Alvaro on IM, and there is certainly much more cleanup to do
> > in this area. I will work from the bottom up.  First, is moving the
> > USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using
> > USE_WIDE_UPPER_LOWER instead.  Patch attached and applied.
> 
> The second step is to move wchar2char() and char2wchar() from tsearch
> into /mb to be easier to use for other modules;  also move pnstrdup(). 

The third step is for oracle_compat.c::initcap() to use
formatting.c::str_initcap().  You can see the result;  patch attached
(not applied).

This greatly reduces the size of initcap(), with the downside that we
are making two extra copies of the string to convert it to/from char*.

Is this acceptable?  If it is I will do the same for uppper()/lower()
with similar code size reduction and modularity.

If not perhaps I should keep the non-multibyte code in initcap() and
have only the multi-byte use str_initcap().

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/utils/adt/formatting.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v
retrieving revision 1.142
diff -c -c -r1.142 formatting.c
*** src/backend/utils/adt/formatting.c	17 Jun 2008 16:09:06 -	1.142
--- src/backend/utils/adt/formatting.c	21 Jun 2008 20:00:45 -
***
*** 1499,1526 
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
  	{
  		wchar_t		*workspace;
! 		text		*in_text;
! 		text		*out_text;
! 		int			i;
  
! 		in_text = cstring_to_text(buff);
! 		workspace = texttowcs(in_text);
  
! 		for (i = 0; workspace[i] != 0; i++)
  		{
  			if (wasalnum)
! workspace[i] = towlower(workspace[i]);
  			else
! workspace[i] = towupper(workspace[i]);
! 			wasalnum = iswalnum(workspace[i]);
  		}
  
! 		out_text = wcstotext(workspace, i);
! 		result = text_to_cstring(out_text);
  
  		pfree(workspace);
- 		pfree(in_text);
- 		pfree(out_text);
  	}
  	else
  #endif		/* USE_WIDE_UPPER_LOWER */
--- 1499,1525 
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
  	{
  		wchar_t		*workspace;
! 		int			curr_char = 0;
  
! 		/* Output workspace cannot have more codes than input bytes */
! 		workspace = (wchar_t *) palloc((strlen(buff) + 1) * sizeof(wchar_t));
  
! 		char2wchar(workspace, strlen(buff) + 1, buff, strlen(buff) + 1);
! 
! 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
  		{
  			if (wasalnum)
! workspace[curr_char] = towlower(workspace[curr_char]);
  			else
! workspace[curr_char] = towupper(workspace[curr_char]);
! 			wasalnum = iswalnum(workspace[curr_char]);
  		}
  
! 		/* Make result large enough; case change might change number of bytes */
! 		result = palloc(curr_char * MB_CUR_MAX + 1);
  
+ 		wchar2char(result, workspace, curr_char * MB_CUR_MAX + 1);
  		pfree(workspace);
  	}
  	else
  #endif		/* USE_WIDE_UPPER_LOWER */
Index: src/backend/utils/adt/oracle_compat.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v
retrieving revision 1.80
diff -c -c -r1.80 oracle_compat.c
*** src/backend/utils/adt/oracle_compat.c	17 Jun 2008 16:09:06 -	1.80
--- src/backend/utils/adt/oracle_compat.c	21 Jun 2008 20:00:45 -
***
*** 467,530 
  Datum
  initcap(PG_FUNCTION_ARGS)
  {
! #ifdef USE_WIDE_UPPER_LOWER
  
! 	/*
! 	 * Use wide char code only when max encoding length > 1 and ctype != C.
! 	 * Some operating systems fail with multi-byte encodings and a C locale.
! 	 * Also, for a C locale there is no need to process as multibyte.
! 	 */
! 	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
! 	{
! 		text	   *string = PG_GETARG_TEXT_PP(0);
! 		text	   *result;
! 		wchar_t*workspace;
! 		int			wasalnum = 0;
! 		int			i;
! 
! 		workspace = texttowcs(string);
! 
! 		for (i = 0; workspace[i] != 0; i++)
! 		{
! 			if (wasalnum)
! workspace[i] = towlower(workspace[i]);
! 			else
! workspace[i] = towupper(workspace[i]);
! 			wasalnum = iswalnum(workspace[i]);
! 		}
! 
! 		result = wcstotext(workspace, i);
! 
! 		pfree(worksp

Re: [PATCHES] Simplify formatting.c

2008-06-18 Thread Bruce Momjian
Bruce Momjian wrote:
> Alvaro Herrera wrote:
> > Bruce Momjian wrote:
> > 
> > > I moved str_initcap() over into oracle_compat.c and then had initcap()
> > > convert to/from TEXT to call it.  The code is a little weird because
> > > str_initcap() needs to convert to text to use texttowcs(), so in
> > > multibyte encodings initcap converts the string to text, then to char,
> > > then to text to call texttowcs().  I didn't see a cleaner way to do
> > > this.
> > 
> > Why not use wchar2char?  It seems there's room for extra cleanup here.
> > 
> > Also, the prototype of str_initcap in builtins.h looks out of place.
> 
> I talked to Alvaro on IM, and there is certainly much more cleanup to do
> in this area. I will work from the bottom up.  First, is moving the
> USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using
> USE_WIDE_UPPER_LOWER instead.  Patch attached and applied.

The second step is to move wchar2char() and char2wchar() from tsearch
into /mb to be easier to use for other modules;  also move pnstrdup(). 

Patch attached and applied.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/tsearch/ts_locale.c
===
RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_locale.c,v
retrieving revision 1.8
diff -c -c -r1.8 ts_locale.c
*** src/backend/tsearch/ts_locale.c	17 Jun 2008 16:09:06 -	1.8
--- src/backend/tsearch/ts_locale.c	18 Jun 2008 18:37:02 -
***
*** 16,140 
  #include "tsearch/ts_locale.h"
  #include "tsearch/ts_public.h"
  
- 
  #ifdef USE_WIDE_UPPER_LOWER
  
- /*
-  * wchar2char --- convert wide characters to multibyte format
-  *
-  * This has the same API as the standard wcstombs() function; in particular,
-  * tolen is the maximum number of bytes to store at *to, and *from must be
-  * zero-terminated.  The output will be zero-terminated iff there is room.
-  */
- size_t
- wchar2char(char *to, const wchar_t *from, size_t tolen)
- {
- 	if (tolen == 0)
- 		return 0;
- 
- #ifdef WIN32
- 	if (GetDatabaseEncoding() == PG_UTF8)
- 	{
- 		int			r;
- 
- 		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
- NULL, NULL);
- 
- 		if (r <= 0)
- 			return (size_t) -1;
- 
- 		Assert(r <= tolen);
- 
- 		/* Microsoft counts the zero terminator in the result */
- 		return r - 1;
- 	}
- #endif   /* WIN32 */
- 
- 	return wcstombs(to, from, tolen);
- }
- 
- /*
-  * char2wchar --- convert multibyte characters to wide characters
-  *
-  * This has almost the API of mbstowcs(), except that *from need not be
-  * null-terminated; instead, the number of input bytes is specified as
-  * fromlen.  Also, we ereport() rather than returning -1 for invalid
-  * input encoding.	tolen is the maximum number of wchar_t's to store at *to.
-  * The output will be zero-terminated iff there is room.
-  */
- size_t
- char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
- {
- 	if (tolen == 0)
- 		return 0;
- 
- #ifdef WIN32
- 	if (GetDatabaseEncoding() == PG_UTF8)
- 	{
- 		int			r;
- 
- 		/* stupid Microsloth API does not work for zero-length input */
- 		if (fromlen == 0)
- 			r = 0;
- 		else
- 		{
- 			r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
- 
- 			if (r <= 0)
- 			{
- /* see notes in oracle_compat.c about error reporting */
- pg_verifymbstr(from, fromlen, false);
- ereport(ERROR,
- 		(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- 		 errmsg("invalid multibyte character for locale"),
- 		 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
- 			}
- 		}
- 
- 		Assert(r < tolen);
- 		to[r] = 0;
- 
- 		return r;
- 	}
- #endif   /* WIN32 */
- 
- 	if (lc_ctype_is_c())
- 	{
- 		/*
- 		 * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
- 		 * allocated with sufficient space
- 		 */
- 		return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
- 	}
- 	else
- 	{
- 		/*
- 		 * mbstowcs requires ending '\0'
- 		 */
- 		char	   *str = pnstrdup(from, fromlen);
- 		size_t		result;
- 
- 		result = mbstowcs(to, str, tolen);
- 
- 		pfree(str);
- 
- 		if (result == (size_t) -1)
- 		{
- 			pg_verifymbstr(from, fromlen, false);
- 			ereport(ERROR,
- 	(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- 	 errmsg("invalid multibyte character for locale"),
- 	 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
- 		}
- 
- 		if (result < tolen)
- 			to[result] = 0;
- 
- 		return result;
- 	}
- }
- 
- 
  int
  t_isdigit(const char *ptr)
  {
--- 16,23 
Index: src/backend/tsearch/ts_utils.c
===
RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.9
diff -c -c -r1.9 ts_utils.c
*** src/backend/tsearch/ts_utils

Re: [PATCHES] Simplify formatting.c

2008-06-17 Thread Bruce Momjian
Alvaro Herrera wrote:
> Bruce Momjian wrote:
> 
> > I moved str_initcap() over into oracle_compat.c and then had initcap()
> > convert to/from TEXT to call it.  The code is a little weird because
> > str_initcap() needs to convert to text to use texttowcs(), so in
> > multibyte encodings initcap converts the string to text, then to char,
> > then to text to call texttowcs().  I didn't see a cleaner way to do
> > this.
> 
> Why not use wchar2char?  It seems there's room for extra cleanup here.
> 
> Also, the prototype of str_initcap in builtins.h looks out of place.

I talked to Alvaro on IM, and there is certainly much more cleanup to do
in this area. I will work from the bottom up.  First, is moving the
USE_WIDE_UPPER_LOWER define to c.h, and removing TS_USE_WIDE and using
USE_WIDE_UPPER_LOWER instead.  Patch attached and applied.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/tsearch/regis.c
===
RCS file: /cvsroot/pgsql/src/backend/tsearch/regis.c,v
retrieving revision 1.4
diff -c -c -r1.4 regis.c
*** src/backend/tsearch/regis.c	21 Jan 2008 02:46:10 -	1.4
--- src/backend/tsearch/regis.c	17 Jun 2008 16:06:54 -
***
*** 178,184 
  	r->node = NULL;
  }
  
! #ifdef TS_USE_WIDE
  static bool
  mb_strchr(char *str, char *c)
  {
--- 178,184 
  	r->node = NULL;
  }
  
! #ifdef USE_WIDE_UPPER_LOWER
  static bool
  mb_strchr(char *str, char *c)
  {
Index: src/backend/tsearch/ts_locale.c
===
RCS file: /cvsroot/pgsql/src/backend/tsearch/ts_locale.c,v
retrieving revision 1.7
diff -c -c -r1.7 ts_locale.c
*** src/backend/tsearch/ts_locale.c	1 Jan 2008 19:45:52 -	1.7
--- src/backend/tsearch/ts_locale.c	17 Jun 2008 16:06:54 -
***
*** 17,23 
  #include "tsearch/ts_public.h"
  
  
! #ifdef TS_USE_WIDE
  
  /*
   * wchar2char --- convert wide characters to multibyte format
--- 17,23 
  #include "tsearch/ts_public.h"
  
  
! #ifdef USE_WIDE_UPPER_LOWER
  
  /*
   * wchar2char --- convert wide characters to multibyte format
***
*** 190,196 
  
  	return iswprint((wint_t) character[0]);
  }
! #endif   /* TS_USE_WIDE */
  
  
  /*
--- 190,196 
  
  	return iswprint((wint_t) character[0]);
  }
! #endif   /* USE_WIDE_UPPER_LOWER */
  
  
  /*
***
*** 260,266 
  	if (len == 0)
  		return pstrdup("");
  
! #ifdef TS_USE_WIDE
  
  	/*
  	 * Use wide char code only when max encoding length > 1 and ctype != C.
--- 260,266 
  	if (len == 0)
  		return pstrdup("");
  
! #ifdef USE_WIDE_UPPER_LOWER
  
  	/*
  	 * Use wide char code only when max encoding length > 1 and ctype != C.
***
*** 307,313 
  		Assert(wlen < len);
  	}
  	else
! #endif   /* TS_USE_WIDE */
  	{
  		const char *ptr = str;
  		char	   *outptr;
--- 307,313 
  		Assert(wlen < len);
  	}
  	else
! #endif   /* USE_WIDE_UPPER_LOWER */
  	{
  		const char *ptr = str;
  		char	   *outptr;
Index: src/backend/tsearch/wparser_def.c
===
RCS file: /cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.14
diff -c -c -r1.14 wparser_def.c
*** src/backend/tsearch/wparser_def.c	1 Jan 2008 19:45:52 -	1.14
--- src/backend/tsearch/wparser_def.c	17 Jun 2008 16:06:54 -
***
*** 238,244 
  	/* string and position information */
  	char	   *str;			/* multibyte string */
  	int			lenstr;			/* length of mbstring */
! #ifdef TS_USE_WIDE
  	wchar_t*wstr;			/* wide character string */
  	int			lenwstr;		/* length of wsting */
  #endif
--- 238,244 
  	/* string and position information */
  	char	   *str;			/* multibyte string */
  	int			lenstr;			/* length of mbstring */
! #ifdef USE_WIDE_UPPER_LOWER
  	wchar_t*wstr;			/* wide character string */
  	int			lenwstr;		/* length of wsting */
  #endif
***
*** 291,297 
  	prs->str = str;
  	prs->lenstr = len;
  
! #ifdef TS_USE_WIDE
  
  	/*
  	 * Use wide char code only when max encoding length > 1.
--- 291,297 
  	prs->str = str;
  	prs->lenstr = len;
  
! #ifdef USE_WIDE_UPPER_LOWER
  
  	/*
  	 * Use wide char code only when max encoding length > 1.
***
*** 328,334 
  		prs->state = ptr;
  	}
  
! #ifdef TS_USE_WIDE
  	if (prs->wstr)
  		pfree(prs->wstr);
  #endif
--- 328,334 
  		prs->state = ptr;
  	}
  
! #ifdef USE_WIDE_UPPER_LOWER
  	if (prs->wstr)
  		pfree(prs->wstr);
  #endif
***
*** 344,350 
   * often are used for Asian languages
   */
  
! #ifdef TS_USE_WIDE
  
  #define p_iswhat(type)		\
  static int	\
--- 344,350 
   * often are used for Asian languages
   */
  
! #ifdef USE_WIDE_UPPER_LOWER
  
  #define p_iswhat(type)	

Re: [PATCHES] Simplify formatting.c

2008-06-14 Thread Alvaro Herrera
Bruce Momjian wrote:

> I moved str_initcap() over into oracle_compat.c and then had initcap()
> convert to/from TEXT to call it.  The code is a little weird because
> str_initcap() needs to convert to text to use texttowcs(), so in
> multibyte encodings initcap converts the string to text, then to char,
> then to text to call texttowcs().  I didn't see a cleaner way to do
> this.

Why not use wchar2char?  It seems there's room for extra cleanup here.

Also, the prototype of str_initcap in builtins.h looks out of place.

-- 
Alvaro Herrerahttp://www.CommandPrompt.com/
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-06-13 Thread Bruce Momjian
Tom Lane wrote:
> Euler Taveira de Oliveira <[EMAIL PROTECTED]> writes:
> > Tom Lane wrote:
> >> Also, it seems a bit inconsistent to be relying on
> >> oracle_compat.c for upper/lower but not initcap.
> >> 
> > I saw this inconsistence while I'm doing the patch. What about moving 
> > that upper/lower/initcap and wcs* code to another file. pg_locale.c? 
> 
> That doesn't seem a particularly appropriate place for them.  pg_locale
> is about dealing with the locale state, not about doing actual
> operations based on the locale data.
> 
> I was just thinking of having oracle_compat expose an initcap routine.

You mean like the attached?

I moved str_initcap() over into oracle_compat.c and then had initcap()
convert to/from TEXT to call it.  The code is a little weird because
str_initcap() needs to convert to text to use texttowcs(), so in
multibyte encodings initcap converts the string to text, then to char,
then to text to call texttowcs().  I didn't see a cleaner way to do
this.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/utils/adt/formatting.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v
retrieving revision 1.141
diff -c -c -r1.141 formatting.c
*** src/backend/utils/adt/formatting.c	20 May 2008 01:41:02 -	1.141
--- src/backend/utils/adt/formatting.c	13 Jun 2008 22:01:18 -
***
*** 927,933 
  static int	strdigits_len(char *str);
  static char *str_toupper(char *buff);
  static char *str_tolower(char *buff);
- static char *str_initcap(char *buff);
  
  static int	seq_search(char *name, char **array, int type, int max, int *len);
  static void do_to_timestamp(text *date_txt, text *fmt,
--- 927,932 
***
*** 1484,1549 
  }

  /* --
-  * wide-character-aware initcap function
-  * --
-  */
- static char *
- str_initcap(char *buff)
- {
- 	char		*result;
- 	bool		wasalnum = false;
- 
- 	if (!buff)
- 		return NULL;
- 
- #ifdef USE_WIDE_UPPER_LOWER
- 	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
- 	{
- 		wchar_t		*workspace;
- 		text		*in_text;
- 		text		*out_text;
- 		int			i;
- 
- 		in_text = cstring_to_text(buff);
- 		workspace = texttowcs(in_text);
- 
- 		for (i = 0; workspace[i] != 0; i++)
- 		{
- 			if (wasalnum)
- workspace[i] = towlower(workspace[i]);
- 			else
- workspace[i] = towupper(workspace[i]);
- 			wasalnum = iswalnum(workspace[i]);
- 		}
- 
- 		out_text = wcstotext(workspace, i);
- 		result = text_to_cstring(out_text);
- 
- 		pfree(workspace);
- 		pfree(in_text);
- 		pfree(out_text);
- 	}
- 	else
- #endif		/* USE_WIDE_UPPER_LOWER */
- 	{
- 		char *p;
- 
- 		result = pstrdup(buff);
- 
- 		for (p = result; *p; p++)
- 		{
- 			if (wasalnum)
- *p = pg_tolower((unsigned char) *p);
- 			else
- *p = pg_toupper((unsigned char) *p);
- 			wasalnum = isalnum((unsigned char) *p);
- 		}
- 	}
- 
- 	return result;
- }
- 
- /* --
   * Sequential search with to upper/lower conversion
   * --
   */
--- 1483,1488 
Index: src/backend/utils/adt/oracle_compat.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v
retrieving revision 1.79
diff -c -c -r1.79 oracle_compat.c
*** src/backend/utils/adt/oracle_compat.c	19 May 2008 18:08:16 -	1.79
--- src/backend/utils/adt/oracle_compat.c	13 Jun 2008 22:01:18 -
***
*** 471,478 
  Datum
  initcap(PG_FUNCTION_ARGS)
  {
! #ifdef USE_WIDE_UPPER_LOWER
  
  	/*
  	 * Use wide char code only when max encoding length > 1 and ctype != C.
  	 * Some operating systems fail with multi-byte encodings and a C locale.
--- 471,496 
  Datum
  initcap(PG_FUNCTION_ARGS)
  {
! 	text	   *string = PG_GETARG_TEXT_PP(0);
! 	char	   *str2;
! 
! 	str2 = str_initcap(DatumGetCString(string));
! 	string = cstring_to_text(str2);
! 	pfree(str2);
! 	PG_RETURN_TEXT_P(string);
! }
! 
  
+ char *
+ str_initcap(char *str)
+ {
+ 	char		*result;
+ 	int			wasalnum = 0;
+ 
+ 	if (!str)
+ 		return NULL;
+ 
+ #ifdef USE_WIDE_UPPER_LOWER
  	/*
  	 * Use wide char code only when max encoding length > 1 and ctype != C.
  	 * Some operating systems fail with multi-byte encodings and a C locale.
***
*** 480,492 
  	 */
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
  	{
- 		text	   *string = PG_GETARG_TEXT_PP(0);
- 		text	   *result;
  		wchar_t*workspace;
! 		int			wasalnum = 0;
  		int			i;
  
! 		workspace = texttowcs(string);
  
  		for (i = 0; workspace[i] != 0; i++)
  		{
--- 498,510 
  	 */
  	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
  	{
  		wchar_t*workspace;
! 		text		*in_text;
! 		text		*out_text;
  		int			i;
  
! 		in_text = cstring_to_text(str);
! 		workspace = texttowc

Re: [PATCHES] Simplify formatting.c

2008-05-20 Thread Tom Lane
Euler Taveira de Oliveira <[EMAIL PROTECTED]> writes:
> Tom Lane wrote:
>> Also, it seems a bit inconsistent to be relying on
>> oracle_compat.c for upper/lower but not initcap.
>> 
> I saw this inconsistence while I'm doing the patch. What about moving 
> that upper/lower/initcap and wcs* code to another file. pg_locale.c? 

That doesn't seem a particularly appropriate place for them.  pg_locale
is about dealing with the locale state, not about doing actual
operations based on the locale data.

I was just thinking of having oracle_compat expose an initcap routine.

regards, tom lane

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-05-20 Thread Euler Taveira de Oliveira

Tom Lane wrote:


Also, it seems a bit inconsistent to be relying on
oracle_compat.c for upper/lower but not initcap.

I saw this inconsistence while I'm doing the patch. What about moving 
that upper/lower/initcap and wcs* code to another file. pg_locale.c? 
BTW, formatting.c and oracle_compat.c already include pg_locale.h.



--
  Euler Taveira de Oliveira
  http://www.timbira.com/

--
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


Re: [PATCHES] Simplify formatting.c

2008-05-19 Thread Tom Lane
Bruce Momjian <[EMAIL PROTECTED]> writes:
> Now that upper/lower/initcase do not modify the passed string, I have
> simplified formatting.c with the attached patch.

I was thinking the same thing while reading the patch.  But please,
make str_toupper() and friends declare their argument "const" if you're
going to do this.

Another issue in this area is that these functions could do with some
refactoring to eliminate useless text/cstring conversions; I'm pretty
sure some code paths are doing cstring->text->cstring in direct
succession.  Also, it seems a bit inconsistent to be relying on
oracle_compat.c for upper/lower but not initcap.

regards, tom lane

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches


[PATCHES] Simplify formatting.c

2008-05-19 Thread Bruce Momjian
Now that upper/lower/initcase do not modify the passed string, I have
simplified formatting.c with the attached patch.

-- 
  Bruce Momjian  <[EMAIL PROTECTED]>http://momjian.us
  EnterpriseDB http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/utils/adt/formatting.c
===
RCS file: /cvsroot/pgsql/src/backend/utils/adt/formatting.c,v
retrieving revision 1.140
diff -c -c -r1.140 formatting.c
*** src/backend/utils/adt/formatting.c	19 May 2008 18:08:15 -	1.140
--- src/backend/utils/adt/formatting.c	20 May 2008 01:37:23 -
***
*** 1894,1903 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_full_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_toupper(workbuff));
! }
  else
  {
  	strcpy(workbuff, months_full[tm->tm_mon - 1]);
--- 1894,1900 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_toupper(localized_full_months[tm->tm_mon - 1]));
  else
  {
  	strcpy(workbuff, months_full[tm->tm_mon - 1]);
***
*** 1910,1923 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_full_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_initcap(workbuff));
! }
  else
- {
  	sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]);
- }
  s += strlen(s);
  break;
  			case DCH_month:
--- 1907,1915 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_initcap(localized_full_months[tm->tm_mon - 1]));
  else
  	sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]);
  s += strlen(s);
  break;
  			case DCH_month:
***
*** 1925,1934 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_full_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_tolower(workbuff));
! }
  else
  {
  	sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]);
--- 1917,1923 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_tolower(localized_full_months[tm->tm_mon - 1]));
  else
  {
  	sprintf(s, "%*s", S_FM(n->suffix) ? 0 : -9, months_full[tm->tm_mon - 1]);
***
*** 1941,1955 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_toupper(workbuff));
! }
  else
! {
! 	strcpy(workbuff, months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_toupper(workbuff));
! }
  s += strlen(s);
  break;
  			case DCH_Mon:
--- 1930,1938 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_toupper(localized_abbrev_months[tm->tm_mon - 1]));
  else
! 	strcpy(s, str_toupper(months[tm->tm_mon - 1]));
  s += strlen(s);
  break;
  			case DCH_Mon:
***
*** 1957,1970 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_initcap(workbuff));
! }
  else
- {
  	strcpy(s, months[tm->tm_mon - 1]);
- }
  s += strlen(s);
  break;
  			case DCH_mon:
--- 1940,1948 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_initcap(localized_abbrev_months[tm->tm_mon - 1]));
  else
  	strcpy(s, months[tm->tm_mon - 1]);
  s += strlen(s);
  break;
  			case DCH_mon:
***
*** 1972,1981 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_abbrev_months[tm->tm_mon - 1]);
! 	sprintf(s, "%*s", 0, str_tolower(workbuff));
! }
  else
  {
  	strcpy(s, months[tm->tm_mon - 1]);
--- 1950,1956 
  if (!tm->tm_mon)
  	break;
  if (S_TM(n->suffix))
! 	strcpy(s, str_tolower(localized_abbrev_months[tm->tm_mon - 1]));
  else
  {
  	strcpy(s, months[tm->tm_mon - 1]);
***
*** 1992,2001 
  			case DCH_DAY:
  INVALID_FOR_INTERVAL;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_full_days[tm->tm_wday]);
! 	sprintf(s, "%*s", 0, str_toupper(workbuff));
! }
  else
  {
  	strcpy(workbuff, days[tm->tm_wday]);
--- 1967,1973 
  			case DCH_DAY:
  INVALID_FOR_INTERVAL;
  if (S_TM(n->suffix))
! 	strcpy(s, str_toupper(localized_full_days[tm->tm_wday]));
  else
  {
  	strcpy(workbuff, days[tm->tm_wday]);
***
*** 2006,2028 
  			case DCH_Day:
  INVALID_FOR_INTERVAL;
  if (S_TM(n->suffix))
! {
! 	strcpy(workbuff, localized_full_days[tm->t