Re: [HACKERS] Notes about fixing regexes and UTF-8 (yet again)

Tom Lane Fri, 17 Feb 2012 18:18:18 -0800

I wrote:
> The answer, on a reasonably new desktop machine (2.0GHz Xeon E5503)
> running Fedora 16 in en_US.utf8 locale, is that 64K iterations of
> pg_wc_isalpha or sibling functions requires a shade under 2ms.
> So this definitely justifies caching the values to avoid computing
> them more than once per session, but I'm not convinced there are
> grounds for trying harder than that.


And here's a poorly-tested draft patch for that.

                        regards, tom lane

diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
index fb6f06b5243f50bfad2cefa5c016d4e842791a3d..98f3c597678b492dd59afcd956e5cdfecdba4f86 100644
*** a/src/backend/regex/regc_cvec.c
--- b/src/backend/regex/regc_cvec.c
*************** static void
*** 77,82 ****
--- 77,83 ----
  addchr(struct cvec * cv,		/* character vector */
  	   chr c)					/* character to add */
  {
+ 	assert(cv->nchrs < cv->chrspace);
  	cv->chrs[cv->nchrs++] = (chr) c;
  }
  
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 6cf27958b1545a61fba01e76dc4d37aca32789dc..44ce582bdad1a7d830d4122cada45a39c188981c 100644
*** a/src/backend/regex/regc_locale.c
--- b/src/backend/regex/regc_locale.c
*************** static const struct cname
*** 351,356 ****
--- 351,366 ----
  
  
  /*
+  * We do not use the hard-wired Unicode classification tables that Tcl does.
+  * This is because (a) we need to deal with other encodings besides Unicode,
+  * and (b) we want to track the behavior of the libc locale routines as
+  * closely as possible.  For example, it wouldn't be unreasonable for a
+  * locale to not consider every Unicode letter as a letter.  So we build
+  * character classification cvecs by asking libc, even for Unicode.
+  */
+ 
+ 
+ /*
   * element - map collating-element name to celt
   */
  static celt
*************** cclass(struct vars * v,			/* context */
*** 498,503 ****
--- 508,514 ----
  	   int cases)				/* case-independent? */
  {
  	size_t		len;
+ 	const struct cvec *ccv = NULL;
  	struct cvec *cv = NULL;
  	const char * const *namePtr;
  	int			i,
*************** cclass(struct vars * v,			/* context */
*** 549,626 ****
  
  	/*
  	 * Now compute the character class contents.
- 	 *
- 	 * For the moment, assume that only char codes < 256 can be in these
- 	 * classes.
  	 */
  
  	switch ((enum classes) index)
  	{
  		case CC_PRINT:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isprint((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_ALNUM:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isalnum((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_ALPHA:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isalpha((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_ASCII:
  			cv = getcvec(v, 0, 1);
  			if (cv)
  				addrange(cv, 0, 0x7f);
  			break;
  		case CC_BLANK:
  			cv = getcvec(v, 2, 0);
  			addchr(cv, '\t');
  			addchr(cv, ' ');
  			break;
  		case CC_CNTRL:
  			cv = getcvec(v, 0, 2);
  			addrange(cv, 0x0, 0x1f);
  			addrange(cv, 0x7f, 0x9f);
  			break;
  		case CC_DIGIT:
! 			cv = getcvec(v, 0, 1);
! 			if (cv)
! 				addrange(cv, (chr) '0', (chr) '9');
  			break;
  		case CC_PUNCT:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_ispunct((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_XDIGIT:
  			cv = getcvec(v, 0, 3);
  			if (cv)
  			{
--- 560,608 ----
  
  	/*
  	 * Now compute the character class contents.
  	 */
  
  	switch ((enum classes) index)
  	{
  		case CC_PRINT:
! 			ccv = pg_ctype_get_cache(pg_wc_isprint);
  			break;
  		case CC_ALNUM:
! 			ccv = pg_ctype_get_cache(pg_wc_isalnum);
  			break;
  		case CC_ALPHA:
! 			ccv = pg_ctype_get_cache(pg_wc_isalpha);
  			break;
  		case CC_ASCII:
+ 			/* hard-wired meaning */
  			cv = getcvec(v, 0, 1);
  			if (cv)
  				addrange(cv, 0, 0x7f);
  			break;
  		case CC_BLANK:
+ 			/* hard-wired meaning */
  			cv = getcvec(v, 2, 0);
  			addchr(cv, '\t');
  			addchr(cv, ' ');
  			break;
  		case CC_CNTRL:
+ 			/* hard-wired meaning */
  			cv = getcvec(v, 0, 2);
  			addrange(cv, 0x0, 0x1f);
  			addrange(cv, 0x7f, 0x9f);
  			break;
  		case CC_DIGIT:
! 			ccv = pg_ctype_get_cache(pg_wc_isdigit);
  			break;
  		case CC_PUNCT:
! 			ccv = pg_ctype_get_cache(pg_wc_ispunct);
  			break;
  		case CC_XDIGIT:
+ 			/*
+ 			 * It's not clear how to define this in non-western locales, and
+ 			 * even less clear that there's any particular use in trying.
+ 			 * So just hard-wire the meaning.
+ 			 */
  			cv = getcvec(v, 0, 3);
  			if (cv)
  			{
*************** cclass(struct vars * v,			/* context */
*** 630,679 ****
  			}
  			break;
  		case CC_SPACE:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isspace((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_LOWER:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_islower((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_UPPER:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isupper((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  		case CC_GRAPH:
! 			cv = getcvec(v, UCHAR_MAX, 0);
! 			if (cv)
! 			{
! 				for (i = 0; i <= UCHAR_MAX; i++)
! 				{
! 					if (pg_wc_isgraph((chr) i))
! 						addchr(cv, (chr) i);
! 				}
! 			}
  			break;
  	}
  	if (cv == NULL)
  		ERR(REG_ESPACE);
  	return cv;
--- 612,643 ----
  			}
  			break;
  		case CC_SPACE:
! 			ccv = pg_ctype_get_cache(pg_wc_isspace);
  			break;
  		case CC_LOWER:
! 			ccv = pg_ctype_get_cache(pg_wc_islower);
  			break;
  		case CC_UPPER:
! 			ccv = pg_ctype_get_cache(pg_wc_isupper);
  			break;
  		case CC_GRAPH:
! 			ccv = pg_ctype_get_cache(pg_wc_isgraph);
  			break;
  	}
+ 
+ 	/* If we got a cached cvec, copy it to a freeable one. */
+ 	if (ccv != NULL)
+ 	{
+ 		cv = getcvec(v, ccv->nchrs, ccv->nranges);
+ 		if (cv)
+ 		{
+ 			cv->nchrs = ccv->nchrs;
+ 			memcpy(cv->chrs, ccv->chrs, cv->nchrs * sizeof(chr));
+ 			cv->nranges = ccv->nranges;
+ 			memcpy(cv->ranges, ccv->ranges, cv->nranges * sizeof(chr) * 2);
+ 		}
+ 	}
+ 
  	if (cv == NULL)
  		ERR(REG_ESPACE);
  	return cv;
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 7c010e372858065bedbc47382a84cb3650e03efa..9ea7a81b176de59ed989238b1849672108374d97 100644
*** a/src/backend/regex/regc_pg_locale.c
--- b/src/backend/regex/regc_pg_locale.c
***************
*** 1,7 ****
  /*-------------------------------------------------------------------------
   *
   * regc_pg_locale.c
!  *	  ctype functions adapted to work on pg_wchar (a/k/a chr)
   *
   * This file is #included by regcomp.c; it's not meant to compile standalone.
   *
--- 1,8 ----
  /*-------------------------------------------------------------------------
   *
   * regc_pg_locale.c
!  *	  ctype functions adapted to work on pg_wchar (a/k/a chr),
!  *	  and functions to cache the results of wholesale ctype probing.
   *
   * This file is #included by regcomp.c; it's not meant to compile standalone.
   *
*************** typedef enum
*** 72,77 ****
--- 73,79 ----
  
  static PG_Locale_Strategy pg_regex_strategy;
  static pg_locale_t pg_regex_locale;
+ static Oid	pg_regex_collation;
  
  /*
   * Hard-wired character properties for C locale
*************** pg_set_regex_collation(Oid collation)
*** 233,238 ****
--- 235,241 ----
  		/* C/POSIX collations use this path regardless of database encoding */
  		pg_regex_strategy = PG_REGEX_LOCALE_C;
  		pg_regex_locale = 0;
+ 		pg_regex_collation = C_COLLATION_OID;
  	}
  	else
  	{
*************** pg_set_regex_collation(Oid collation)
*** 275,280 ****
--- 278,285 ----
  			else
  				pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
  		}
+ 
+ 		pg_regex_collation = collation;
  	}
  }
  
*************** pg_wc_tolower(pg_wchar c)
*** 656,658 ****
--- 661,873 ----
  	}
  	return 0;					/* can't get here, but keep compiler quiet */
  }
+ 
+ 
+ /*
+  * These functions cache the results of probing libc's ctype behavior for
+  * all character codes of interest in a given encoding/collation.  The
+  * result is provided as a "struct cvec", but notice that the representation
+  * is a touch different from a cvec created by regc_cvec.c: we allocate the
+  * chrs[] and ranges[] arrays separately from the struct so that we can
+  * realloc them larger at need.  This is okay since the cvecs made here
+  * should never be freed by freecvec().
+  *
+  * We use malloc not palloc since we mustn't lose control on out-of-memory;
+  * the main regex code expects us to return a failure indication instead.
+  */
+ 
+ typedef int (*pg_wc_probefunc) (pg_wchar c);
+ 
+ typedef struct pg_ctype_cache
+ {
+ 	pg_wc_probefunc probefunc;		/* pg_wc_isalpha or a sibling */
+ 	Oid			collation;			/* collation this entry is for */
+ 	struct cvec cv;					/* cache entry contents */
+ 	struct pg_ctype_cache *next;	/* chain link */
+ } pg_ctype_cache;
+ 
+ static pg_ctype_cache *pg_ctype_cache_list = NULL;
+ 
+ /*
+  * Add a chr or range to pcc->cv; return false if run out of memory
+  */
+ static bool
+ store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
+ {
+ 	chr		   *newchrs;
+ 
+ 	if (nchrs > 1)
+ 	{
+ 		if (pcc->cv.nranges >= pcc->cv.rangespace)
+ 		{
+ 			pcc->cv.rangespace *= 2;
+ 			newchrs = (chr *) realloc(pcc->cv.ranges,
+ 									  pcc->cv.rangespace * sizeof(chr) * 2);
+ 			if (newchrs == NULL)
+ 				return false;
+ 			pcc->cv.ranges = newchrs;
+ 		}
+ 		pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
+ 		pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
+ 		pcc->cv.nranges++;
+ 	}
+ 	else
+ 	{
+ 		assert(nchrs == 1);
+ 		if (pcc->cv.nchrs >= pcc->cv.chrspace)
+ 		{
+ 			pcc->cv.chrspace *= 2;
+ 			newchrs = (chr *) realloc(pcc->cv.chrs,
+ 									  pcc->cv.chrspace * sizeof(chr));
+ 			if (newchrs == NULL)
+ 				return false;
+ 			pcc->cv.chrs = newchrs;
+ 		}
+ 		pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
+ 	}
+ 	return true;
+ }
+ 
+ /*
+  * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
+  * chrs satisfying the probe function.  The active collation is the one
+  * previously set by pg_set_regex_collation.  Returns NULL if out of memory.
+  *
+  * Note that the result must NOT be freed or modified by caller.
+  */
+ static const struct cvec *
+ pg_ctype_get_cache(pg_wc_probefunc probefunc)
+ {
+ 	pg_ctype_cache *pcc;
+ 	pg_wchar	max_chr;
+ 	pg_wchar	cur_chr;
+ 	int			seq;
+ 	chr		   *newchrs;
+ 
+ 	/*
+ 	 * Do we already have the answer cached?
+ 	 */
+ 	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
+ 	{
+ 		if (pcc->probefunc == probefunc &&
+ 			pcc->collation == pg_regex_collation)
+ 			return &pcc->cv;
+ 	}
+ 
+ 	/*
+ 	 * Nope, so initialize some workspace ...
+ 	 */
+ 	pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
+ 	if (pcc == NULL)
+ 		return NULL;
+ 	pcc->probefunc = probefunc;
+ 	pcc->collation = pg_regex_collation;
+ 	pcc->cv.nchrs = 0;
+ 	pcc->cv.chrspace = 256;
+ 	pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
+ 	pcc->cv.nranges = 0;
+ 	pcc->cv.rangespace = 256;
+ 	pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
+ 	if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
+ 		goto out_of_memory;
+ 
+ 	/*
+ 	 * Decide how many character codes we ought to look through.  For C locale
+ 	 * there's no need to go further than 127.  Otherwise, if the encoding is
+ 	 * UTF8 go up to 0xFFFF (the end of the Basic Multilingual Plane).
+ 	 * Otherwise, go up to 255.  These limits are interrelated with
+ 	 * restrictions discussed at the head of this file.
+ 	 */
+ 	switch (pg_regex_strategy)
+ 	{
+ 		case PG_REGEX_LOCALE_C:
+ 			max_chr = (pg_wchar) 127;
+ 			break;
+ 		case PG_REGEX_LOCALE_WIDE:
+ 		case PG_REGEX_LOCALE_WIDE_L:
+ 			max_chr = (pg_wchar) 0xFFFF;
+ 			break;
+ 		case PG_REGEX_LOCALE_1BYTE:
+ 		case PG_REGEX_LOCALE_1BYTE_L:
+ 			max_chr = (pg_wchar) UCHAR_MAX;
+ 			break;
+ 		default:
+ 			max_chr = 0;		/* can't get here, but keep compiler quiet */
+ 			break;
+ 	}
+ 
+ 	/*
+ 	 * And scan 'em ...
+ 	 */
+ 	seq = 0;					/* number of consecutive matches */
+ 
+ 	for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
+ 	{
+ 		if ((*probefunc) (cur_chr))
+ 			seq++;
+ 		else if (seq > 0)
+ 		{
+ 			if (!store_match(pcc, cur_chr - seq, seq))
+ 				goto out_of_memory;
+ 			seq = 0;
+ 		}
+ 	}
+ 
+ 	if (seq > 0)
+ 		if (!store_match(pcc, cur_chr - seq, seq))
+ 			goto out_of_memory;
+ 
+ 	/*
+ 	 * We might have allocated more memory than needed, if so free it
+ 	 */
+ 	if (pcc->cv.nchrs == 0)
+ 	{
+ 		free(pcc->cv.chrs);
+ 		pcc->cv.chrs = NULL;
+ 		pcc->cv.chrspace = 0;
+ 	}
+ 	else if (pcc->cv.nchrs < pcc->cv.chrspace)
+ 	{
+ 		newchrs = (chr *) realloc(pcc->cv.chrs,
+ 								  pcc->cv.nchrs * sizeof(chr));
+ 		if (newchrs == NULL)
+ 			goto out_of_memory;
+ 		pcc->cv.chrs = newchrs;
+ 		pcc->cv.chrspace = pcc->cv.nchrs;
+ 	}
+ 	if (pcc->cv.nranges == 0)
+ 	{
+ 		free(pcc->cv.ranges);
+ 		pcc->cv.ranges = NULL;
+ 		pcc->cv.rangespace = 0;
+ 	}
+ 	else if (pcc->cv.nranges < pcc->cv.rangespace)
+ 	{
+ 		newchrs = (chr *) realloc(pcc->cv.ranges,
+ 								  pcc->cv.nranges * sizeof(chr) * 2);
+ 		if (newchrs == NULL)
+ 			goto out_of_memory;
+ 		pcc->cv.ranges = newchrs;
+ 		pcc->cv.rangespace = pcc->cv.nranges;
+ 	}
+ 
+ 	/*
+ 	 * Success, link it into cache chain
+ 	 */
+ 	pcc->next = pg_ctype_cache_list;
+ 	pg_ctype_cache_list = pcc;
+ 
+ 	return &pcc->cv;
+ 
+ 	/*
+ 	 * Failure, clean up
+ 	 */
+ out_of_memory:
+ 	if (pcc->cv.chrs)
+ 		free(pcc->cv.chrs);
+ 	if (pcc->cv.ranges)
+ 		free(pcc->cv.ranges);
+ 	free(pcc);
+ 
+ 	return NULL;
+ }

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Notes about fixing regexes and UTF-8 (yet again)

Reply via email to