I wrote: > The answer, on a reasonably new desktop machine (2.0GHz Xeon E5503) > running Fedora 16 in en_US.utf8 locale, is that 64K iterations of > pg_wc_isalpha or sibling functions requires a shade under 2ms. > So this definitely justifies caching the values to avoid computing > them more than once per session, but I'm not convinced there are > grounds for trying harder than that.
And here's a poorly-tested draft patch for that. regards, tom lane
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c index fb6f06b5243f50bfad2cefa5c016d4e842791a3d..98f3c597678b492dd59afcd956e5cdfecdba4f86 100644 *** a/src/backend/regex/regc_cvec.c --- b/src/backend/regex/regc_cvec.c *************** static void *** 77,82 **** --- 77,83 ---- addchr(struct cvec * cv, /* character vector */ chr c) /* character to add */ { + assert(cv->nchrs < cv->chrspace); cv->chrs[cv->nchrs++] = (chr) c; } diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 6cf27958b1545a61fba01e76dc4d37aca32789dc..44ce582bdad1a7d830d4122cada45a39c188981c 100644 *** a/src/backend/regex/regc_locale.c --- b/src/backend/regex/regc_locale.c *************** static const struct cname *** 351,356 **** --- 351,366 ---- /* + * We do not use the hard-wired Unicode classification tables that Tcl does. + * This is because (a) we need to deal with other encodings besides Unicode, + * and (b) we want to track the behavior of the libc locale routines as + * closely as possible. For example, it wouldn't be unreasonable for a + * locale to not consider every Unicode letter as a letter. So we build + * character classification cvecs by asking libc, even for Unicode. + */ + + + /* * element - map collating-element name to celt */ static celt *************** cclass(struct vars * v, /* context */ *** 498,503 **** --- 508,514 ---- int cases) /* case-independent? */ { size_t len; + const struct cvec *ccv = NULL; struct cvec *cv = NULL; const char * const *namePtr; int i, *************** cclass(struct vars * v, /* context */ *** 549,626 **** /* * Now compute the character class contents. - * - * For the moment, assume that only char codes < 256 can be in these - * classes. */ switch ((enum classes) index) { case CC_PRINT: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isprint((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_ALNUM: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isalnum((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_ALPHA: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isalpha((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_ASCII: cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: ! cv = getcvec(v, 0, 1); ! if (cv) ! addrange(cv, (chr) '0', (chr) '9'); break; case CC_PUNCT: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_ispunct((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_XDIGIT: cv = getcvec(v, 0, 3); if (cv) { --- 560,608 ---- /* * Now compute the character class contents. */ switch ((enum classes) index) { case CC_PRINT: ! ccv = pg_ctype_get_cache(pg_wc_isprint); break; case CC_ALNUM: ! ccv = pg_ctype_get_cache(pg_wc_isalnum); break; case CC_ALPHA: ! ccv = pg_ctype_get_cache(pg_wc_isalpha); break; case CC_ASCII: + /* hard-wired meaning */ cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: + /* hard-wired meaning */ cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: + /* hard-wired meaning */ cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: ! ccv = pg_ctype_get_cache(pg_wc_isdigit); break; case CC_PUNCT: ! ccv = pg_ctype_get_cache(pg_wc_ispunct); break; case CC_XDIGIT: + /* + * It's not clear how to define this in non-western locales, and + * even less clear that there's any particular use in trying. + * So just hard-wire the meaning. + */ cv = getcvec(v, 0, 3); if (cv) { *************** cclass(struct vars * v, /* context */ *** 630,679 **** } break; case CC_SPACE: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isspace((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_LOWER: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_islower((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_UPPER: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isupper((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; case CC_GRAPH: ! cv = getcvec(v, UCHAR_MAX, 0); ! if (cv) ! { ! for (i = 0; i <= UCHAR_MAX; i++) ! { ! if (pg_wc_isgraph((chr) i)) ! addchr(cv, (chr) i); ! } ! } break; } if (cv == NULL) ERR(REG_ESPACE); return cv; --- 612,643 ---- } break; case CC_SPACE: ! ccv = pg_ctype_get_cache(pg_wc_isspace); break; case CC_LOWER: ! ccv = pg_ctype_get_cache(pg_wc_islower); break; case CC_UPPER: ! ccv = pg_ctype_get_cache(pg_wc_isupper); break; case CC_GRAPH: ! ccv = pg_ctype_get_cache(pg_wc_isgraph); break; } + + /* If we got a cached cvec, copy it to a freeable one. */ + if (ccv != NULL) + { + cv = getcvec(v, ccv->nchrs, ccv->nranges); + if (cv) + { + cv->nchrs = ccv->nchrs; + memcpy(cv->chrs, ccv->chrs, cv->nchrs * sizeof(chr)); + cv->nranges = ccv->nranges; + memcpy(cv->ranges, ccv->ranges, cv->nranges * sizeof(chr) * 2); + } + } + if (cv == NULL) ERR(REG_ESPACE); return cv; diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 7c010e372858065bedbc47382a84cb3650e03efa..9ea7a81b176de59ed989238b1849672108374d97 100644 *** a/src/backend/regex/regc_pg_locale.c --- b/src/backend/regex/regc_pg_locale.c *************** *** 1,7 **** /*------------------------------------------------------------------------- * * regc_pg_locale.c ! * ctype functions adapted to work on pg_wchar (a/k/a chr) * * This file is #included by regcomp.c; it's not meant to compile standalone. * --- 1,8 ---- /*------------------------------------------------------------------------- * * regc_pg_locale.c ! * ctype functions adapted to work on pg_wchar (a/k/a chr), ! * and functions to cache the results of wholesale ctype probing. * * This file is #included by regcomp.c; it's not meant to compile standalone. * *************** typedef enum *** 72,77 **** --- 73,79 ---- static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; + static Oid pg_regex_collation; /* * Hard-wired character properties for C locale *************** pg_set_regex_collation(Oid collation) *** 233,238 **** --- 235,241 ---- /* C/POSIX collations use this path regardless of database encoding */ pg_regex_strategy = PG_REGEX_LOCALE_C; pg_regex_locale = 0; + pg_regex_collation = C_COLLATION_OID; } else { *************** pg_set_regex_collation(Oid collation) *** 275,280 **** --- 278,285 ---- else pg_regex_strategy = PG_REGEX_LOCALE_1BYTE; } + + pg_regex_collation = collation; } } *************** pg_wc_tolower(pg_wchar c) *** 656,658 **** --- 661,873 ---- } return 0; /* can't get here, but keep compiler quiet */ } + + + /* + * These functions cache the results of probing libc's ctype behavior for + * all character codes of interest in a given encoding/collation. The + * result is provided as a "struct cvec", but notice that the representation + * is a touch different from a cvec created by regc_cvec.c: we allocate the + * chrs[] and ranges[] arrays separately from the struct so that we can + * realloc them larger at need. This is okay since the cvecs made here + * should never be freed by freecvec(). + * + * We use malloc not palloc since we mustn't lose control on out-of-memory; + * the main regex code expects us to return a failure indication instead. + */ + + typedef int (*pg_wc_probefunc) (pg_wchar c); + + typedef struct pg_ctype_cache + { + pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */ + Oid collation; /* collation this entry is for */ + struct cvec cv; /* cache entry contents */ + struct pg_ctype_cache *next; /* chain link */ + } pg_ctype_cache; + + static pg_ctype_cache *pg_ctype_cache_list = NULL; + + /* + * Add a chr or range to pcc->cv; return false if run out of memory + */ + static bool + store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) + { + chr *newchrs; + + if (nchrs > 1) + { + if (pcc->cv.nranges >= pcc->cv.rangespace) + { + pcc->cv.rangespace *= 2; + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.rangespace * sizeof(chr) * 2); + if (newchrs == NULL) + return false; + pcc->cv.ranges = newchrs; + } + pcc->cv.ranges[pcc->cv.nranges * 2] = chr1; + pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1; + pcc->cv.nranges++; + } + else + { + assert(nchrs == 1); + if (pcc->cv.nchrs >= pcc->cv.chrspace) + { + pcc->cv.chrspace *= 2; + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.chrspace * sizeof(chr)); + if (newchrs == NULL) + return false; + pcc->cv.chrs = newchrs; + } + pcc->cv.chrs[pcc->cv.nchrs++] = chr1; + } + return true; + } + + /* + * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all + * chrs satisfying the probe function. The active collation is the one + * previously set by pg_set_regex_collation. Returns NULL if out of memory. + * + * Note that the result must NOT be freed or modified by caller. + */ + static const struct cvec * + pg_ctype_get_cache(pg_wc_probefunc probefunc) + { + pg_ctype_cache *pcc; + pg_wchar max_chr; + pg_wchar cur_chr; + int seq; + chr *newchrs; + + /* + * Do we already have the answer cached? + */ + for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next) + { + if (pcc->probefunc == probefunc && + pcc->collation == pg_regex_collation) + return &pcc->cv; + } + + /* + * Nope, so initialize some workspace ... + */ + pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache)); + if (pcc == NULL) + return NULL; + pcc->probefunc = probefunc; + pcc->collation = pg_regex_collation; + pcc->cv.nchrs = 0; + pcc->cv.chrspace = 256; + pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr)); + pcc->cv.nranges = 0; + pcc->cv.rangespace = 256; + pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); + if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) + goto out_of_memory; + + /* + * Decide how many character codes we ought to look through. For C locale + * there's no need to go further than 127. Otherwise, if the encoding is + * UTF8 go up to 0xFFFF (the end of the Basic Multilingual Plane). + * Otherwise, go up to 255. These limits are interrelated with + * restrictions discussed at the head of this file. + */ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + max_chr = (pg_wchar) 127; + break; + case PG_REGEX_LOCALE_WIDE: + case PG_REGEX_LOCALE_WIDE_L: + max_chr = (pg_wchar) 0xFFFF; + break; + case PG_REGEX_LOCALE_1BYTE: + case PG_REGEX_LOCALE_1BYTE_L: + max_chr = (pg_wchar) UCHAR_MAX; + break; + default: + max_chr = 0; /* can't get here, but keep compiler quiet */ + break; + } + + /* + * And scan 'em ... + */ + seq = 0; /* number of consecutive matches */ + + for (cur_chr = 0; cur_chr <= max_chr; cur_chr++) + { + if ((*probefunc) (cur_chr)) + seq++; + else if (seq > 0) + { + if (!store_match(pcc, cur_chr - seq, seq)) + goto out_of_memory; + seq = 0; + } + } + + if (seq > 0) + if (!store_match(pcc, cur_chr - seq, seq)) + goto out_of_memory; + + /* + * We might have allocated more memory than needed, if so free it + */ + if (pcc->cv.nchrs == 0) + { + free(pcc->cv.chrs); + pcc->cv.chrs = NULL; + pcc->cv.chrspace = 0; + } + else if (pcc->cv.nchrs < pcc->cv.chrspace) + { + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.nchrs * sizeof(chr)); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.chrs = newchrs; + pcc->cv.chrspace = pcc->cv.nchrs; + } + if (pcc->cv.nranges == 0) + { + free(pcc->cv.ranges); + pcc->cv.ranges = NULL; + pcc->cv.rangespace = 0; + } + else if (pcc->cv.nranges < pcc->cv.rangespace) + { + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.nranges * sizeof(chr) * 2); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.ranges = newchrs; + pcc->cv.rangespace = pcc->cv.nranges; + } + + /* + * Success, link it into cache chain + */ + pcc->next = pg_ctype_cache_list; + pg_ctype_cache_list = pcc; + + return &pcc->cv; + + /* + * Failure, clean up + */ + out_of_memory: + if (pcc->cv.chrs) + free(pcc->cv.chrs); + if (pcc->cv.ranges) + free(pcc->cv.ranges); + free(pcc); + + return NULL; + }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers