On Fri, 2014-07-11 at 11:51 -0400, Tom Lane wrote: > Jeff Davis <pg...@j-davis.com> writes: > > Attached is a small patch to $SUBJECT. > > In master, only single-byte characters are allowed as an escape. Of > > course, with the patch it must still be a single character, but it may > > be multi-byte. > > I'm concerned about the performance cost of this patch. Have you done > any measurements about what kind of overhead you are putting on the > inner loop of similar_escape?
I didn't consider this very performance critical, because this is looping through the pattern, which I wouldn't expect to be a long string. On my machine using en_US.UTF-8, the difference is imperceptible for a SIMILAR TO ... ESCAPE query. I was able to see about a 2% increase in runtime when using the similar_escape function directly. I made a 10M tuple table and did: explain analyze select similar_escape('ΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ','#') from t; which was the worst reasonable case I could think of. (It appears that selecting from a table is faster than from generate_series. I'm curious what you use when testing the performance of an individual function at the SQL level.) > At the very least, please don't call GetDatabaseEncoding() over again > every single time through the inner loop. More generally, why do we > need to use pg_encoding_verifymb? The input data is presumably validly > encoded. ISTM the significantly cheaper pg_mblen() would be more > appropriate. Thank you. Using the non-verifying variants reduces the penalty in the above test to 1%. If needed, we could optimize further through code specialization, as like_escape() does. Though I think like_escape() is specialized just because MatchText() is specialized; and MatchText is specialized because it operates on the actual data, not just the pattern. So I don't see a reason to specialize similar_escape(). Regards, Jeff Davis
*** a/src/backend/utils/adt/regexp.c --- b/src/backend/utils/adt/regexp.c *************** *** 688,698 **** similar_escape(PG_FUNCTION_ARGS) elen = VARSIZE_ANY_EXHDR(esc_text); if (elen == 0) e = NULL; /* no escape character */ ! else if (elen != 1) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), ! errmsg("invalid escape string"), ! errhint("Escape string must be empty or one character."))); } /*---------- --- 688,703 ---- elen = VARSIZE_ANY_EXHDR(esc_text); if (elen == 0) e = NULL; /* no escape character */ ! else ! { ! int escape_mblen = pg_mbstrlen_with_len(e, elen); ! ! if (escape_mblen > 1) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), ! errmsg("invalid escape string"), ! errhint("Escape string must be empty or one character."))); ! } } /*---------- *************** *** 722,781 **** similar_escape(PG_FUNCTION_ARGS) while (plen > 0) { ! char pchar = *p; ! if (afterescape) { ! if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */ ! *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; ! else { *r++ = '\\'; *r++ = pchar; } ! afterescape = false; ! } ! else if (e && pchar == *e) ! { ! /* SQL99 escape character; do not send to output */ ! afterescape = true; } ! else if (incharclass) { ! if (pchar == '\\') *r++ = '\\'; ! *r++ = pchar; ! if (pchar == ']') ! incharclass = false; ! } ! else if (pchar == '[') ! { ! *r++ = pchar; ! incharclass = true; ! } ! else if (pchar == '%') ! { ! *r++ = '.'; ! *r++ = '*'; ! } ! else if (pchar == '_') ! *r++ = '.'; ! else if (pchar == '(') ! { ! /* convert to non-capturing parenthesis */ ! *r++ = '('; ! *r++ = '?'; ! *r++ = ':'; ! } ! else if (pchar == '\\' || pchar == '.' || ! pchar == '^' || pchar == '$') ! { ! *r++ = '\\'; ! *r++ = pchar; } - else - *r++ = pchar; - p++, plen--; } *r++ = ')'; --- 727,813 ---- while (plen > 0) { ! char pchar = *p; ! int mblen = pg_mblen(p); ! if (mblen == 1) { ! if (afterescape) ! { ! if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */ ! *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; ! else ! { ! *r++ = '\\'; ! *r++ = pchar; ! } ! afterescape = false; ! } ! else if (e && pchar == *e) ! { ! /* SQL99 escape character; do not send to output */ ! afterescape = true; ! } ! else if (incharclass) ! { ! if (pchar == '\\') ! *r++ = '\\'; ! *r++ = pchar; ! if (pchar == ']') ! incharclass = false; ! } ! else if (pchar == '[') ! { ! *r++ = pchar; ! incharclass = true; ! } ! else if (pchar == '%') ! { ! *r++ = '.'; ! *r++ = '*'; ! } ! else if (pchar == '_') ! *r++ = '.'; ! else if (pchar == '(') ! { ! /* convert to non-capturing parenthesis */ ! *r++ = '('; ! *r++ = '?'; ! *r++ = ':'; ! } ! else if (pchar == '\\' || pchar == '.' || ! pchar == '^' || pchar == '$') { *r++ = '\\'; *r++ = pchar; } ! else ! *r++ = pchar; ! p++, plen--; } ! else { ! if (afterescape) ! { *r++ = '\\'; ! memcpy(r, p, mblen); ! r += mblen; ! afterescape = false; ! } ! else if (e && elen == mblen && memcmp(e, p, mblen) == 0) ! { ! /* SQL99 escape character; do not send to output */ ! afterescape = true; ! } ! else ! { ! memcpy(r, p, mblen); ! r += mblen; ! } ! ! p += mblen; ! plen -= mblen; } } *r++ = ')';
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers