We have a couple of open bug reports in which the regex library takes unreasonable amounts of time and/or memory to process a regexp. While I have hopes that both of the known issues can be improved, it seems likely that there will always be cases where sufficiently complicated regexps just take too long. Right now, any such case is an effective DOS since the regex code contains no provisions for detecting a pending query cancel: the backend just locks up for however long it takes, unless it runs out of memory. I've resisted adding such provisions because I have hopes of making that code into a standalone library someday ... but I think it's time to capitulate. Arguably, other applications making use of a regex library could want query-cancel support too, so as long as there's an API for specifying a callback function to detect a pending query cancel, this doesn't seem any worse than some of the other things we've done to Spencer's code.
In the attached proposed patch, I've not actually invented such an API, but I have encapsulated the cancel check in an internal callback function, so that it wouldn't take much more work to add an API for setting a different callback function. I don't claim that the two places I added cancel checks necessarily provide 100% coverage of long code paths in the library; but they're pretty central to regex compilation and execution respectively, and they do stop the two known problem cases quite quickly. We can always add more checks if found necessary. Barring objection or better ideas, I propose to back-patch this into all live branches. regards, tom lane
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index ae2dbe4..f6dad01 100644 *** a/src/backend/regex/regc_nfa.c --- b/src/backend/regex/regc_nfa.c *************** newstate(struct nfa * nfa) *** 174,184 **** --- 174,196 ---- { struct state *s; + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return NULL; + } + if (TooManyStates(nfa)) { NERR(REG_ETOOBIG); return NULL; } + if (nfa->free != NULL) { s = nfa->free; diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index ca1ccc5..f10f7a8 100644 *** a/src/backend/regex/regcomp.c --- b/src/backend/regex/regcomp.c *************** *** 34,39 **** --- 34,41 ---- #include "regex/regguts.h" + #include "miscadmin.h" /* needed by rcancelrequested() */ + /* * forward declarations, up here so forward datatypes etc. are defined early */ *************** static long nfanode(struct vars *, struc *** 67,72 **** --- 69,75 ---- static int newlacon(struct vars *, struct state *, struct state *, int); static void freelacons(struct subre *, int); static void rfree(regex_t *); + static int rcancelrequested(void); #ifdef REG_DEBUG static void dump(regex_t *, FILE *); *************** struct vars *** 276,281 **** --- 279,285 ---- /* static function list */ static const struct fns functions = { rfree, /* regfree insides */ + rcancelrequested /* check for cancel request */ }; *************** rfree(regex_t *re) *** 1893,1898 **** --- 1897,1918 ---- } } + /* + * rcancelrequested - check for external request to cancel regex operation + * + * Return nonzero to fail the operation with error code REG_CANCEL, + * zero to keep going + * + * The current implementation is Postgres-specific. If we ever get around + * to splitting this out as a standalone library, there will need to be some + * API to let applications provide a callback function for this. + */ + static int + rcancelrequested(void) + { + return InterruptPending && (QueryCancelPending || ProcDiePending); + } + #ifdef REG_DEBUG /* diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 78ebdee..0edb83c 100644 *** a/src/backend/regex/regexec.c --- b/src/backend/regex/regexec.c *************** cdissect(struct vars * v, *** 595,600 **** --- 595,604 ---- assert(t != NULL); MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); + /* handy place to check for operation cancel */ + if (CANCEL_REQUESTED(v->re)) + return REG_CANCEL; + switch (t->op) { case '=': /* terminal node */ diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 588052c..7c5b0d5 100644 *** a/src/backend/utils/adt/regexp.c --- b/src/backend/utils/adt/regexp.c *************** *** 31,36 **** --- 31,37 ---- #include "catalog/pg_type.h" #include "funcapi.h" + #include "miscadmin.h" #include "regex/regex.h" #include "utils/array.h" #include "utils/builtins.h" *************** RE_compile_and_cache(text *text_re, int *** 188,193 **** --- 189,203 ---- if (regcomp_result != REG_OKAY) { /* re didn't compile (no need for pg_regfree, if so) */ + + /* + * Here and in other places in this file, do CHECK_FOR_INTERRUPTS + * before reporting a regex error. This is so that if the regex + * library aborts and returns REG_CANCEL, we don't print an error + * message that implies the regex was invalid. + */ + CHECK_FOR_INTERRUPTS(); + pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), *************** RE_wchar_execute(regex_t *re, pg_wchar * *** 268,273 **** --- 278,284 ---- if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ + CHECK_FOR_INTERRUPTS(); pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), *************** regexp_fixed_prefix(text *text_re, bool *** 1216,1221 **** --- 1227,1233 ---- default: /* re failed??? */ + CHECK_FOR_INTERRUPTS(); pg_regerror(re_result, re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 8ac402b..cb07a06 100644 *** a/src/backend/utils/adt/varlena.c --- b/src/backend/utils/adt/varlena.c *************** replace_text_regexp(text *src_text, void *** 3035,3040 **** --- 3035,3041 ---- { char errMsg[100]; + CHECK_FOR_INTERRUPTS(); pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), diff --git a/src/include/regex/regerrs.h b/src/include/regex/regerrs.h index f02711e..809b511 100644 *** a/src/include/regex/regerrs.h --- b/src/include/regex/regerrs.h *************** *** 81,83 **** --- 81,87 ---- { REG_ECOLORS, "REG_ECOLORS", "too many colors" }, + + { + REG_CANCEL, "REG_CANCEL", "operation cancelled" + }, diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index 3e87dff..2c7fa4d 100644 *** a/src/include/regex/regex.h --- b/src/include/regex/regex.h *************** typedef struct *** 154,159 **** --- 154,160 ---- #define REG_BADOPT 18 /* invalid embedded option */ #define REG_ETOOBIG 19 /* nfa has too many states */ #define REG_ECOLORS 20 /* too many colors */ + #define REG_CANCEL 21 /* operation cancelled */ /* two specials for debugging and testing */ #define REG_ATOI 101 /* convert error-code name to number */ #define REG_ITOA 102 /* convert error-code number to name */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 3a39755..5361411 100644 *** a/src/include/regex/regguts.h --- b/src/include/regex/regguts.h *************** struct subre *** 446,453 **** --- 446,456 ---- struct fns { void FUNCPTR(free, (regex_t *)); + int FUNCPTR(cancel_requested, (void)); }; + #define CANCEL_REQUESTED(re) \ + ((*((struct fns *) (re)->re_fns)->cancel_requested) ()) /*
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers