I've written a variant of regexp_matches called regexp_matches_positions which, instead of returning the matching substrings, returns the positions of the matches. I found this useful when processing OCR-scanned text, where I wanted to prioritize matches based on their position.
The patch is for discussion. I'd also appreciate general suggestions, as this is my first experience with the PostgreSQL code base. The patch is against the master branch and includes a simple regression test.
*** /tmp/DQoMjJ_regexp.c 2014-01-28 19:59:37.470271459 +0100
--- src/backend/utils/adt/regexp.c 2014-01-28 19:44:47.298288383 +0100
***************
*** 113,118 ****
--- 113,119 ----
  					 bool ignore_degenerate);
  static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
  static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+ static ArrayType *build_regexp_matches_positions_result(regexp_matches_ctx *matchctx);
  static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  
  
***************
*** 833,838 ****
--- 834,898 ----
  	return regexp_matches(fcinfo);
  }
  
+ 
+ /*
+  * regexp_matches_positions()
+  *		Return a table of matched locations of a pattern within a string.
+  */
+ Datum
+ regexp_matches_positions(PG_FUNCTION_ARGS)
+ {
+ 	FuncCallContext *funcctx;
+ 	regexp_matches_ctx *matchctx;
+ 
+ 	if (SRF_IS_FIRSTCALL())
+ 	{
+ 		text	   *pattern = PG_GETARG_TEXT_PP(1);
+ 		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+ 		MemoryContext oldcontext;
+ 
+ 		funcctx = SRF_FIRSTCALL_INIT();
+ 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+ 
+ 		/* be sure to copy the input string into the multi-call ctx */
+ 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+ 										flags,
+ 										PG_GET_COLLATION(),
+ 										false, true, false);
+ 
+ 		/* Pre-create workspace that build_regexp_matches_positions_result needs */
+ 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+ 		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
+ 
+ 		MemoryContextSwitchTo(oldcontext);
+ 		funcctx->user_fctx = (void *) matchctx;
+ 	}
+ 
+ 	funcctx = SRF_PERCALL_SETUP();
+ 	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
+ 
+ 	if (matchctx->next_match < matchctx->nmatches)
+ 	{
+ 		ArrayType  *result_ary;
+ 
+ 		result_ary = build_regexp_matches_positions_result(matchctx);
+ 		matchctx->next_match++;
+ 		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+ 	}
+ 
+ 	/* release space in multi-call ctx to avoid intraquery memory leak */
+ 	cleanup_regexp_matches(matchctx);
+ 
+ 	SRF_RETURN_DONE(funcctx);
+ }
+ 
+ /* This is separate to keep the opr_sanity regression test from complaining */
+ Datum
+ regexp_matches_positions_no_flags(PG_FUNCTION_ARGS)
+ {
+ 	return regexp_matches_positions(fcinfo);
+ }
+ 
  /*
   * setup_regexp_matches --- do the initial matching for regexp_matches()
   *		or regexp_split()
***************
*** 1035,1040 ****
--- 1095,1140 ----
  }
  
  /*
+  * build_regexp_matches_positions_result - build output array for current match
+  */
+ static ArrayType *
+ build_regexp_matches_positions_result(regexp_matches_ctx *matchctx)
+ {
+ 	Datum	   *elems = matchctx->elems;
+ 	bool	   *nulls = matchctx->nulls;
+ 	int			dims[1];
+ 	int			lbs[1];
+ 	int			loc;
+ 	int			i;
+ 
+ 	/* Emit each start offset plus one (1-based); a negative offset yields NULL */
+ 	loc = matchctx->next_match * matchctx->npatterns * 2;
+ 	for (i = 0; i < matchctx->npatterns; i++)
+ 	{
+ 		int			so = matchctx->match_locs[loc++];
+ 		int			eo = matchctx->match_locs[loc++];
+ 
+ 		if (so < 0 || eo < 0)
+ 		{
+ 			elems[i] = (Datum) 0;
+ 			nulls[i] = true;
+ 		}
+ 		else
+ 		{
+ 			elems[i] = Int32GetDatum(so + 1);
+ 			nulls[i] = false;
+ 		}
+ 	}
+ 
+ 	/* And form an array */
+ 	dims[0] = matchctx->npatterns;
+ 	lbs[0] = 1;
+ 	/* XXX: this hardcodes assumptions about the int4 type */
+ 	return construct_md_array(elems, nulls, 1, dims, lbs,
+ 							  INT4OID, 4, true, 'i');
+ }
+ 
+ /*
   * regexp_split_to_table()
   *		Split the string at matches of the pattern, returning the
   *		split-out substrings as a table.
*** /tmp/xPfd4G_pg_proc.h 2014-01-28 19:59:37.478271459 +0100
--- src/include/catalog/pg_proc.h 2014-01-28 19:44:47.298288383 +0100
***************
*** 1899,1904 ****
--- 1899,1908 ----
  DESCR("find all match groups for regexp");
  DATA(insert OID = 2764 ( regexp_matches PGNSP PGUID 12 1 10 0 0 f f f f t t i 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches _null_ _null_ _null_ ));
  DESCR("find all match groups for regexp");
+ DATA(insert OID = 7769 ( regexp_matches_positions PGNSP PGUID 12 1 1 0 0 f f f f t t i 2 0 1007 "25 25" _null_ _null_ _null_ _null_ regexp_matches_positions_no_flags _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
+ DATA(insert OID = 7770 ( regexp_matches_positions PGNSP PGUID 12 1 10 0 0 f f f f t t i 3 0 1007 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches_positions _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
  DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 25 "25 25 23" _null_ _null_ _null_ _null_ split_text _null_ _null_ _null_ ));
  DESCR("split string by field_sep and return field_num");
  DATA(insert OID = 2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 0 0 f f f f t t i 2 0 25 "25 25" _null_ _null_ _null_ _null_ regexp_split_to_table_no_flags _null_ _null_ _null_ ));
*** /tmp/lXOnOH_builtins.h 2014-01-28 19:59:37.490271458 +0100
--- src/include/utils/builtins.h 2014-01-28 19:44:47.302288383 +0100
***************
*** 587,592 ****
--- 587,594 ----
  extern Datum similar_escape(PG_FUNCTION_ARGS);
  extern Datum regexp_matches(PG_FUNCTION_ARGS);
  extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_positions(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_positions_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
*** /tmp/LiDRpL_strings.out 2014-01-28 19:59:37.498271458 +0100
--- src/test/regress/expected/strings.out 2014-01-28 19:44:47.302288383 +0100
***************
*** 505,510 ****
--- 505,517 ----
  ERROR:  invalid regular expression: parentheses () not balanced
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  ERROR:  invalid regular expression: invalid repetition count(s)
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+  regexp_matches_positions 
+ --------------------------
+  {4,7}
+ (1 row)
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo;
    foo  | length 
*** /tmp/h7moDJ_strings.sql 2014-01-28 19:59:37.506271458 +0100
--- src/test/regress/sql/strings.sql 2014-01-28 19:44:47.302288383 +0100
***************
*** 170,175 ****
--- 170,178 ----
  SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo;
  SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s+$re$);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers