I've written a variant of regexp_matches called regexp_matches_positions
which, instead of returning the matching substrings, returns their
positions. I found this useful when processing OCR-scanned text, where I
wanted to prioritize matches based on their position.

The patch is for discussion. I'd also appreciate general suggestions, as
this is my first experience with the PostgreSQL code base.

The patch is against the master branch and includes a simple regression
test.
*** /tmp/DQoMjJ_regexp.c        2014-01-28 19:59:37.470271459 +0100
--- src/backend/utils/adt/regexp.c      2014-01-28 19:44:47.298288383 +0100
***************
*** 113,118 ****
--- 113,119 ----
                                         bool ignore_degenerate);
  static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
  static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+ static ArrayType *build_regexp_matches_positions_result(regexp_matches_ctx 
*matchctx);
  static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  
  
***************
*** 833,838 ****
--- 834,898 ----
        return regexp_matches(fcinfo);
  }
  
+ 
+ /*
+  * regexp_matches_positions()
+  *            Return a set of arrays of match start positions for a pattern.
+  */
+ Datum
+ regexp_matches_positions(PG_FUNCTION_ARGS)
+ {
+       FuncCallContext *funcctx;
+       regexp_matches_ctx *matchctx;
+ 
+       if (SRF_IS_FIRSTCALL())
+       {
+               text       *pattern = PG_GETARG_TEXT_PP(1);
+               text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+               MemoryContext oldcontext;
+ 
+               funcctx = SRF_FIRSTCALL_INIT();
+               oldcontext = 
MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+ 
+               /* copy the input string (arg 0) while switched into the multi-call ctx */
+               matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), 
pattern,
+                                                                               
flags,
+                                                                               
PG_GET_COLLATION(),
+                                                                               
false, true, false);
+ 
+               /* Pre-allocate elems/nulls (npatterns slots each) for build_regexp_matches_positions_result */
+               matchctx->elems = (Datum *) palloc(sizeof(Datum) * 
matchctx->npatterns);
+               matchctx->nulls = (bool *) palloc(sizeof(bool) * 
matchctx->npatterns);
+ 
+               MemoryContextSwitchTo(oldcontext);
+               funcctx->user_fctx = (void *) matchctx;
+       }
+ 
+       funcctx = SRF_PERCALL_SETUP();
+       matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
+ 
+       if (matchctx->next_match < matchctx->nmatches)
+       {
+               ArrayType  *result_ary;
+ 
+               result_ary = build_regexp_matches_positions_result(matchctx);
+               matchctx->next_match++;
+               SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+       }
+ 
+       /* all matches returned: release space in multi-call ctx to avoid intraquery memory leak */
+       cleanup_regexp_matches(matchctx);
+ 
+       SRF_RETURN_DONE(funcctx);
+ }
+ 
+ /* Two-argument (no flags) entry point; separate to keep the opr_sanity regression test from complaining */
+ Datum
+ regexp_matches_positions_no_flags(PG_FUNCTION_ARGS)
+ {
+       return regexp_matches_positions(fcinfo);
+ }
+ 
  /*
   * setup_regexp_matches --- do the initial matching for regexp_matches()
   *            or regexp_split()
***************
*** 1035,1040 ****
--- 1095,1140 ----
  }
  
  /*
+  * build_regexp_matches_positions_result - build int4 array of 1-based match start positions for current match
+  */
+ static ArrayType *
+ build_regexp_matches_positions_result(regexp_matches_ctx *matchctx)
+ {
+       Datum      *elems = matchctx->elems;
+       bool       *nulls = matchctx->nulls;
+       int                     dims[1];
+       int                     lbs[1];
+       int                     loc;
+       int                     i;
+ 
+       /* Collect the start position of each subpattern; match_locs holds (start, end) pairs */
+       loc = matchctx->next_match * matchctx->npatterns * 2;
+       for (i = 0; i < matchctx->npatterns; i++)
+       {
+               int                     so = matchctx->match_locs[loc++];
+               int                     eo = matchctx->match_locs[loc++];
+ 
+               if (so < 0 || eo < 0)
+               {
+                       elems[i] = (Datum) 0;
+                       nulls[i] = true;        /* negative offset: emit a NULL element */
+               }
+               else
+               {
+                       elems[i] = Int32GetDatum(so + 1);       /* 0-based offset -> 1-based position */
+                       nulls[i] = false;
+               }
+       }
+ 
+       /* And form a one-dimensional array with lower bound 1 */
+       dims[0] = matchctx->npatterns;
+       lbs[0] = 1;
+       /* XXX: this hardcodes int4 properties: typlen 4, pass-by-value, 'i' alignment */
+       return construct_md_array(elems, nulls, 1, dims, lbs,
+                                                         INT4OID, 4, true, 'i');
+ }
+ 
+ /*
   * regexp_split_to_table()
   *            Split the string at matches of the pattern, returning the
   *            split-out substrings as a table.
*** /tmp/xPfd4G_pg_proc.h       2014-01-28 19:59:37.478271459 +0100
--- src/include/catalog/pg_proc.h       2014-01-28 19:44:47.298288383 +0100
***************
*** 1899,1904 ****
--- 1899,1908 ----
  DESCR("find all match groups for regexp");
  DATA(insert OID =  2764 ( regexp_matches   PGNSP PGUID 12 1 10 0 0 f f f f t 
t i 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches _null_ 
_null_ _null_ ));
  DESCR("find all match groups for regexp");
+ DATA(insert OID =  7769 ( regexp_matches_positions   PGNSP PGUID 12 1 1 0 0 f f f f t t i 2 0 1007 "25 25" _null_ _null_ _null_ _null_ regexp_matches_positions_no_flags _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
+ DATA(insert OID =  7770 ( regexp_matches_positions   PGNSP PGUID 12 1 10 0 0 f f f f t t i 3 0 1007 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches_positions _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
  DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 
0 25 "25 25 23" _null_ _null_ _null_ _null_     split_text _null_ _null_ _null_ 
));
  DESCR("split string by field_sep and return field_num");
  DATA(insert OID =  2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 0 0 f f 
f f t t i 2 0 25 "25 25" _null_ _null_ _null_ _null_    
regexp_split_to_table_no_flags _null_ _null_ _null_ ));
*** /tmp/lXOnOH_builtins.h      2014-01-28 19:59:37.490271458 +0100
--- src/include/utils/builtins.h        2014-01-28 19:44:47.302288383 +0100
***************
*** 587,592 ****
--- 587,594 ----
  extern Datum similar_escape(PG_FUNCTION_ARGS);
  extern Datum regexp_matches(PG_FUNCTION_ARGS);
  extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_positions(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_positions_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
*** /tmp/LiDRpL_strings.out     2014-01-28 19:59:37.498271458 +0100
--- src/test/regress/expected/strings.out       2014-01-28 19:44:47.302288383 
+0100
***************
*** 505,510 ****
--- 505,517 ----
  ERROR:  invalid regular expression: parentheses () not balanced
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  ERROR:  invalid regular expression: invalid repetition count(s)
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+  regexp_matches_positions 
+ --------------------------
+  {4,7}
+ (1 row)
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps 
over the lazy dog', $re$\s+$re$) AS foo;
    foo  | length 
*** /tmp/h7moDJ_strings.sql     2014-01-28 19:59:37.506271458 +0100
--- src/test/regress/sql/strings.sql    2014-01-28 19:44:47.302288383 +0100
***************
*** 170,175 ****
--- 170,178 ----
  SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps 
over the lazy dog', $re$\s+$re$) AS foo;
  SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 
$re$\s+$re$);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to