proposal - function string_to_table

Pavel Stehule Fri, 17 Apr 2020 10:48:13 -0700

Hi

I propose new function string_to_table. This function is significantly
faster (and simpler) variant of regexp_split_to_array function. There was
same process years ago when we implemented string_agg as faster variant of
array_to_string(array_agg()). string_to_table is faster variant (and little
bit more intuitive alternative of unnest(string_to_array()).


string_to_table is about 15% faster than unnest(string_to_array()) and
about 40% faster than regexp_split_to_array.

Initial patch is attached

Notes, comments?

Regards

Pavel

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 7a270eb0ab..4046646267 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -14160,6 +14160,9 @@ SELECT NULLIF(value, '(none)') ...
   <indexterm>
     <primary>string_to_array</primary>
   </indexterm>
+  <indexterm>
+    <primary>string_to_table</primary>
+  </indexterm>
   <indexterm>
     <primary>unnest</primary>
   </indexterm>
@@ -14362,6 +14365,22 @@ SELECT NULLIF(value, '(none)') ...
         <entry><literal>string_to_array('xx~^~yy~^~zz', '~^~', 'yy')</literal></entry>
         <entry><literal>{xx,NULL,zz}</literal></entry>
        </row>
+
+
+       <row>
+        <entry>
+         <literal>
+          <function>string_to_table</function>(<type>text</type>, <type>text</type> <optional>, <type>text</type></optional>)
+         </literal>
+        </entry>
+        <entry><type>setof text</type></entry>
+        <entry>splits string into table using supplied delimiter and
+         optional null string</entry>
+        <entry><literal>string_to_table('xx~^~yy~^~zz', '~^~', 'yy')</literal></entry>
+        <entry><literallayout class="monospaced">xx
+yy
+zz</literallayout>(3 rows)</entry>
+       </row>
        <row>
         <entry>
          <literal>
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..5d0826e668 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -26,6 +26,7 @@
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "nodes/execnodes.h"
 #include "parser/scansup.h"
 #include "port/pg_bswap.h"
 #include "regex/regex.h"
@@ -35,6 +36,7 @@
 #include "utils/memutils.h"
 #include "utils/pg_locale.h"
 #include "utils/sortsupport.h"
+#include "utils/tuplestore.h"
 #include "utils/varlena.h"
 
 
@@ -92,6 +94,16 @@ typedef struct
 	pg_locale_t locale;
 } VarStringSortSupport;
 
+/*
+ * Holds target metadata used for split string to array or to table.
+ */
+typedef struct
+{
+	ArrayBuildState	*astate;
+	Tuplestorestate *tupstore;
+	TupleDesc	tupdesc;
+} SplitStringTargetData;
+
 /*
  * This should be large enough that most strings will fit, but small enough
  * that we feel comfortable putting it on the stack
@@ -139,7 +151,7 @@ static bytea *bytea_substring(Datum str,
 							  bool length_not_specified);
 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
 static void appendStringInfoText(StringInfo str, const text *t);
-static Datum text_to_array_internal(PG_FUNCTION_ARGS);
+static bool text_to_array_internal(FunctionCallInfo fcinfo, SplitStringTargetData *tstate);
 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
 									const char *fldsep, const char *null_string);
 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
@@ -4679,7 +4691,19 @@ text_isequal(text *txt1, text *txt2, Oid collid)
 Datum
 text_to_array(PG_FUNCTION_ARGS)
 {
-	return text_to_array_internal(fcinfo);
+	SplitStringTargetData tstate;
+
+	/* reset tstate */
+	memset(&tstate, 0, sizeof(tstate));
+
+	if (!text_to_array_internal(fcinfo, &tstate))
+		PG_RETURN_NULL();
+
+	if (!tstate.astate)
+		PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
+
+	PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
+										  CurrentMemoryContext));
 }
 
 /*
@@ -4693,16 +4717,98 @@ text_to_array(PG_FUNCTION_ARGS)
 Datum
 text_to_array_null(PG_FUNCTION_ARGS)
 {
-	return text_to_array_internal(fcinfo);
+	return text_to_array(fcinfo);
+}
+
+/*
+ * text_to_table
+ * Parse input string and returns substrings as a table.
+ */
+Datum
+text_to_table(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo	   *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
+	SplitStringTargetData tstate;
+	MemoryContext		old_cxt;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+
+	if (!(rsi->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not "
+						"allowed in this context")));
+
+	old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
+
+	tstate.astate = NULL;
+	tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
+	tstate.tupstore = tuplestore_begin_heap(rsi->allowedModes & SFRM_Materialize_Random,
+											false, work_mem);
+
+	MemoryContextSwitchTo(old_cxt);
+
+	(void) text_to_array_internal(fcinfo, &tstate);
+
+	tuplestore_donestoring(tstate.tupstore);
+
+	rsi->returnMode = SFRM_Materialize;
+	rsi->setResult = tstate.tupstore;
+	rsi->setDesc = tstate.tupdesc;
+
+	return (Datum) 0;
+}
+
+Datum
+text_to_table_null(PG_FUNCTION_ARGS)
+{
+	return text_to_table(fcinfo);
+}
+
+/*
+ * Add text to result set (table or array). Build a table when set is a expected or build
+ * a array
+ */
+static void
+accum_result(SplitStringTargetData *tstate,
+			 text *result_text,
+			 bool is_null)
+{
+	if (tstate->tupdesc)
+	{
+		HeapTuple	tuple;
+		Datum		values[1];
+		bool		nulls[1];
+
+		values[0] = PointerGetDatum(result_text);
+		nulls[0] = is_null;
+
+		tuple = heap_form_tuple(tstate->tupdesc, values, nulls);
+		tuplestore_puttuple(tstate->tupstore, tuple);
+	}
+	else
+	{
+		tstate->astate = accumArrayResult(tstate->astate,
+										 PointerGetDatum(result_text),
+										 is_null,
+										 TEXTOID,
+										 CurrentMemoryContext);
+	}
 }
 
 /*
  * common code for text_to_array and text_to_array_null functions
  *
  * These are not strict so we have to test for null inputs explicitly.
+ * Returns false, when result is null, else returns true.
+ *
  */
-static Datum
-text_to_array_internal(PG_FUNCTION_ARGS)
+static bool
+text_to_array_internal(FunctionCallInfo fcinfo, SplitStringTargetData *tstate)
 {
 	text	   *inputstring;
 	text	   *fldsep;
@@ -4712,11 +4818,10 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 	char	   *start_ptr;
 	text	   *result_text;
 	bool		is_null;
-	ArrayBuildState *astate = NULL;
 
 	/* when input string is NULL, then result is NULL too */
 	if (PG_ARGISNULL(0))
-		PG_RETURN_NULL();
+		return false;
 
 	inputstring = PG_GETARG_TEXT_PP(0);
 
@@ -4745,7 +4850,7 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 
 		/* return empty array for empty input string */
 		if (inputstring_len < 1)
-			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
+			return true;
 
 		/*
 		 * empty field separator: return the input string as a one-element
@@ -4753,22 +4858,11 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 		 */
 		if (fldsep_len < 1)
 		{
-			Datum		elems[1];
-			bool		nulls[1];
-			int			dims[1];
-			int			lbs[1];
-
 			/* single element can be a NULL too */
 			is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
 
-			elems[0] = PointerGetDatum(inputstring);
-			nulls[0] = is_null;
-			dims[0] = 1;
-			lbs[0] = 1;
-			/* XXX: this hardcodes assumptions about the text type */
-			PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
-													 1, dims, lbs,
-													 TEXTOID, -1, false, TYPALIGN_INT));
+			accum_result(tstate, inputstring, is_null);
+			return true;
 		}
 
 		text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
@@ -4802,12 +4896,7 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
 
 			/* stash away this field */
-			astate = accumArrayResult(astate,
-									  PointerGetDatum(result_text),
-									  is_null,
-									  TEXTOID,
-									  CurrentMemoryContext);
-
+			accum_result(tstate, result_text, is_null);
 			pfree(result_text);
 
 			if (!found)
@@ -4844,12 +4933,7 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
 
 			/* stash away this field */
-			astate = accumArrayResult(astate,
-									  PointerGetDatum(result_text),
-									  is_null,
-									  TEXTOID,
-									  CurrentMemoryContext);
-
+			accum_result(tstate, result_text, is_null);
 			pfree(result_text);
 
 			start_ptr += chunk_len;
@@ -4857,8 +4941,7 @@ text_to_array_internal(PG_FUNCTION_ARGS)
 		}
 	}
 
-	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
-										  CurrentMemoryContext));
+	return true;
 }
 
 /*
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 4bce3ad8de..2fc97f830f 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3561,6 +3561,14 @@
 { oid => '2768', descr => 'split string by pattern',
   proname => 'regexp_split_to_array', prorettype => '_text',
   proargtypes => 'text text text', prosrc => 'regexp_split_to_array' },
+{ oid => '2228', descr => 'split delimited text',
+  proname => 'string_to_table', prorows => '1000', proretset => 't',
+  prorettype => 'text', proargtypes => 'text text',
+  prosrc => 'text_to_table' },
+{ oid => '2282', descr => 'split delimited text with null string',
+  proname => 'string_to_table', prorows => '1000', proretset => 't',
+  prorettype => 'text', proargtypes => 'text text text',
+  prosrc => 'text_to_table_null' },
 { oid => '2089', descr => 'convert int4 number to hex',
   proname => 'to_hex', prorettype => 'text', proargtypes => 'int4',
   prosrc => 'to_hex32' },
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index c730563f03..f024b153cc 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -1755,6 +1755,109 @@ select string_to_array('1,2,3,4,*,6', ',', '*');
  {1,2,3,4,NULL,6}
 (1 row)
 
+select string_to_table('1|2|3', '|');
+ string_to_table 
+-----------------
+ 1
+ 2
+ 3
+(3 rows)
+
+select string_to_table('1|2|3|', '|');
+ string_to_table 
+-----------------
+ 1
+ 2
+ 3
+ 
+(4 rows)
+
+select string_to_table('1||2|3||', '||');
+ string_to_table 
+-----------------
+ 1
+ 2|3
+ 
+(3 rows)
+
+select string_to_table('1|2|3', '');
+ string_to_table 
+-----------------
+ 1|2|3
+(1 row)
+
+select string_to_table('', '|');
+ string_to_table 
+-----------------
+(0 rows)
+
+select string_to_table('1|2|3', NULL);
+ string_to_table 
+-----------------
+(0 rows)
+
+select string_to_table(NULL, '|') IS NULL;
+ ?column? 
+----------
+(0 rows)
+
+select string_to_table('abc', '');
+ string_to_table 
+-----------------
+ abc
+(1 row)
+
+select string_to_table('abc', '', 'abc');
+ string_to_table 
+-----------------
+ 
+(1 row)
+
+select string_to_table('abc', ',');
+ string_to_table 
+-----------------
+ abc
+(1 row)
+
+select string_to_table('abc', ',', 'abc');
+ string_to_table 
+-----------------
+ 
+(1 row)
+
+select string_to_table('1,2,3,4,,6', ',');
+ string_to_table 
+-----------------
+ 1
+ 2
+ 3
+ 4
+ 
+ 6
+(6 rows)
+
+select string_to_table('1,2,3,4,,6', ',', '');
+ string_to_table 
+-----------------
+ 1
+ 2
+ 3
+ 4
+ 
+ 6
+(6 rows)
+
+select string_to_table('1,2,3,4,*,6', ',', '*');
+ string_to_table 
+-----------------
+ 1
+ 2
+ 3
+ 4
+ 
+ 6
+(6 rows)
+
 select array_to_string(NULL::int4[], ',') IS NULL;
  ?column? 
 ----------
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index 25dd4e2c6d..d57352cf47 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -544,6 +544,21 @@ select string_to_array('1,2,3,4,,6', ',');
 select string_to_array('1,2,3,4,,6', ',', '');
 select string_to_array('1,2,3,4,*,6', ',', '*');
 
+select string_to_table('1|2|3', '|');
+select string_to_table('1|2|3|', '|');
+select string_to_table('1||2|3||', '||');
+select string_to_table('1|2|3', '');
+select string_to_table('', '|');
+select string_to_table('1|2|3', NULL);
+select string_to_table(NULL, '|') IS NULL;
+select string_to_table('abc', '');
+select string_to_table('abc', '', 'abc');
+select string_to_table('abc', ',');
+select string_to_table('abc', ',', 'abc');
+select string_to_table('1,2,3,4,,6', ',');
+select string_to_table('1,2,3,4,,6', ',', '');
+select string_to_table('1,2,3,4,*,6', ',', '*');
+
 select array_to_string(NULL::int4[], ',') IS NULL;
 select array_to_string('{}'::int4[], ',');
 select array_to_string(array[1,2,3,4,NULL,6], ',');

proposal - function string_to_table

Reply via email to