incubator-hawq git commit: HAWQ-445. Support large strings (up to a GB) in text_to_array()

iweng Wed, 09 Mar 2016 17:24:06 -0800

Repository: incubator-hawq
Updated Branches:
  refs/heads/master 87d13b673 -> e29e13345



HAWQ-445. Support large strings (up to a GB) in text_to_array()


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e29e1334
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e29e1334
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e29e1334

Branch: refs/heads/master
Commit: e29e13345f14b70d4193eb16e4b904737871c486
Parents: 87d13b6
Author: ivan <iw...@pivotal.io>
Authored: Thu Mar 10 09:22:49 2016 +0800
Committer: ivan <iw...@pivotal.io>
Committed: Thu Mar 10 09:22:49 2016 +0800

----------------------------------------------------------------------
 src/backend/utils/adt/test/Makefile       |  42 ++++
 src/backend/utils/adt/test/varlena_test.c | 251 +++++++++++++++++++
 src/backend/utils/adt/varlena.c           | 334 +++++++++++++++++++------
 3 files changed, 550 insertions(+), 77 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/Makefile
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/test/Makefile 
b/src/backend/utils/adt/test/Makefile
new file mode 100644
index 0000000..a0cd950
--- /dev/null
+++ b/src/backend/utils/adt/test/Makefile
@@ -0,0 +1,42 @@
+top_builddir=../../../../..
+subdir=src/backend/utils/adt
+
+TARGETS=varlena
+
+# Objects from backend, which don't need to be mocked but need to be linked.
+common_REAL_OBJS=\
+    $(top_srcdir)/src/backend/access/hash/hashfunc.o \
+    $(top_srcdir)/src/backend/bootstrap/bootparse.o \
+    $(top_srcdir)/src/backend/lib/stringinfo.o \
+    $(top_srcdir)/src/backend/nodes/bitmapset.o \
+    $(top_srcdir)/src/backend/nodes/equalfuncs.o \
+    $(top_srcdir)/src/backend/nodes/list.o \
+    $(top_srcdir)/src/backend/parser/gram.o \
+    $(top_srcdir)/src/backend/regex/regcomp.o \
+    $(top_srcdir)/src/backend/regex/regerror.o \
+    $(top_srcdir)/src/backend/regex/regexec.o \
+    $(top_srcdir)/src/backend/regex/regfree.o \
+    $(top_srcdir)/src/backend/storage/page/itemptr.o \
+    $(top_srcdir)/src/backend/utils/adt/datum.o \
+    $(top_srcdir)/src/backend/utils/adt/like.o \
+    $(top_srcdir)/src/backend/utils/hash/dynahash.o \
+    $(top_srcdir)/src/backend/utils/hash/hashfn.o \
+    $(top_srcdir)/src/backend/utils/misc/guc.o \
+    $(top_srcdir)/src/backend/utils/init/globals.o \
+    $(top_srcdir)/src/backend/utils/mmgr/mcxt.o \
+    $(top_srcdir)/src/backend/utils/mmgr/aset.o \
+    $(top_srcdir)/src/backend/utils/mmgr/memprot.o \
+    $(top_srcdir)/src/port/exec.o \
+    $(top_srcdir)/src/port/path.o \
+    $(top_srcdir)/src/port/pgsleep.o \
+    $(top_srcdir)/src/port/pgstrcasecmp.o \
+    $(top_srcdir)/src/port/qsort.o \
+    $(top_srcdir)/src/port/strlcpy.o \
+    $(top_srcdir)/src/port/thread.o \
+    $(top_srcdir)/src/timezone/localtime.o \
+    $(top_srcdir)/src/timezone/pgtz.o    
+
+varlena_REAL_OBJS=$(common_REAL_OBJS)
+
+include ../../../../Makefile.mock
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/varlena_test.c
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/test/varlena_test.c 
b/src/backend/utils/adt/test/varlena_test.c
new file mode 100644
index 0000000..46035fa
--- /dev/null
+++ b/src/backend/utils/adt/test/varlena_test.c
@@ -0,0 +1,251 @@
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include "cmockery.h"
+
+#include "c.h"
+#include "postgres.h"
+#include "nodes/nodes.h"
+#include "../varlena.c"
+
+#define MEMORY_LIMIT 8 /* 8 bytes memory limit */
+
+#ifdef USE_ASSERT_CHECKING
+void
+_ExceptionalCondition( ) /* side effect attached to the mocked ExceptionalCondition by the tests below */
+{
+     PG_RE_THROW(); /* presumably unwinds to the calling test's PG_CATCH block -- confirm against elog.h */
+}
+#endif /* USE_ASSERT_CHECKING */
+
+/*
+ * Checks that a string small enough to fit within the memory limit fails the size assertion.
+ */
+void
+test__find_memory_limited_substring__small_string(void **state)
+{
+       int subStringByteLength = -1;
+       int subStringCharLength = -1;
+       int totalByteLength = MEMORY_LIMIT;
+       char *strStart = (char *) 0xabcdefab; /* fake address; explicit cast avoids an int-to-pointer conversion error */
+
+#ifdef USE_ASSERT_CHECKING
+       expect_any(ExceptionalCondition,conditionName);
+       expect_any(ExceptionalCondition,errorType);
+       expect_any(ExceptionalCondition,fileName);
+       expect_any(ExceptionalCondition,lineNumber);
+       will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+       /* byteLen == memoryLimit violates the "byteLen > memoryLimit" argument assertion */
+       PG_TRY();
+       {
+               find_memory_limited_substring(strStart, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+               assert_true(false); /* only reached if the assertion did not fire */
+       }
+       PG_CATCH();
+       {
+       }
+       PG_END_TRY();
+#endif
+}
+
+/*
+ * Checks that a NULL input string fails the argument assertion (assert-enabled builds only).
+ */
+void
+test__find_memory_limited_substring__null_string(void **state)
+{
+       int subStringByteLength = -1;
+       int subStringCharLength = -1;
+       int totalByteLength = MEMORY_LIMIT + 1; /* larger than the limit, so the size assertion itself passes */
+       char *strStart = NULL;
+
+#ifdef USE_ASSERT_CHECKING
+       expect_any(ExceptionalCondition,conditionName);
+       expect_any(ExceptionalCondition,errorType);
+       expect_any(ExceptionalCondition,fileName);
+       expect_any(ExceptionalCondition,lineNumber);
+       
will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+       /* A NULL string must trip the "NULL != strStart" argument assertion */
+       PG_TRY();
+       {
+               find_memory_limited_substring(strStart, totalByteLength, 
MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+               assert_true(false);
+       }
+       PG_CATCH();
+       {
+       }
+       PG_END_TRY();
+#endif
+}
+
+/*
+ * Checks that the returned string segments stay within the memory limit for single-byte (ascii) characters.
+ */
+void
+test__find_memory_limited_substring__ascii_chars_within_memory_limit(void **state)
+{
+       int subStringByteLength = -1;
+       int subStringCharLength = -1;
+       int cumulativeLengthConsidered = 0;
+
+       char *strStart = (char *) 0xabcdefab; /* fake address; explicit cast avoids an int-to-pointer conversion error */
+
+       int totalByteLength = 25;
+
+       while (cumulativeLengthConsidered < totalByteLength - MEMORY_LIMIT)
+       {
+               will_return(pg_database_encoding_max_length, 1);
+               find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+               cumulativeLengthConsidered += subStringByteLength;
+               assert_true(subStringByteLength == MEMORY_LIMIT);
+               assert_true(subStringByteLength == subStringCharLength);
+       }
+
+#ifdef USE_ASSERT_CHECKING
+       expect_any(ExceptionalCondition,conditionName);
+       expect_any(ExceptionalCondition,errorType);
+       expect_any(ExceptionalCondition,fileName);
+       expect_any(ExceptionalCondition,lineNumber);
+       will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+       /* The left-over string now fits in memory, so the size assertion must fail */
+       PG_TRY();
+       {
+               find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+               assert_true(false);
+       }
+       PG_CATCH();
+       {
+       }
+       PG_END_TRY();
+
+       expect_any(ExceptionalCondition,conditionName);
+       expect_any(ExceptionalCondition,errorType);
+       expect_any(ExceptionalCondition,fileName);
+       expect_any(ExceptionalCondition,lineNumber);
+       will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+       /* A NULL string must cause an assertion failure as well */
+       PG_TRY();
+       {
+               find_memory_limited_substring(NULL, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+       }
+       PG_CATCH();
+       {
+               return;
+       }
+       PG_END_TRY();
+       assert_true(false);
+#endif
+}
+
+
+/*
+ * Checks that the returned string segments stay within the memory limit for multi-byte chars.
+ */
+void
+test__find_memory_limited_substring__mb_chars_within_memory_limit(void **state)
+{
+       int subStringByteLength = -1;
+       int subStringCharLength = -1;
+       int cumulativeLengthConsidered = 0;
+
+       /* Lengths of the multi-byte characters at different positions */
+       int stringByteLengths[] = {3, 3, 3 /* seg1 */, 2, 2, 1, 2 /* seg2 */, 2, 1, 1, 1, 2, /* seg3 */ 5, 4 /* seg4 */, 4};
+
+       /* Total length in terms of number of characters */
+       int stringCharLength = sizeof(stringByteLengths) / sizeof(int);
+
+       /* Total byte lengths of all the characters */
+       int totalByteLength = 0;
+       for (int charIndex = 0; charIndex < stringCharLength; charIndex++)
+       {
+               totalByteLength += stringByteLengths[charIndex];
+       }
+
+       int segmentByteLength = 0; /* Number of bytes in current segment */
+       int segmentCharLength = 0; /* Number of characters in current segment */
+
+       /* Length of the char that spilled over from one partition to another */
+       int carryoverLength = 0;
+
+       /* Fictitious multi-byte string to segment; pg_mblen is mocked, so it is never read */
+       char *strStart = (char *) 0xabcdefab;
+
+       for (int charIndex = 0; charIndex < stringCharLength; charIndex++)
+       {
+               if (carryoverLength > 0)
+               {
+                       expect_any(pg_mblen, mbstr);
+                       will_return(pg_mblen, carryoverLength);
+                       carryoverLength = 0;
+               }
+
+               expect_any(pg_mblen, mbstr);
+               will_return(pg_mblen, stringByteLengths[charIndex]);
+               segmentByteLength += stringByteLengths[charIndex];
+               segmentCharLength++;
+
+               if (segmentByteLength > MEMORY_LIMIT)
+               {
+                       /* The char just queued crosses the limit, so the segment must end right before it */
+                       will_return(pg_database_encoding_max_length, 6);
+                       find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+                       assert_true(subStringByteLength == (segmentByteLength - stringByteLengths[charIndex]));
+                       assert_true(subStringCharLength == (segmentCharLength - 1));
+                       assert_true(subStringByteLength <= MEMORY_LIMIT);
+                       assert_true(subStringCharLength <= MEMORY_LIMIT);
+
+                       cumulativeLengthConsidered += subStringByteLength;
+
+                       segmentByteLength = stringByteLengths[charIndex];
+                       segmentCharLength = 1;
+                       carryoverLength = stringByteLengths[charIndex];
+               }
+       }
+
+       /* Now purge any unused pg_mblen call because of the suffix that does not exceed MEMORY_LIMIT */
+       for (int partitionCharIndex = 0; partitionCharIndex < segmentCharLength; partitionCharIndex++)
+       {
+               pg_mblen("a");
+       }
+
+#ifdef USE_ASSERT_CHECKING
+       expect_any(ExceptionalCondition,conditionName);
+       expect_any(ExceptionalCondition,errorType);
+       expect_any(ExceptionalCondition,fileName);
+       expect_any(ExceptionalCondition,lineNumber);
+       will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL);
+
+       /* The left-over string now fits in memory, so the size assertion must fail */
+       PG_TRY();
+       {
+               find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength);
+       }
+       PG_CATCH();
+       {
+               return;
+       }
+       PG_END_TRY();
+
+       assert_true(false);
+#endif
+}
+
+int 
+main(int argc, char* argv[]) 
+{
+        cmockery_parse_arguments(argc, argv);
+        /* Register every find_memory_limited_substring test case and run them all */
+        const UnitTest tests[] = {
+                       
unit_test(test__find_memory_limited_substring__small_string),
+                       
unit_test(test__find_memory_limited_substring__null_string),
+                       
unit_test(test__find_memory_limited_substring__ascii_chars_within_memory_limit),
+                       
unit_test(test__find_memory_limited_substring__mb_chars_within_memory_limit)
+        };
+        return run_tests(tests);
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/varlena.c
----------------------------------------------------------------------
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 68aa810..21c4afb 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -28,6 +28,7 @@
 #include "utils/lsyscache.h"
 #include "utils/pg_locale.h"
 #include "utils/string_wrapper.h"
+#include "utils/memutils.h"
 
 typedef struct varlena unknown;
 
@@ -55,6 +56,13 @@ typedef struct
 #define PG_STR_GET_TEXT(str_) \
        DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(str_)))
 
+/*
+ * Max considered sub-string size is set to (MaxAllocSize - 4MB).
+ * The 4MB (0x400000 bytes) is set aside for memory allocation overhead
+ * such as allocation set headers.
+ */
+#define MAX_STRING_BYTES       ((Size) (MaxAllocSize - 0x400000))
+
 static int     text_position_ptr_len(char* p1, int len1, char *p2, int len2); 
 static void text_position_setup_ptr_len(char* p1, int len1, char* p2, int 
len2, TextPositionState *state);
 
@@ -617,6 +625,65 @@ charlen_to_bytelen(const char *p, int n)
        }
 }
 
+/* find_memory_limited_substring()
+ *     Computes the length, in characters and in bytes, of the longest prefix
+ *     of the string that consumes at most "memoryLimit" bytes of memory.
+ *
+ *     Parameters:
+ *             strStart: starting pointer in the string
+ *             byteLen: number of bytes in the string, starting from strStart
+ *             memoryLimit: max string size in terms of bytes
+ *
+ *     Out parameters:
+ *             subStringByteLen: length of chosen sub-string in bytes
+ *             subStringCharLen: length of chosen sub-string in character count
+ *
+ * The caller must ensure byteLen > memoryLimit and that byteLen bytes are readable
+ * starting from strStart; the string need not be null-terminated.
+ */
+static void
+find_memory_limited_substring(const char *strStart, int byteLen, int memoryLimit, int *subStringByteLen, int *subStringCharLen)
+{
+       AssertArg(byteLen > memoryLimit);
+       AssertArg(NULL != strStart);
+       AssertArg(NULL != subStringByteLen);
+       AssertArg(NULL != subStringCharLen);
+
+       if (pg_database_encoding_max_length() == 1)
+       {
+               /* Optimization for single-byte encodings: one byte is one character */
+               *subStringByteLen = byteLen < memoryLimit ? byteLen : memoryLimit;
+               *subStringCharLen = *subStringByteLen;
+               return;
+       }
+       else
+       {
+               const char *strCurPointer = strStart;
+               int consumedBytes = 0;
+               int consumedChars = 0;
+
+               /* Walk whole characters; stop at the end of the input so no byte past byteLen is inspected */
+               while (consumedBytes < byteLen)
+               {
+                       int curCharBytes = pg_mblen(strCurPointer);
+                       strCurPointer += curCharBytes;
+                       consumedChars++;
+                       consumedBytes += curCharBytes;
+                       if (consumedBytes > memoryLimit)
+                       {
+                               *subStringByteLen = consumedBytes - curCharBytes; /* drop the char that crossed the limit */
+                               *subStringCharLen = consumedChars - 1;
+                               Insist((*subStringByteLen > 0) && (*subStringCharLen > 0));
+                               return;
+                       }
+               }
+               /* Only reachable when the size precondition was not enforced: the whole input fits */
+               *subStringByteLen = byteLen;
+               *subStringCharLen = consumedChars;
+       }
+}
+
+
 /*
  * text_substr()
  * Return a substring starting at the specified position.
@@ -2559,24 +2626,36 @@ split_text(PG_FUNCTION_ARGS)
        PG_RETURN_TEXT_P(result_text);
 }
 
+
 /*
- * text_to_array
- * parse input string
- * return text array of elements
- * based on provided field separator
+ * text_to_array_impl
+ *             Carries out the actual tokenization and array conversion of an 
input string.
+ *
+ * Parameters:
+ *             string: Where to start in the input string
+ *             stringByteLen: Length of current string
+ *             delimiter: Which delimiter to use
+ *             delimiterByteLen: Length of delimiter in bytes
+ *             delimiterCharLen: Length of delimiter in chars
+ *             arrayState: State of the output array where we accumulate 
results
+ *             endOfString: Do we expect any more chunk of the main input 
string?
+ *
+ * Returns the pointer where the last match was found. Successively the
+ * caller can splice more data starting from this address to find further
+ * array elements.
  */
-Datum
-text_to_array(PG_FUNCTION_ARGS)
+static char* text_to_array_impl(char *string, int stringByteLen, char 
*delimiter,
+               int delimiterByteLen, int delimiterCharLen, ArrayBuildState 
**arrayState, bool endOfString)
 {
-       Datum d0 = PG_GETARG_DATUM(0);
-       char *p0; void *tofree0; int len0;
+       int start_posn = 1;
+       int fldnum = 1;
+       int end_posn = 0;
+       int chunk_len = 0;
+       text       *result_text;
 
-       Datum d1 = PG_GETARG_DATUM(1);
-       char *p1; void *tofree1; int len1;
+       char* cur_ptr = string;
 
-       int                     inputstring_len;
-       int                     fldsep_len; 
-       TextPositionState state =               
+       TextPositionState state =
                {
                0, /* use_wchar */
                NULL, /* str1 */
@@ -2587,79 +2666,32 @@ text_to_array(PG_FUNCTION_ARGS)
                0, /* len2 */
                };
 
-       int                     fldnum;
-       int                     start_posn;
-       int                     end_posn;
-       int                     chunk_len;
-       char       *start_ptr;
-       text       *result_text;
-       ArrayBuildState *astate = NULL;
-
-       varattrib_untoast_ptr_len(d0, &p0, &len0, &tofree0);
-       varattrib_untoast_ptr_len(d1, &p1, &len1, &tofree1);
-
-       if(pg_database_encoding_max_length() == 1)
-       {
-               inputstring_len = len0;
-               fldsep_len = len1;
-       }
-       else
-       {
-               inputstring_len = pg_mbstrlen_with_len(p0, len0);
-               fldsep_len = pg_mbstrlen_with_len(p1, len1);
-       }
-
-       /* return NULL for empty input string */
-       if (inputstring_len < 1)
-       {
-               if(tofree0)
-                       pfree(tofree0);
-               if(tofree1)
-                       pfree(tofree1);
-
-               PG_RETURN_NULL();
-       }
-
-       /*
-        * empty field separator return one element, 1D, array using the input
-        * string
-        */
-       if (fldsep_len < 1)
-       {
-               if(tofree0)
-                       pfree(tofree0);
-               if(tofree1)
-                       pfree(tofree1);
-
-               PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, 
d0, 1));
-       }
-
-       text_position_setup_ptr_len(p0, len0, p1, len1, &state);
-
-       start_posn = 1;
-       /* start_ptr points to the start_posn'th character of inputstring */
-       start_ptr = p0; 
+       text_position_setup_ptr_len(string, stringByteLen, delimiter, 
delimiterByteLen, &state);
 
        for (fldnum = 1;; fldnum++) /* field number is 1 based */
        {
                end_posn = text_position_next(start_posn, &state);
 
-               if (end_posn == 0)
+               if (end_posn == 0 && !endOfString)
+               {
+                       break;
+               }
+               else if (end_posn == 0)
                {
                        /* fetch last field */
-                       chunk_len = (p0 + len0) - start_ptr;
+                       chunk_len = (string + stringByteLen) - cur_ptr;
                }
                else
                {
                        /* fetch non-last field */
-                       chunk_len = charlen_to_bytelen(start_ptr, end_posn - 
start_posn);
+                       chunk_len = charlen_to_bytelen(cur_ptr, end_posn - 
start_posn);
                }
 
                /* must build a temp text datum to pass to accumArrayResult */
-               result_text = cstring_to_text_with_len(start_ptr, chunk_len);
+               result_text = cstring_to_text_with_len(cur_ptr, chunk_len);
 
                /* stash away this field */
-               astate = accumArrayResult(astate,
+               *arrayState = accumArrayResult(*arrayState,
                                                                  
PointerGetDatum(result_text),
                                                                  false,
                                                                  TEXTOID,
@@ -2668,20 +2700,168 @@ text_to_array(PG_FUNCTION_ARGS)
                pfree(result_text);
 
                if (end_posn == 0)
+               {
+                       /* Process next sub-string if any */
                        break;
+               }
 
                start_posn = end_posn;
-               start_ptr += chunk_len;
-               start_posn += fldsep_len;
-               start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
+               cur_ptr += chunk_len;
+               start_posn += delimiterCharLen;
+               cur_ptr += charlen_to_bytelen(cur_ptr, delimiterCharLen);
        }
 
        text_position_cleanup(&state);
 
-       if(tofree0)
-               pfree(tofree0);
-       if(tofree1)
-               pfree(tofree1);
+       return cur_ptr;
+}
+
+
+/*
+ * text_to_array_multi_pass
+ *             Carries out the tokenization and array conversion of the input string in one or more passes;
+ *             for multi-byte encodings each pass covers at most MAX_STRING_BYTES / sizeof(pg_wchar) bytes.
+ *
+ * Parameters:
+ *             string: The start of the input string
+ *             stringByteLen: Length of the whole input string in bytes
+ *             delimiter: Which delimiter to use
+ *             delimiterByteLen: Length of delimiter in bytes
+ *             delimiterCharLen: Length of delimiter in chars
+ *             (end-of-input is computed internally for each pass and handed to text_to_array_impl)
+ *
+ * Returns the ArrayBuildState containing all the array elements.
+ */
+static ArrayBuildState* text_to_array_multi_pass(char *string, int 
stringByteLen, char *delimiter, int delimiterByteLen, int delimiterCharLen)
+{
+       ArrayBuildState *astate = NULL;
+
+       /* Start with full string. If it is too big then we chunk it later */
+       char       *start_ptr = string;
+       int curSubStringByteLen = stringByteLen;
+
+       bool endOfString = false;
+
+       /* More bytes to consider? */
+       while (!endOfString)
+       {
+               /*
+                * Give the rest of the string to the current pass; may be chunked if
+                * the rest still doesn't fit in the memory
+                */
+               curSubStringByteLen = (string + stringByteLen) - start_ptr;
+
+               /* Will this MBCS become too big to fit in memory once converted to wchar? */
+               if (pg_database_encoding_max_length() > 1 && 
curSubStringByteLen > ((MAX_STRING_BYTES)/ sizeof(pg_wchar)))
+               {
+                       int curSubStringCharLen = 0;
+                       /* We need multi-pass. So find the sub-string boundary for the current pass */
+                       find_memory_limited_substring(start_ptr, string + 
stringByteLen - start_ptr,
+                               (MAX_STRING_BYTES) / sizeof(pg_wchar), 
&curSubStringByteLen, &curSubStringCharLen);
+               }
+
+               Insist(start_ptr + curSubStringByteLen <= string + 
stringByteLen);
+
+               endOfString = ((start_ptr + curSubStringByteLen) == (string + 
stringByteLen));
+
+               char *nextStartPtr = text_to_array_impl(start_ptr, 
curSubStringByteLen, delimiter, delimiterByteLen, delimiterCharLen, &astate, 
endOfString);
+
+               Insist(nextStartPtr >= start_ptr);
+
+               if (!endOfString && nextStartPtr == start_ptr) /* the pass made no progress on a non-final chunk */
+               {
+                       elog(ERROR, "String size not supported.");
+               }
+
+               start_ptr = nextStartPtr;
+       }
+
+       return astate;
+}
+
+
+/*
+ * text_to_array
+ * parse input string and return a text array of elements
+ * based on provided field separator; returns NULL for an empty
+ * input string and a one-element array for an empty separator
+ */
+Datum
+text_to_array(PG_FUNCTION_ARGS)
+{
+       Datum stringDatum = PG_GETARG_DATUM(0);
+       char *string = NULL;
+       void *toFreeString = NULL;
+       int stringByteLen = 0;
+
+       Datum delimiterDatum = PG_GETARG_DATUM(1);
+       char *delimiter = NULL;
+       void *toFreeDelimiter = NULL;
+       int delimiterByteLen = 0;
+
+       int stringCharLen = 0;
+       int     delimiterCharLen = 0;
+
+       varattrib_untoast_ptr_len(stringDatum, &string, &stringByteLen, 
&toFreeString);
+       varattrib_untoast_ptr_len(delimiterDatum, &delimiter, 
&delimiterByteLen, &toFreeDelimiter);
+
+       if(pg_database_encoding_max_length() == 1)
+       {
+               stringCharLen = stringByteLen;
+               delimiterCharLen = delimiterByteLen;
+       }
+       else
+       {
+               stringCharLen = pg_mbstrlen_with_len(string, stringByteLen);
+               delimiterCharLen = pg_mbstrlen_with_len(delimiter, 
delimiterByteLen);
+       }
+
+       /* return NULL for empty input string */
+       if (stringCharLen < 1)
+       {
+               if(toFreeString)
+               {
+                       pfree(toFreeString);
+               }
+
+               if(toFreeDelimiter)
+               {
+                       pfree(toFreeDelimiter);
+               }
+
+               PG_RETURN_NULL();
+       }
+
+       /*
+        * empty field separator return one element, 1D, array using the input
+        * string
+        */
+       if (delimiterCharLen < 1)
+       {
+               if(toFreeString)
+               {
+                       pfree(toFreeString);
+               }
+
+               if(toFreeDelimiter)
+               {
+                       pfree(toFreeDelimiter);
+               }
+
+               PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, 
stringDatum, 1));
+       }
+
+       ArrayBuildState *astate = text_to_array_multi_pass(string, 
stringByteLen, delimiter, delimiterByteLen, delimiterCharLen);
+
+       if(toFreeString)
+       {
+               pfree(toFreeString);
+       }
+       if(toFreeDelimiter)
+       {
+               pfree(toFreeDelimiter);
+       }
+
        PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
 }

incubator-hawq git commit: HAWQ-445. Support large strings (up to a GB) in text_to_array()

Reply via email to