Repository: incubator-hawq Updated Branches: refs/heads/master 87d13b673 -> e29e13345
HAWQ-445. Support large strings (up to a GB) in text_to_array() Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e29e1334 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e29e1334 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e29e1334 Branch: refs/heads/master Commit: e29e13345f14b70d4193eb16e4b904737871c486 Parents: 87d13b6 Author: ivan <iw...@pivotal.io> Authored: Thu Mar 10 09:22:49 2016 +0800 Committer: ivan <iw...@pivotal.io> Committed: Thu Mar 10 09:22:49 2016 +0800 ---------------------------------------------------------------------- src/backend/utils/adt/test/Makefile | 42 ++++ src/backend/utils/adt/test/varlena_test.c | 251 +++++++++++++++++++ src/backend/utils/adt/varlena.c | 334 +++++++++++++++++++------ 3 files changed, 550 insertions(+), 77 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/Makefile ---------------------------------------------------------------------- diff --git a/src/backend/utils/adt/test/Makefile b/src/backend/utils/adt/test/Makefile new file mode 100644 index 0000000..a0cd950 --- /dev/null +++ b/src/backend/utils/adt/test/Makefile @@ -0,0 +1,42 @@ +top_builddir=../../../../.. +subdir=src/backend/utils/adt + +TARGETS=varlena + +# Objects from backend, which don't need to be mocked but need to be linked. 
+common_REAL_OBJS=\ + $(top_srcdir)/src/backend/access/hash/hashfunc.o \ + $(top_srcdir)/src/backend/bootstrap/bootparse.o \ + $(top_srcdir)/src/backend/lib/stringinfo.o \ + $(top_srcdir)/src/backend/nodes/bitmapset.o \ + $(top_srcdir)/src/backend/nodes/equalfuncs.o \ + $(top_srcdir)/src/backend/nodes/list.o \ + $(top_srcdir)/src/backend/parser/gram.o \ + $(top_srcdir)/src/backend/regex/regcomp.o \ + $(top_srcdir)/src/backend/regex/regerror.o \ + $(top_srcdir)/src/backend/regex/regexec.o \ + $(top_srcdir)/src/backend/regex/regfree.o \ + $(top_srcdir)/src/backend/storage/page/itemptr.o \ + $(top_srcdir)/src/backend/utils/adt/datum.o \ + $(top_srcdir)/src/backend/utils/adt/like.o \ + $(top_srcdir)/src/backend/utils/hash/dynahash.o \ + $(top_srcdir)/src/backend/utils/hash/hashfn.o \ + $(top_srcdir)/src/backend/utils/misc/guc.o \ + $(top_srcdir)/src/backend/utils/init/globals.o \ + $(top_srcdir)/src/backend/utils/mmgr/mcxt.o \ + $(top_srcdir)/src/backend/utils/mmgr/aset.o \ + $(top_srcdir)/src/backend/utils/mmgr/memprot.o \ + $(top_srcdir)/src/port/exec.o \ + $(top_srcdir)/src/port/path.o \ + $(top_srcdir)/src/port/pgsleep.o \ + $(top_srcdir)/src/port/pgstrcasecmp.o \ + $(top_srcdir)/src/port/qsort.o \ + $(top_srcdir)/src/port/strlcpy.o \ + $(top_srcdir)/src/port/thread.o \ + $(top_srcdir)/src/timezone/localtime.o \ + $(top_srcdir)/src/timezone/pgtz.o + +varlena_REAL_OBJS=$(common_REAL_OBJS) + +include ../../../../Makefile.mock + http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/test/varlena_test.c ---------------------------------------------------------------------- diff --git a/src/backend/utils/adt/test/varlena_test.c b/src/backend/utils/adt/test/varlena_test.c new file mode 100644 index 0000000..46035fa --- /dev/null +++ b/src/backend/utils/adt/test/varlena_test.c @@ -0,0 +1,251 @@ +#include <stdarg.h> +#include <stddef.h> +#include <setjmp.h> +#include "cmockery.h" + +#include "c.h" +#include "postgres.h" +#include 
"nodes/nodes.h" +#include "../varlena.c" + +#define MEMORY_LIMIT 8 /* 8 bytes memory limit */ + +#ifdef USE_ASSERT_CHECKING +void +_ExceptionalCondition( ) +{ + PG_RE_THROW(); +} +#endif + +/* + * Checks if the small strings that fit in memory fails assertion. + */ +void +test__find_memory_limited_substring__small_string(void **state) +{ + int subStringByteLength = -1; + int subStringCharLength = -1; + int totalByteLength = MEMORY_LIMIT; + char *strStart = 0xabcdefab; + +#ifdef USE_ASSERT_CHECKING + expect_any(ExceptionalCondition,conditionName); + expect_any(ExceptionalCondition,errorType); + expect_any(ExceptionalCondition,fileName); + expect_any(ExceptionalCondition,lineNumber); + will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL); + + /* Test if within memory-limit strings cause assertion failure */ + PG_TRY(); + { + find_memory_limited_substring(strStart, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + assert_true(false); + } + PG_CATCH(); + { + } + PG_END_TRY(); +#endif +} + +/* + * Checks if null input string causes assertion failure. + */ +void +test__find_memory_limited_substring__null_string(void **state) +{ + int subStringByteLength = -1; + int subStringCharLength = -1; + int totalByteLength = MEMORY_LIMIT + 1; + char *strStart = NULL; + +#ifdef USE_ASSERT_CHECKING + expect_any(ExceptionalCondition,conditionName); + expect_any(ExceptionalCondition,errorType); + expect_any(ExceptionalCondition,fileName); + expect_any(ExceptionalCondition,lineNumber); + will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL); + + /* Test if null strings cause assertion failure */ + PG_TRY(); + { + find_memory_limited_substring(strStart, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + assert_true(false); + } + PG_CATCH(); + { + } + PG_END_TRY(); +#endif +} + +/* + * Checks if the returned string segments are within memory limit for ascii characters. 
+ */ +void +test__find_memory_limited_substring__ascii_chars_within_memory_limit(void **state) +{ + int subStringByteLength = -1; + int subStringCharLength = -1; + int cumulativeLengthConsidered = 0; + + char *strStart = 0xabcdefab; + + int totalByteLength = 25; + + while (cumulativeLengthConsidered < totalByteLength - MEMORY_LIMIT) + { + will_return(pg_database_encoding_max_length, 1); + find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + cumulativeLengthConsidered += subStringByteLength; + assert_true(subStringByteLength == MEMORY_LIMIT); + assert_true(subStringByteLength == subStringCharLength); + } + +#ifdef USE_ASSERT_CHECKING + expect_any(ExceptionalCondition,conditionName); + expect_any(ExceptionalCondition,errorType); + expect_any(ExceptionalCondition,fileName); + expect_any(ExceptionalCondition,lineNumber); + will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL); + + /* Test if the left-over string that fits in memory cause assertion failure */ + PG_TRY(); + { + find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + assert_true(false); + } + PG_CATCH(); + { + } + PG_END_TRY(); + + expect_any(ExceptionalCondition,conditionName); + expect_any(ExceptionalCondition,errorType); + expect_any(ExceptionalCondition,fileName); + expect_any(ExceptionalCondition,lineNumber); + will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL); + + /* Test if null strings cause assertion failure */ + PG_TRY(); + { + find_memory_limited_substring(NULL, totalByteLength, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + } + PG_CATCH(); + { + return; + } + PG_END_TRY(); + assert_true(false); +#endif +} + + +/* + * Checks if the returned string segments are within memory limit for multi-bytes chars. 
+ */ +void +test__find_memory_limited_substring__mb_chars_within_memory_limit(void **state) +{ + int subStringByteLength = -1; + int subStringCharLength = -1; + int cumulativeLengthConsidered = 0; + + /* Lengths of the multi-byte characters at different positions */ + int stringByteLengths[] = {3, 3, 3 /* seg1 */, 2, 2, 1, 2 /* seg2 */, 2, 1, 1, 1, 2, /* seg3 */ 5, 4 /* seg4 */, 4}; + + /* Total length in terms of number of characters */ + int stringCharLength = sizeof(stringByteLengths) / sizeof(int); + + /* Total byte lengths of all the characters */ + int totalByteLength = 0; + for (int charIndex = 0; charIndex < stringCharLength; charIndex++) + { + totalByteLength += stringByteLengths[charIndex]; + } + + int segmentByteLength = 0; /* Number of bytes in current segment */ + int segmentCharLength = 0; /* Number of characters in current segment */ + + /* Length of the char that spilled over from one partition to another */ + int carryoverLength = 0; + + /* Fictitious multi-byte string to segment */ + char *strStart = 0xabcdefab; + + for (int charIndex = 0; charIndex < stringCharLength; charIndex++) + { + if (carryoverLength > 0) + { + expect_any(pg_mblen, mbstr); + will_return(pg_mblen, carryoverLength); + carryoverLength = 0; + } + + expect_any(pg_mblen, mbstr); + will_return(pg_mblen, stringByteLengths[charIndex]); + segmentByteLength += stringByteLengths[charIndex]; + segmentCharLength++; + + if (segmentByteLength > MEMORY_LIMIT) + { + + will_return(pg_database_encoding_max_length, 6); + find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + assert_true(subStringByteLength == (segmentByteLength - stringByteLengths[charIndex])); + assert_true(subStringCharLength == (segmentCharLength - 1)); + assert_true(subStringByteLength <= MEMORY_LIMIT); + assert_true(subStringCharLength <= MEMORY_LIMIT); + + cumulativeLengthConsidered += subStringByteLength; + + segmentByteLength = 
stringByteLengths[charIndex]; + segmentCharLength = 1; + carryoverLength = stringByteLengths[charIndex]; + } + } + + /* Now purge any unused pg_mblen call because of the suffix that does not exceed MEMORY_LIMIT */ + for (int partitionCharIndex = 0; partitionCharIndex < segmentCharLength; partitionCharIndex++) + { + pg_mblen("a"); + } + +#ifdef USE_ASSERT_CHECKING + expect_any(ExceptionalCondition,conditionName); + expect_any(ExceptionalCondition,errorType); + expect_any(ExceptionalCondition,fileName); + expect_any(ExceptionalCondition,lineNumber); + will_be_called_with_sideeffect(ExceptionalCondition,&_ExceptionalCondition,NULL); + + /* Test if the left-over string that fits in memory cause assertion failure */ + PG_TRY(); + { + find_memory_limited_substring(strStart, totalByteLength - cumulativeLengthConsidered, MEMORY_LIMIT, &subStringByteLength, &subStringCharLength); + } + PG_CATCH(); + { + return; + } + PG_END_TRY(); + + assert_true(false); +#endif +} + +int +main(int argc, char* argv[]) +{ + cmockery_parse_arguments(argc, argv); + + const UnitTest tests[] = { + unit_test(test__find_memory_limited_substring__small_string), + unit_test(test__find_memory_limited_substring__null_string), + unit_test(test__find_memory_limited_substring__ascii_chars_within_memory_limit), + unit_test(test__find_memory_limited_substring__mb_chars_within_memory_limit) + }; + return run_tests(tests); +} + + http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e29e1334/src/backend/utils/adt/varlena.c ---------------------------------------------------------------------- diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 68aa810..21c4afb 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -28,6 +28,7 @@ #include "utils/lsyscache.h" #include "utils/pg_locale.h" #include "utils/string_wrapper.h" +#include "utils/memutils.h" typedef struct varlena unknown; @@ -55,6 +56,13 @@ typedef struct #define 
PG_STR_GET_TEXT(str_) \ DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(str_))) +/* + * Max considered sub-string size is set to (MaxAllocSize - 4MB). + * The 4MB is saved aside for memory allocation overhead such + * as allocation set headers. + */ +#define MAX_STRING_BYTES ((Size) (MaxAllocSize - 0x400000)) + static int text_position_ptr_len(char* p1, int len1, char *p2, int len2); static void text_position_setup_ptr_len(char* p1, int len1, char* p2, int len2, TextPositionState *state); @@ -617,6 +625,65 @@ charlen_to_bytelen(const char *p, int n) } } +/* find_memory_limited_substring() + * Computes the sub-string length in number of characters and number + * of bytes where the sub-string consumes up to "memoryLimit" amount of memory. + * + * Parameters: + * strStart: starting pointer in the string + * byteLen: number of bytes in the string, starting from strStart + * memoryLimit: max string size in terms of bytes + * + * Out parameters: + * subStringByteLen: length of chosen sub-string in bytes + * subStringCharLen: length of chosen sub-string in character count + * + * It is caller's responsibility that there actually are byteLen bytes + * starting from strStart; the string need not be null-terminated. + */ +static void +find_memory_limited_substring(const char *strStart, int byteLen, int memoryLimit, int *subStringByteLen, int *subStringCharLen) +{ + AssertArg(byteLen > memoryLimit); + AssertArg(NULL != strStart); + AssertArg(NULL != subStringCharLen); + + if (pg_database_encoding_max_length() == 1) + { + /* Optimization for single-byte encodings */ + *subStringByteLen = byteLen < memoryLimit ?
byteLen : memoryLimit; + *subStringCharLen = *subStringByteLen; + + return; + } + else + { + const char *strCurPointer = strStart;; + + int consumedBytes = 0; + int consumedChars = 0; + + while (consumedBytes <= byteLen) + { + int curCharBytes = pg_mblen(strCurPointer); + strCurPointer += curCharBytes; + consumedChars++; + consumedBytes += curCharBytes; + + if (consumedBytes > memoryLimit) + { + *subStringByteLen = consumedBytes - curCharBytes; + *subStringCharLen = consumedChars - 1; + + Insist((*subStringByteLen > 0) && (*subStringCharLen > 0)); + + return; + } + } + } +} + + /* * text_substr() * Return a substring starting at the specified position. @@ -2559,24 +2626,36 @@ split_text(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result_text); } + /* - * text_to_array - * parse input string - * return text array of elements - * based on provided field separator + * text_to_array_impl + * Carries out the actual tokenization and array conversion of an input string. + * + * Parameters: + * string: Where to start in the input string + * stringByteLen: Length of current string + * delimiter: Which delimiter to use + * delimiterByteLen: Length of delimiter in bytes + * delimiterCharLen: Length of delimiter in chars + * arrayState: State of the output array where we accumulate results + * endOfString: Do we expect any more chunk of the main input string? + * + * Returns the pointer where the last match was found. Successively the + * caller can splice more data starting from this address to find further + * array elements. 
*/ -Datum -text_to_array(PG_FUNCTION_ARGS) +static char* text_to_array_impl(char *string, int stringByteLen, char *delimiter, + int delimiterByteLen, int delimiterCharLen, ArrayBuildState **arrayState, bool endOfString) { - Datum d0 = PG_GETARG_DATUM(0); - char *p0; void *tofree0; int len0; + int start_posn = 1; + int fldnum = 1; + int end_posn = 0; + int chunk_len = 0; + text *result_text; - Datum d1 = PG_GETARG_DATUM(1); - char *p1; void *tofree1; int len1; + char* cur_ptr = string; - int inputstring_len; - int fldsep_len; - TextPositionState state = + TextPositionState state = { 0, /* use_wchar */ NULL, /* str1 */ @@ -2587,79 +2666,32 @@ text_to_array(PG_FUNCTION_ARGS) 0, /* len2 */ }; - int fldnum; - int start_posn; - int end_posn; - int chunk_len; - char *start_ptr; - text *result_text; - ArrayBuildState *astate = NULL; - - varattrib_untoast_ptr_len(d0, &p0, &len0, &tofree0); - varattrib_untoast_ptr_len(d1, &p1, &len1, &tofree1); - - if(pg_database_encoding_max_length() == 1) - { - inputstring_len = len0; - fldsep_len = len1; - } - else - { - inputstring_len = pg_mbstrlen_with_len(p0, len0); - fldsep_len = pg_mbstrlen_with_len(p1, len1); - } - - /* return NULL for empty input string */ - if (inputstring_len < 1) - { - if(tofree0) - pfree(tofree0); - if(tofree1) - pfree(tofree1); - - PG_RETURN_NULL(); - } - - /* - * empty field separator return one element, 1D, array using the input - * string - */ - if (fldsep_len < 1) - { - if(tofree0) - pfree(tofree0); - if(tofree1) - pfree(tofree1); - - PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, d0, 1)); - } - - text_position_setup_ptr_len(p0, len0, p1, len1, &state); - - start_posn = 1; - /* start_ptr points to the start_posn'th character of inputstring */ - start_ptr = p0; + text_position_setup_ptr_len(string, stringByteLen, delimiter, delimiterByteLen, &state); for (fldnum = 1;; fldnum++) /* field number is 1 based */ { end_posn = text_position_next(start_posn, &state); - if (end_posn == 0) + if 
(end_posn == 0 && !endOfString) + { + break; + } + else if (end_posn == 0) { /* fetch last field */ - chunk_len = (p0 + len0) - start_ptr; + chunk_len = (string + stringByteLen) - cur_ptr; } else { /* fetch non-last field */ - chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn); + chunk_len = charlen_to_bytelen(cur_ptr, end_posn - start_posn); } /* must build a temp text datum to pass to accumArrayResult */ - result_text = cstring_to_text_with_len(start_ptr, chunk_len); + result_text = cstring_to_text_with_len(cur_ptr, chunk_len); /* stash away this field */ - astate = accumArrayResult(astate, + *arrayState = accumArrayResult(*arrayState, PointerGetDatum(result_text), false, TEXTOID, @@ -2668,20 +2700,168 @@ text_to_array(PG_FUNCTION_ARGS) pfree(result_text); if (end_posn == 0) + { + /* Process next sub-string if any */ break; + } start_posn = end_posn; - start_ptr += chunk_len; - start_posn += fldsep_len; - start_ptr += charlen_to_bytelen(start_ptr, fldsep_len); + cur_ptr += chunk_len; + start_posn += delimiterCharLen; + cur_ptr += charlen_to_bytelen(cur_ptr, delimiterCharLen); } text_position_cleanup(&state); - if(tofree0) - pfree(tofree0); - if(tofree1) - pfree(tofree1); + return cur_ptr; +} + + +/* + * text_to_array_multi_pass + * Carries out the actual tokenization and array conversion of input string + * in multiple passes, where each pass is restricted to GPDB memory allocation limit. + * + * Parameters: + * string: The start of the input string + * stringByteLen: Length of current string + * delimiter: Which delimiter to use + * delimiterByteLen: Length of delimiter in bytes + * delimiterCharLen: Length of delimiter in chars + * endOfString: Do we expect any more chunk of the main input string? + * + * Returns the ArrayBuildState containing all the array elements. 
+ */ +static ArrayBuildState* text_to_array_multi_pass(char *string, int stringByteLen, char *delimiter, int delimiterByteLen, int delimiterCharLen) +{ + ArrayBuildState *astate = NULL; + + /* Start with full string. If it is too big then we chunk it later */ + char *start_ptr = string; + int curSubStringByteLen = stringByteLen; + + bool endOfString = false; + + /* More bytes to consider? */ + while (!endOfString) + { + /* + * Give the rest of the string to the current pass; may be chunked if + * the rest still doesn't fit in the memory + */ + curSubStringByteLen = (string + stringByteLen) - start_ptr; + + /* Will this MBCS become too big to fit in memory once converted to wchar? */ + if (pg_database_encoding_max_length() > 1 && curSubStringByteLen > ((MAX_STRING_BYTES)/ sizeof(pg_wchar))) + { + int curSubStringCharLen = 0; + /* We need multi-pass. So find the sub-string boundary for the current pass */ + find_memory_limited_substring(start_ptr, string + stringByteLen - start_ptr, + (MAX_STRING_BYTES) / sizeof(pg_wchar), &curSubStringByteLen, &curSubStringCharLen); + } + + Insist(start_ptr + curSubStringByteLen <= string + stringByteLen); + + endOfString = ((start_ptr + curSubStringByteLen) == (string + stringByteLen)); + + char *nextStartPtr = text_to_array_impl(start_ptr, curSubStringByteLen, delimiter, delimiterByteLen, delimiterCharLen, &astate, endOfString); + + Insist(nextStartPtr >= start_ptr); + + if (!endOfString && nextStartPtr == start_ptr) + { + elog(ERROR, "String size not supported."); + } + + start_ptr = nextStartPtr; + } + + return astate; +} + + +/* + * text_to_array + * parse input string + * return text array of elements + * based on provided field separator + */ +Datum +text_to_array(PG_FUNCTION_ARGS) +{ + Datum stringDatum = PG_GETARG_DATUM(0); + char *string = NULL; + void *toFreeString = NULL; + int stringByteLen = 0; + + Datum delimiterDatum = PG_GETARG_DATUM(1); + char *delimiter = NULL; + void *toFreeDelimiter = NULL; + int
delimiterByteLen = 0; + + int stringCharLen = 0; + int delimiterCharLen = 0; + + varattrib_untoast_ptr_len(stringDatum, &string, &stringByteLen, &toFreeString); + varattrib_untoast_ptr_len(delimiterDatum, &delimiter, &delimiterByteLen, &toFreeDelimiter); + + if(pg_database_encoding_max_length() == 1) + { + stringCharLen = stringByteLen; + delimiterCharLen = delimiterByteLen; + } + else + { + stringCharLen = pg_mbstrlen_with_len(string, stringByteLen); + delimiterCharLen = pg_mbstrlen_with_len(delimiter, delimiterByteLen); + } + + /* return NULL for empty input string */ + if (stringCharLen < 1) + { + if(toFreeString) + { + pfree(toFreeString); + } + + if(toFreeDelimiter) + { + pfree(toFreeDelimiter); + } + + PG_RETURN_NULL(); + } + + /* + * empty field separator return one element, 1D, array using the input + * string + */ + if (delimiterCharLen < 1) + { + if(toFreeString) + { + pfree(toFreeString); + } + + if(toFreeDelimiter) + { + pfree(toFreeDelimiter); + } + + PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, stringDatum, 1)); + } + + ArrayBuildState *astate = text_to_array_multi_pass(string, stringByteLen, delimiter, delimiterByteLen, delimiterCharLen); + + if(toFreeString) + { + pfree(toFreeString); + } + if(toFreeDelimiter) + { + pfree(toFreeDelimiter); + } + PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); }