DRILL-630: Substr() function
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/e4101807 Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/e4101807 Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/e4101807 Branch: refs/heads/master Commit: e4101807728c06b62bb38e2871374215113cb2ef Parents: 9b827b5 Author: Yash Sharma <[email protected]> Authored: Thu Jun 5 07:54:24 2014 -0700 Committer: Jacques Nadeau <[email protected]> Committed: Mon Jun 9 17:09:38 2014 -0700 ---------------------------------------------------------------------- .../exec/expr/fn/impl/StringFunctions.java | 415 ++++++++++--------- .../exec/physical/impl/TestStringFunctions.java | 4 +- .../resources/functions/string/testSubstr.json | 7 +- 3 files changed, 233 insertions(+), 193 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/e4101807/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 8d792fa..51a7dbb 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -36,86 +36,86 @@ import org.apache.drill.exec.record.RecordBatch; public class StringFunctions{ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StringFunctions.class); - + private StringFunctions(){} - + /* - * String Function Implementation. + * String Function Implementation. 
*/ - + @FunctionTemplate(name = "like", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Like implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Param(constant=true) VarCharHolder pattern; @Output BitHolder out; @Workspace java.util.regex.Pattern regPattern; - + public void setup(RecordBatch incoming){ regPattern = java.util.regex.Pattern.compile(org.apache.drill.exec.expr.fn.impl.RegexpUtil.sqlToRegexLike(pattern.toString())); } - + public void eval(){ out.value = regPattern.matcher(input.toString()).matches()? 1:0; } } @FunctionTemplate(names = {"similar", "similar to"}, scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) - public static class Similar implements DrillSimpleFunc{ + public static class Similar implements DrillSimpleFunc{ @Param VarCharHolder input; @Param(constant=true) VarCharHolder pattern; @Output BitHolder out; @Workspace java.util.regex.Pattern regPattern; public void setup(RecordBatch incoming){ - regPattern = java.util.regex.Pattern.compile(org.apache.drill.exec.expr.fn.impl.RegexpUtil.sqlToRegexSimilar(pattern.toString())); + regPattern = java.util.regex.Pattern.compile(org.apache.drill.exec.expr.fn.impl.RegexpUtil.sqlToRegexSimilar(pattern.toString())); } - + public void eval(){ out.value = regPattern.matcher(input.toString()).matches()? 1:0; } } - + /* - * Replace all substring that match the regular expression with replacement. + * Replace all substring that match the regular expression with replacement. 
*/ @FunctionTemplate(name = "regexp_replace", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class RegexpReplace implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Param(constant=true) VarCharHolder pattern; @Param VarCharHolder replacement; @Workspace ByteBuf buffer; - @Workspace java.util.regex.Pattern regPattern; + @Workspace java.util.regex.Pattern regPattern; @Output VarCharHolder out; public void setup(RecordBatch incoming){ - buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); + buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); regPattern = java.util.regex.Pattern.compile(pattern.toString()); } - + public void eval(){ out.buffer = buffer; out.start = 0; - + byte [] bytea = regPattern.matcher(input.toString()).replaceAll(replacement.toString()).getBytes(java.nio.charset.Charset.forName("UTF-8")); out.buffer.setBytes(out.start, bytea); out.end = bytea.length; } } - + @FunctionTemplate(names = {"char_length", "character_length", "length"}, scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class CharLength implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output BigIntHolder out; public void setup(RecordBatch incoming){} - + public void eval(){ - out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(input.buffer, input.start, input.end); - } + out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(input.buffer, input.start, input.end); + } } @FunctionTemplate(name = "lengthUtf8", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) @@ -133,109 +133,109 @@ public class StringFunctions{ @FunctionTemplate(name = "octet_length", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class OctetLength implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output BigIntHolder out; public void setup(RecordBatch incoming){} - - public void eval(){ + + public 
void eval(){ out.value = input.end - input.start; } } @FunctionTemplate(name = "bit_length", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class BitLength implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output BigIntHolder out; public void setup(RecordBatch incoming){} - - public void eval(){ + + public void eval(){ out.value = (input.end - input.start) * 8; } } - + /* * Location of specified substring. - * + * * Difference from PostgreSQL : - * exp \ System PostgreSQL Drill + * exp \ System PostgreSQL Drill * position('', 'abc') 1 0 * position('', '') 1 0 */ @FunctionTemplate(name = "position", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Position implements DrillSimpleFunc{ - + @Param VarCharHolder substr; @Param VarCharHolder str; @Output BigIntHolder out; public void setup(RecordBatch incoming){} - + public void eval(){ //Do string match. - int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(str.buffer, str.start, str.end, + int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(str.buffer, str.start, str.end, substr.buffer, substr.start, substr.end); if (pos < 0) { out.value = 0; //indicate not found a matched substr. } else { - //Count the # of characters. (one char could have 1-4 bytes) - out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(str.buffer, str.start, pos) + 1; + //Count the # of characters. (one char could have 1-4 bytes) + out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(str.buffer, str.start, pos) + 1; } } - + } - - // same as function "position(substr, str) ", except the reverse order of argument. + + // same as function "position(substr, str) ", except the reverse order of argument. 
@FunctionTemplate(name = "strpos", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Strpos implements DrillSimpleFunc{ - + @Param VarCharHolder str; @Param VarCharHolder substr; @Output BigIntHolder out; public void setup(RecordBatch incoming){} - - public void eval(){ + + public void eval(){ //Do string match. - int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(str.buffer, str.start, str.end, + int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(str.buffer, str.start, str.end, substr.buffer, substr.start, substr.end); if (pos < 0) { out.value = 0; //indicate not found a matched substr. } else { - //Count the # of characters. (one char could have 1-4 bytes) - out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(str.buffer, str.start, pos) + 1; + //Count the # of characters. (one char could have 1-4 bytes) + out.value = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(str.buffer, str.start, pos) + 1; } - } - + } + } - + /* * Convert string to lower case. 
*/ @FunctionTemplate(name = "lower", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class LowerCase implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - + public void eval(){ out.buffer = buffer; out.start = 0; out.end = input.end - input.start; - + for (int id = input.start; id < input.end; id++) { byte currentByte = input.buffer.getByte(id); - + // 'A - Z' : 0x41 - 0x5A // 'a - z' : 0x61 - 0x7A if (currentByte >= 0x41 && currentByte <= 0x5A) { @@ -251,23 +251,23 @@ public class StringFunctions{ */ @FunctionTemplate(name = "upper", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class UpperCase implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - + public void eval() { out.buffer = buffer; out.start = 0; out.end = input.end - input.start; - + for (int id = input.start; id < input.end; id++) { byte currentByte = input.buffer.getByte(id); - + // 'A - Z' : 0x41 - 0x5A // 'a - z' : 0x61 - 0x7A if (currentByte >= 0x61 && currentByte <= 0x7A) { @@ -278,50 +278,85 @@ public class StringFunctions{ } } - // Follow Postgre. - // -- Valid "offset": [1, string_length], - // -- Valid "length": [1, up to string_length - offset + 1], if length > string_length - offset +1, get the substr up to the string_lengt. + + // Follow Postgre. + // -- Valid "offset": [1, string_length], + // -- Valid "length": [1, up to string_length - offset + 1], if length > string_length - offset +1, get the substr up to the string_lengt. 
@FunctionTemplate(names = {"substring", "substr"}, scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Substring implements DrillSimpleFunc{ @Param VarCharHolder string; @Param BigIntHolder offset; @Param BigIntHolder length; - + @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming) { - + } - + public void eval() { out.buffer = string.buffer; - // if length is NOT positive, or offset is NOT positive, or input string is empty, return empty string. + // if length is NOT positive, or offset is NOT positive, or input string is empty, return empty string. if (length.value <= 0 || offset.value <=0 || string.end <= string.start) { - out.start = out.end = 0; - } else { - //Do 1st scan to counter # of character in string. + out.start = out.end = 0; + } else { + //Do 1st scan to counter # of character in string. int charCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(string.buffer, string.start, string.end); - - int fromCharIdx = (int) offset.value; //the start position of char (inclusive) - - if (fromCharIdx > charCount ) { // invalid length, return empty string. - out.start = out.end = 0; + + int fromCharIdx = (int) offset.value; //the start position of char (inclusive) + + if (fromCharIdx > charCount ) { // invalid length, return empty string. + out.start = out.end = 0; } else { out.start = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, string.start, string.end, fromCharIdx-1); - + // Bounded length by charCount - fromCharIdx + 1. 
substring("abc", 1, 5) --> "abc" - int charLen = Math.min((int)length.value, charCount - fromCharIdx + 1); - + int charLen = Math.min((int)length.value, charCount - fromCharIdx + 1); + out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, out.start, string.end, charLen); } - } + } + } + + } + + @FunctionTemplate(names = {"substring", "substr"}, scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class SubstringOffset implements DrillSimpleFunc{ + + @Param VarCharHolder string; + @Param BigIntHolder offset; + + @Output VarCharHolder out; + @Workspace ByteBuf buffer; + + public void setup(RecordBatch incoming) { + } + + public void eval() { + out.buffer = string.buffer; + // if length is NOT positive, or offset is NOT positive, or input string is empty, return empty string. + if (offset.value <=0 || string.end <= string.start) { + out.start = out.end = 0; + } else { + //Do 1st scan to counter # of character in string. + int charCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(string.buffer, string.start, string.end); + + int fromCharIdx = (int) offset.value; //the start position of char (inclusive) + + if (fromCharIdx > charCount ) { // invalid length, return empty string. + out.start = out.end = 0; + } else { + out.start = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, string.start, string.end, fromCharIdx-1); + out.end = string.end; + } + } } - + } - // Return first length characters in the string. When length is negative, return all but last |length| characters. + // Return first length characters in the string. When length is negative, return all but last |length| characters. // If length > total charcounts, return the whole string. // If length = 0, return empty // If length < 0, and |length| > total charcounts, return empty. 
@@ -330,30 +365,30 @@ public class StringFunctions{ @Param VarCharHolder string; @Param BigIntHolder length; - + @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming){ } - + public void eval() { out.buffer = string.buffer; // if length is 0, or input string is empty, return empty string. if (length.value == 0 || string.end <= string.start) { out.start = out.end = 0; - } else { - //Do 1st scan to counter # of character in string. + } else { + //Do 1st scan to counter # of character in string. int charCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(string.buffer, string.start, string.end); int charLen = 0; if (length.value > 0) { charLen = Math.min((int)length.value, charCount); //left('abc', 5) -> 'abc' } else if (length.value < 0) { - charLen = Math.max(0, charCount + (int)length.value) ; // left('abc', -5) ==> '' + charLen = Math.max(0, charCount + (int)length.value) ; // left('abc', -5) ==> '' } - - out.start = string.start; //Starting from the left of input string. + + out.start = string.start; //Starting from the left of input string. out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, out.start, string.end, charLen); } // end of lenth.value != 0 } @@ -365,22 +400,22 @@ public class StringFunctions{ @Param VarCharHolder string; @Param BigIntHolder length; - + @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming){ } - + public void eval() { out.buffer = string.buffer; // invalid length. if (length.value == 0 || string.end <= string.start) { out.start = out.end = 0; } else { - //Do 1st scan to counter # of character in string. + //Do 1st scan to counter # of character in string. 
int charCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(string.buffer, string.start, string.end); - + int fromCharIdx; //the start position of char (inclusive) int charLen; // the end position of char (inclusive) if (length.value > 0) { @@ -390,15 +425,15 @@ public class StringFunctions{ fromCharIdx = Math.abs((int) length.value) + 1; charLen = charCount - fromCharIdx +1; } - + // invalid length : right('abc', -5) -> '' - if (charLen <=0) { + if (charLen <=0) { out.start = out.end = 0; - } else { + } else { //Do 2nd scan of string. Get bytes corresponding chars in range. - out.start = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, string.start, string.end, fromCharIdx-1); - out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, out.start, string.end, charLen); - } + out.start = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, string.start, string.end, fromCharIdx-1); + out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(string.buffer, out.start, string.end, charLen); + } } } } @@ -406,26 +441,26 @@ public class StringFunctions{ @FunctionTemplate(name = "initcap", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class InitCap implements DrillSimpleFunc{ - + @Param VarCharHolder input; @Output VarCharHolder out; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - + public void eval() { out.buffer = buffer; out.start = 0; out.end = input.end - input.start; - + // Assumes Alpha as [A-Za-z0-9] - // white space is treated as everything else. + // white space is treated as everything else. 
boolean capNext = true; for (int id = input.start; id < input.end; id++) { byte currentByte = input.buffer.getByte(id); - + // 'A - Z' : 0x41 - 0x5A // 'a - z' : 0x61 - 0x7A // '0-9' : 0x30 - 0x39 @@ -450,102 +485,102 @@ public class StringFunctions{ capNext = true; } } - + out.buffer.setByte(id - input.start, currentByte) ; } //end of for_loop - + } - + } - + //Replace all occurrences in 'text' of substring 'from' with substring 'to' @FunctionTemplate(name = "replace", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Replace implements DrillSimpleFunc{ - + @Param VarCharHolder text; @Param VarCharHolder from; @Param VarCharHolder to; - @Workspace ByteBuf buffer; + @Workspace ByteBuf buffer; @Output VarCharHolder out; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - + public void eval(){ out.buffer = buffer; out.start = out.end = 0; int fromL = from.end - from.start; int textL = text.end - text.start; - + if (fromL > 0 && fromL <= textL) { //If "from" is not empty and it's length is no longer than text's length //then, we may find a match, and do replace. - int i = text.start; + int i = text.start; for (; i<=text.end - fromL; ) { int j = from.start; for (; j<from.end; j++) { if (text.buffer.getByte(i + j - from.start) != from.buffer.getByte(j)) break; } - + if (j == from.end ) { - //find a true match ("from" is not empty), copy entire "to" string to out buffer + //find a true match ("from" is not empty), copy entire "to" string to out buffer for (int k = to.start ; k< to.end; k++) { out.buffer.setByte(out.end++, to.buffer.getByte(k)); } - + //advance i by the length of "from" i += from.end - from.start; } else { - //no match. copy byte i in text, advance i by 1. + //no match. copy byte i in text, advance i by 1. out.buffer.setByte(out.end++, text.buffer.getByte(i++)); } } - + //Copy the tail part of text (length < fromL). 
for (; i< text.end; i++) { out.buffer.setByte(out.end++, text.buffer.getByte(i)); } } else { - //If "from" is empty or its length is larger than text's length, - //then, we just set "out" as "text". + //If "from" is empty or its length is larger than text's length, + //then, we just set "out" as "text". out.buffer = text.buffer; out.start = text.start; out.end = text.end; } - + } // end of eval() - + } /* - * Fill up the string to length 'length' by prepending the characters 'fill' in the beginning of 'text'. + * Fill up the string to length 'length' by prepending the characters 'fill' in the beginning of 'text'. * If the string is already longer than length, then it is truncated (on the right). */ @FunctionTemplate(name = "lpad", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Lpad implements DrillSimpleFunc{ - + @Param VarCharHolder text; @Param BigIntHolder length; @Param VarCharHolder fill; - @Workspace ByteBuf buffer; - + @Workspace ByteBuf buffer; + @Output VarCharHolder out; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - - public void eval() { + + public void eval() { byte currentByte = 0; int id = 0; //get the char length of text. int textCharCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(text.buffer, text.start, text.end); - + //get the char length of fill. int fillCharCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(fill.buffer, fill.start, fill.end); - + if (length.value <= 0) { //case 1: target length is <=0, then return an empty string. 
out.buffer = buffer; @@ -565,58 +600,58 @@ public class StringFunctions{ int count = 0; out.buffer = buffer; out.start = out.end = 0; - + while (count < length.value - textCharCount) { for (id = fill.start; id < fill.end; id++) { if (count == length.value - textCharCount) break; - - currentByte = fill.buffer.getByte(id); + + currentByte = fill.buffer.getByte(id); if (currentByte < 0x128 || // 1-byte char. First byte is 0xxxxxxx. (currentByte & 0xE0) == 0xC0 || // 2-byte char. First byte is 110xxxxx - (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx + (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx (currentByte & 0xF8) == 0xF0) { //4-byte char. First byte is 11110xxx count ++; //Advance the counter, since we find one char. } out.buffer.setByte(out.end++, currentByte); - } + } } // end of while - + //copy "text" into "out" - for (id = text.start; id < text.end; id++) + for (id = text.start; id < text.end; id++) out.buffer.setByte(out.end++, text.buffer.getByte(id)); } } // end of eval - + } - + /** * Fill up the string to length "length" by appending the characters 'fill' at the end of 'text' * If the string is already longer than length then it is truncated. */ @FunctionTemplate(name = "rpad", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Rpad implements DrillSimpleFunc{ - + @Param VarCharHolder text; @Param BigIntHolder length; @Param VarCharHolder fill; - @Workspace ByteBuf buffer; - + @Workspace ByteBuf buffer; + @Output VarCharHolder out; public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - - public void eval() { + + public void eval() { byte currentByte = 0; int id = 0; //get the char length of text. int textCharCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(text.buffer, text.start, text.end); - + //get the char length of fill. 
int fillCharCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(fill.buffer, fill.start, fill.end); - + if (length.value <= 0) { //case 1: target length is <=0, then return an empty string. out.buffer = buffer; @@ -631,36 +666,36 @@ public class StringFunctions{ out.buffer = text.buffer; out.start = text.start; out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(text.buffer, text.start, text.end, (int)length.value); - } else if (length.value > textCharCount) { + } else if (length.value > textCharCount) { //case 4: copy "text" into "out", then copy "fill" on the right. out.buffer = buffer; out.start = out.end = 0; - for (id = text.start; id < text.end; id++) + for (id = text.start; id < text.end; id++) out.buffer.setByte(out.end++, text.buffer.getByte(id)); //copy "fill" on right. Total # of char to copy : length.value - textCharCount int count = 0; - + while (count < length.value - textCharCount) { for (id = fill.start; id < fill.end; id++) { if (count == length.value - textCharCount) break; - - currentByte = fill.buffer.getByte(id); + + currentByte = fill.buffer.getByte(id); if (currentByte < 0x128 || // 1-byte char. First byte is 0xxxxxxx. (currentByte & 0xE0) == 0xC0 || // 2-byte char. First byte is 110xxxxx - (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx + (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx (currentByte & 0xF8) == 0xF0) { //4-byte char. First byte is 11110xxx count ++; //Advance the counter, since we find one char. 
} out.buffer.setByte(out.end++, currentByte); - } + } } // end of while - + } } // end of eval - + } /** @@ -668,18 +703,18 @@ public class StringFunctions{ */ @FunctionTemplate(name = "ltrim", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Ltrim implements DrillSimpleFunc{ - + @Param VarCharHolder text; @Param VarCharHolder from; - + @Output VarCharHolder out; public void setup(RecordBatch incoming){ } - - public void eval() { + + public void eval() { out.buffer = text.buffer; - out.start = out.end = text.end; + out.start = out.end = text.end; byte currentByte = 0; int id = 0; @@ -687,26 +722,26 @@ public class StringFunctions{ //Scan from left of "text", stop until find a char not in "from" for (id = text.start; id < text.end; ) { currentByte = text.buffer.getByte(id); - + bytePerChar = 0; - + if (currentByte < 0x128) // 1-byte char. First byte is 0xxxxxxx. bytePerChar = 1; else if ((currentByte & 0xE0) == 0xC0 ) // 2-byte char. First byte is 110xxxxx bytePerChar = 2; - else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx + else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx bytePerChar = 3; else if ((currentByte & 0xF8) == 0xF0) //4-byte char. First byte is 11110xxx bytePerChar = 4; - + //Scan to check if "from" contains the character of "byterPerChar" bytes. int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, text.buffer, id, id + bytePerChar); if (pos < 0) { // Found the 1st char not in "from", stop - out.start = id; + out.start = id; break; } - id += bytePerChar; //Advance to next character. + id += bytePerChar; //Advance to next character. 
} } // end of eval @@ -717,18 +752,18 @@ public class StringFunctions{ */ @FunctionTemplate(name = "rtrim", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) public static class Rtrim implements DrillSimpleFunc{ - + @Param VarCharHolder text; @Param VarCharHolder from; - + @Output VarCharHolder out; public void setup(RecordBatch incoming){ } - - public void eval() { + + public void eval() { out.buffer = text.buffer; - out.start = out.end = text.start; + out.start = out.end = text.start; byte currentByte = 0; int id = 0; @@ -736,7 +771,7 @@ public class StringFunctions{ //Scan from right of "text", stop until find a char not in "from" for (id = text.end-1; id>= text.start; ) { currentByte = text.buffer.getByte(id); - + bytePerChar = 0; //In UTF-8 encoding, the continuation byte for a multi-byte char is 10xxxxxx. //Continue back-off to prior byte if it's continuation byte @@ -747,51 +782,51 @@ public class StringFunctions{ bytePerChar = 1; else if ((currentByte & 0xE0) == 0xC0 ) // 2-byte char. First byte is 110xxxxx bytePerChar = 2; - else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx + else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx bytePerChar = 3; else if ((currentByte & 0xF8) == 0xF0) //4-byte char. First byte is 11110xxx bytePerChar = 4; - - //Scan to check if "from" contains the character of "byterPerChar" bytes. The lead byte starts at id. + + //Scan to check if "from" contains the character of "byterPerChar" bytes. The lead byte starts at id. int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, text.buffer, id, id + bytePerChar); if (pos < 0) { // Found the 1st char not in "from", stop - out.end = id+ bytePerChar; + out.end = id+ bytePerChar; break; } - - id --; // back-off to prior character. + + id --; // back-off to prior character. } } // end of eval } //Concatenate the text representations of the arguments. 
NULL arguments are ignored. - //TODO: NullHanding.INTERNAL for DrillSimpleFunc requires change in code generation. + //TODO: NullHanding.INTERNAL for DrillSimpleFunc requires change in code generation. @FunctionTemplate(name = "concat", scope = FunctionScope.SIMPLE, nulls = NullHandling.INTERNAL) public static class Concat implements DrillSimpleFunc{ - + @Param VarCharHolder left; @Param VarCharHolder right; @Output VarCharHolder out; - @Workspace ByteBuf buffer; - - + @Workspace ByteBuf buffer; + + public void setup(RecordBatch incoming){ buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [8000]); } - + public void eval(){ out.buffer = buffer; out.start = out.end = 0; - + int id = 0; - for (id = left.start; id < left.end; id++) + for (id = left.start; id < left.end; id++) out.buffer.setByte(out.end++, left.buffer.getByte(id)); - + for (id = right.start; id < right.end; id++) out.buffer.setByte(out.end++, right.buffer.getByte(id)); - } - + } + } @FunctionTemplate(name = "concat", scope = FunctionScope.SIMPLE, nulls = NullHandling.INTERNAL) http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/e4101807/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java index 51aa633..af741a5 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java @@ -93,7 +93,7 @@ public class TestStringFunctions extends ExecTest { for (int i = 0; i<res.length; i++) { assertEquals(String.format("column %s does not match", i), expectedResults[i], res[i]); - } + } } if(context.getFailureCause() != null){ @@ -190,7 +190,7 @@ public class 
TestStringFunctions extends ExecTest { @Test public void testSubstr(@Injectable final DrillbitContext bitContext, @Injectable UserServer.UserClientConnection connection) throws Throwable{ - Object [] expected = new Object[] {"abc", "bcd", "bcdef", "bcdef", "", "", "", "", "भारत", "वर्ष", "वर्ष"}; + Object [] expected = new Object[] {"abc", "bcd", "bcdef", "bcdef", "", "", "", "", "भारत", "वर्ष", "वर्ष", "cdef", "", "", "", "ड्रिल"}; runTest(bitContext, connection, expected, "functions/string/testSubstr.json"); } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/e4101807/exec/java-exec/src/test/resources/functions/string/testSubstr.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/functions/string/testSubstr.json b/exec/java-exec/src/test/resources/functions/string/testSubstr.json index 94467ae..02c7a2d 100644 --- a/exec/java-exec/src/test/resources/functions/string/testSubstr.json +++ b/exec/java-exec/src/test/resources/functions/string/testSubstr.json @@ -33,7 +33,12 @@ { ref: "col8", expr: "substring('abcdef', 10, 2)"}, { ref: "col9", expr: "substring('भारतवर्ष', 1, 4)"}, { ref: "col10", expr: "substring('भारतवर्ष', 5, 4)"}, - { ref: "col11", expr: "substring('भारतवर्ष', 5, 5)"} + { ref: "col11", expr: "substring('भारतवर्ष', 5, 5)"}, + { ref: "col12", expr: "substring('abcdef', 3)"}, + { ref: "col13", expr: "substring('abcdef', -2)"}, + { ref: "col14", expr: "substring('abcdef', 0)"}, + { ref: "col15", expr: "substring('abcdef', 10)"}, + { ref: "col16", expr: "substring('अपाचे ड्रिल', 7)"} ] }, {
