DRILL-476: Create binary_string() function to convert encoded binary string to sequence of bytes
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/c7cb7bae Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/c7cb7bae Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/c7cb7bae Branch: refs/heads/master Commit: c7cb7baec66230e117d10494549586f987277510 Parents: 025538c Author: Aditya Kishore <[email protected]> Authored: Wed Apr 2 00:26:46 2014 -0700 Committer: Jacques Nadeau <[email protected]> Committed: Sat Apr 19 21:07:28 2014 -0700 ---------------------------------------------------------------------- .../exec/expr/fn/impl/StringFunctionUtil.java | 118 +++++++++++++++---- .../exec/expr/fn/impl/StringFunctions.java | 21 ++++ 2 files changed, 116 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/c7cb7bae/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java index c0dad84..0096a13 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java @@ -20,66 +20,138 @@ package org.apache.drill.exec.expr.fn.impl; import io.netty.buffer.ByteBuf; public class StringFunctionUtil { - + /* Decode the input bytebuf using UTF-8, and return the number of characters */ public static int getUTF8CharLength(ByteBuf buffer, int start, int end) { int charCount = 0; - + for (int id = start; id < end; id++) { byte currentByte = buffer.getByte(id); - + if (currentByte < 0x128 || // 1-byte char. First byte is 0xxxxxxx. (currentByte & 0xE0) == 0xC0 || // 2-byte char. First byte is 110xxxxx - (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx + (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx (currentByte & 0xF8) == 0xF0) { //4-byte char. First byte is 11110xxx charCount ++; //Advance the counter, since we find one char. - } + } } return charCount; } - /* Decode the input bytebuf using UTF-8. Search in the range of [start, end], find + /* Decode the input bytebuf using UTF-8. Search in the range of [start, end], find * the position of the first byte of next char after we see "charLength" chars. - * + * */ public static int getUTF8CharPosition(ByteBuf buffer, int start, int end, int charLength) { int charCount = 0; - + if (start >=end) - return -1; //wrong input here. - + return -1; //wrong input here. + for (int id = start; id < end; id++) { - + byte currentByte = buffer.getByte(id); if (currentByte < 0x128 || // 1-byte char. First byte is 0xxxxxxx. (currentByte & 0xE0) == 0xC0 || // 2-byte char. First byte is 110xxxxx - (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx + (currentByte & 0xF0) == 0xE0 || // 3-byte char. First byte is 1110xxxx (currentByte & 0xF8) == 0xF0) { //4-byte char. First byte is 11110xxx - charCount ++; //Advance the counter, since we find one char. + charCount ++; //Advance the counter, since we find one char. if (charCount == charLength + 1) { return id; } - } + } } - return end; + return end; } - - public static int stringLeftMatchUTF8(ByteBuf str, int strStart, int strEnd, - ByteBuf substr, int subStart, int subEnd) { + + public static int stringLeftMatchUTF8(ByteBuf str, int strStart, int strEnd, + ByteBuf substr, int subStart, int subEnd) { for (int i = strStart; i <= strEnd - (subEnd - subStart); i++) { int j = subStart; - for (; j< subEnd; j++) { + for (; j< subEnd; j++) { if (str.getByte(i + j - subStart) != substr.getByte(j)) - break; + break; } - - if (j == subEnd && j!= subStart) { // found a matched substr (non-empty) in str. + + if (j == subEnd && j!= subStart) { // found a matched substr (non-empty) in str. return i; // found a match. } } - + return -1; } + + /** + * Return a printable representation of a byte buffer, escaping the non-printable + * bytes as '\\xNN' where NN is the hexadecimal representation of such bytes. + * + * This function does not modify the {@code readerIndex} and {@code writerIndex} + * of the byte buffer. + */ + public static String toBinaryString(ByteBuf buf, int strStart, int strEnd) { + StringBuilder result = new StringBuilder(); + for (int i = strStart; i < strEnd ; ++i) { + int ch = buf.getByte(i) & 0xFF; + if ( (ch >= '0' && ch <= '9') + || (ch >= 'A' && ch <= 'Z') + || (ch >= 'a' && ch <= 'z') + || " `~!@#$%^&*()-_=+[]{}|;:'\",.<>/?".indexOf(ch) >= 0 ) { + result.append((char)ch); + } else { + result.append(String.format("\\x%02X", ch)); + } + } + return result.toString(); + } + + /** + * In-place parsing of a hex encoded binary string. + * + * This function does not modify the {@code readerIndex} and {@code writerIndex} + * of the byte buffer. + * + * @return Index in the byte buffer just after the last written byte. + */ + public static int parseBinaryString(ByteBuf str, int strStart, int strEnd) { + int length = (strEnd - strStart); + int dstEnd = strStart; + for (int i = strStart; i < length ; i++) { + byte b = str.getByte(i); + if (b == '\\' + && length > i+3 + && (str.getByte(i+1) == 'x' || str.getByte(i+1) == 'X')) { + // ok, take next 2 hex digits. + byte hd1 = str.getByte(i+2); + byte hd2 = str.getByte(i+3); + if (isHexDigit(hd1) && isHexDigit(hd2)) { // [a-fA-F0-9] + // turn hex ASCII digit -> number + b = (byte) ((toBinaryFromHex(hd1) << 4) + toBinaryFromHex(hd2)); + i += 3; // skip 3 + } + } + str.setByte(dstEnd++, b); + } + return dstEnd; + } + + /** + * Takes a ASCII digit in the range A-F0-9 and returns + * the corresponding integer/ordinal value. + * @param ch The hex digit. + * @return The converted hex value as a byte. + */ + private static byte toBinaryFromHex(byte ch) { + if ( ch >= 'A' && ch <= 'F' ) + return (byte) ((byte)10 + (byte) (ch - 'A')); + else if ( ch >= 'a' && ch <= 'f' ) + return (byte) ((byte)10 + (byte) (ch - 'a')); + return (byte) (ch - '0'); + } + + private static boolean isHexDigit(byte c) { + return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || (c >= '0' && c <= '9'); + } + } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/c7cb7bae/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 5e85012..aca5933 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -29,6 +29,7 @@ import org.apache.drill.exec.expr.annotations.Param; import org.apache.drill.exec.expr.annotations.Workspace; import org.apache.drill.exec.expr.holders.BigIntHolder; import org.apache.drill.exec.expr.holders.BitHolder; +import org.apache.drill.exec.expr.holders.VarBinaryHolder; import org.apache.drill.exec.expr.holders.VarCharHolder; import org.apache.drill.exec.record.RecordBatch; @@ -779,4 +780,24 @@ public class StringFunctions{ } } + + // Converts a hex encoded string into a varbinary type. + // "\xca\xfe\xba\xbe" => (byte[]) {(byte)0xca, (byte)0xfe, (byte)0xba, (byte)0xbe} + @FunctionTemplate(name = "binary_string", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class BinaryString implements DrillSimpleFunc { + + @Param VarCharHolder in; + @Output VarBinaryHolder out; + + public void setup(RecordBatch incoming) { } + + public void eval() { + out.buffer = in.buffer; + out.start = in.start; + out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.parseBinaryString(in.buffer, in.start, in.end); + out.buffer.readerIndex(out.start); + out.buffer.writerIndex(out.end); + } + } + }
