DRILL-476: Create binary_string() function to convert encoded binary string to 
sequence of bytes


Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/c7cb7bae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/c7cb7bae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/c7cb7bae

Branch: refs/heads/master
Commit: c7cb7baec66230e117d10494549586f987277510
Parents: 025538c
Author: Aditya Kishore <[email protected]>
Authored: Wed Apr 2 00:26:46 2014 -0700
Committer: Jacques Nadeau <[email protected]>
Committed: Sat Apr 19 21:07:28 2014 -0700

----------------------------------------------------------------------
 .../exec/expr/fn/impl/StringFunctionUtil.java   | 118 +++++++++++++++----
 .../exec/expr/fn/impl/StringFunctions.java      |  21 ++++
 2 files changed, 116 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/c7cb7bae/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
index c0dad84..0096a13 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
@@ -20,66 +20,138 @@ package org.apache.drill.exec.expr.fn.impl;
 import io.netty.buffer.ByteBuf;
 
 public class StringFunctionUtil {
-  
+
   /* Decode the input bytebuf using UTF-8, and return the number of characters
    */
   public static int getUTF8CharLength(ByteBuf buffer, int start, int end) {
     int charCount = 0;
-    
+
     for (int id = start; id < end; id++) {
       byte  currentByte = buffer.getByte(id);
-      
+
       if (currentByte < 0x128  ||           // 1-byte char. First byte is 
0xxxxxxx.
           (currentByte & 0xE0) == 0xC0 ||   // 2-byte char. First byte is 
110xxxxx
-          (currentByte & 0xF0) == 0xE0 ||   // 3-byte char. First byte is 
1110xxxx 
+          (currentByte & 0xF0) == 0xE0 ||   // 3-byte char. First byte is 
1110xxxx
           (currentByte & 0xF8) == 0xF0) {   //4-byte char. First byte is 
11110xxx
         charCount ++;  //Advance the counter, since we find one char.
-      }            
+      }
     }
     return charCount;
   }
 
-  /* Decode the input bytebuf using UTF-8. Search in the range of [start, 
end], find 
+  /* Decode the input bytebuf using UTF-8. Search in the range of [start, 
end], find
    * the position of the first byte of next char after we see "charLength" 
chars.
-   *    
+   *
    */
   public static int getUTF8CharPosition(ByteBuf buffer, int start, int end, 
int charLength) {
     int charCount = 0;
-    
+
     if (start >=end)
-      return -1;  //wrong input here. 
-    
+      return -1;  //wrong input here.
+
     for (int id = start; id < end; id++) {
-      
+
       byte  currentByte = buffer.getByte(id);
 
       if (currentByte < 0x128  ||           // 1-byte char. First byte is 
0xxxxxxx.
           (currentByte & 0xE0) == 0xC0 ||   // 2-byte char. First byte is 
110xxxxx
-          (currentByte & 0xF0) == 0xE0 ||   // 3-byte char. First byte is 
1110xxxx 
+          (currentByte & 0xF0) == 0xE0 ||   // 3-byte char. First byte is 
1110xxxx
           (currentByte & 0xF8) == 0xF0) {   //4-byte char. First byte is 
11110xxx
-        charCount ++;  //Advance the counter, since we find one char. 
+        charCount ++;  //Advance the counter, since we find one char.
         if (charCount == charLength + 1) {
           return id;
         }
-      }            
+      }
     }
-    return end;  
+    return end;
   }
-  
-  public static int stringLeftMatchUTF8(ByteBuf str, int strStart, int strEnd, 
-                                    ByteBuf substr, int subStart, int subEnd) 
{   
+
+  public static int stringLeftMatchUTF8(ByteBuf str, int strStart, int strEnd,
+                                    ByteBuf substr, int subStart, int subEnd) {
     for (int i = strStart; i <= strEnd - (subEnd - subStart); i++) {
       int j = subStart;
-      for (; j< subEnd; j++) {          
+      for (; j< subEnd; j++) {
         if (str.getByte(i + j - subStart) != substr.getByte(j))
-          break;          
+          break;
       }
-      
-      if (j == subEnd  && j!= subStart) {  // found a matched substr 
(non-empty) in str. 
+
+      if (j == subEnd  && j!= subStart) {  // found a matched substr 
(non-empty) in str.
         return i;   // found a match.
       }
     }
-    
+
     return -1;
   }
+
+  /**
+   * Renders a range of a byte buffer as printable text. ASCII letters, digits,
+   * the space character and a fixed set of punctuation marks are emitted as-is;
+   * every other byte (the backslash included) is written as {@code \xNN}, where
+   * NN is the byte's value as two upper-case hexadecimal digits.
+   *
+   * This function does not modify the {@code readerIndex} and {@code writerIndex}
+   * of the byte buffer.
+   *
+   * @param buf      buffer to read from.
+   * @param strStart index of the first byte to render (inclusive).
+   * @param strEnd   index just past the last byte to render (exclusive).
+   * @return the printable representation of the selected bytes.
+   */
+  public static String toBinaryString(ByteBuf buf, int strStart, int strEnd) {
+    final String punctuation = " `~!@#$%^&*()-_=+[]{}|;:'\",.<>/?";
+    StringBuilder printable = new StringBuilder();
+    int pos = strStart;
+    while (pos < strEnd) {
+      // Mask to get the unsigned value of the byte.
+      int unsigned = buf.getByte(pos) & 0xFF;
+      boolean isDigit = unsigned >= '0' && unsigned <= '9';
+      boolean isUpper = unsigned >= 'A' && unsigned <= 'Z';
+      boolean isLower = unsigned >= 'a' && unsigned <= 'z';
+      if (isDigit || isUpper || isLower || punctuation.indexOf(unsigned) >= 0) {
+        printable.append((char) unsigned);
+      } else {
+        printable.append(String.format("\\x%02X", unsigned));
+      }
+      pos++;
+    }
+    return printable.toString();
+  }
+
+  /**
+   * In-place parsing of a hex encoded binary string.
+   *
+   * Scans the bytes in {@code [strStart, strEnd)} and collapses every escape
+   * sequence of the form {@code \xNN} (or {@code \XNN}), where NN are two hex
+   * digits, into the single byte it denotes. All other bytes, including
+   * malformed or truncated escapes, are copied through unchanged. The decoded
+   * bytes are written back into the same buffer starting at {@code strStart}.
+   *
+   * This function does not modify the {@code readerIndex} and {@code writerIndex}
+   * of the byte buffer.
+   *
+   * @param str      buffer holding the encoded string; overwritten in place.
+   * @param strStart index of the first byte to parse (inclusive).
+   * @param strEnd   index just past the last byte to parse (exclusive).
+   * @return Index in the byte buffer just after the last written byte.
+   */
+  public static int parseBinaryString(ByteBuf str, int strStart, int strEnd) {
+    int dstEnd = strStart;
+    // strStart and strEnd are absolute buffer indexes, so both the loop bound
+    // and the look-ahead check must compare against strEnd, not against the
+    // length of the range (which is only correct when strStart is 0).
+    for (int i = strStart; i < strEnd; i++) {
+      byte b = str.getByte(i);
+      if (b == '\\'
+          && strEnd > i + 3
+          && (str.getByte(i + 1) == 'x' || str.getByte(i + 1) == 'X')) {
+        // ok, take next 2 hex digits.
+        byte hd1 = str.getByte(i + 2);
+        byte hd2 = str.getByte(i + 3);
+        if (isHexDigit(hd1) && isHexDigit(hd2)) { // [a-fA-F0-9]
+          // turn hex ASCII digit -> number
+          b = (byte) ((toBinaryFromHex(hd1) << 4) + toBinaryFromHex(hd2));
+          i += 3; // skip the 'x' and the two hex digits
+        }
+      }
+      // dstEnd never overtakes i, so unread input is never overwritten.
+      str.setByte(dstEnd++, b);
+    }
+    return dstEnd;
+  }
+
+  /**
+   * Maps a single ASCII hex digit ({@code 0-9}, {@code A-F} or {@code a-f}) to
+   * its numeric value.
+   *
+   * No validation is performed: any byte outside the two letter ranges is
+   * treated as a decimal digit and simply has {@code '0'} subtracted from it.
+   *
+   * @param ch  The hex digit.
+   * @return The converted hex value as a byte.
+   */
+  private static byte toBinaryFromHex(byte ch) {
+    boolean upperLetter = 'A' <= ch && ch <= 'F';
+    boolean lowerLetter = 'a' <= ch && ch <= 'f';
+    int value;
+    if (upperLetter) {
+      value = ch - 'A' + 10;
+    } else if (lowerLetter) {
+      value = ch - 'a' + 10;
+    } else {
+      value = ch - '0';
+    }
+    return (byte) value;
+  }
+
+  /** Tells whether the given byte is an ASCII hex digit, i.e. one of [0-9A-Fa-f]. */
+  private static boolean isHexDigit(byte c) {
+    if (c >= 'a' && c <= 'f') {
+      return true;
+    }
+    if (c >= 'A' && c <= 'F') {
+      return true;
+    }
+    return c >= '0' && c <= '9';
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/c7cb7bae/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
index 5e85012..aca5933 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
@@ -29,6 +29,7 @@ import org.apache.drill.exec.expr.annotations.Param;
 import org.apache.drill.exec.expr.annotations.Workspace;
 import org.apache.drill.exec.expr.holders.BigIntHolder;
 import org.apache.drill.exec.expr.holders.BitHolder;
+import org.apache.drill.exec.expr.holders.VarBinaryHolder;
 import org.apache.drill.exec.expr.holders.VarCharHolder;
 import org.apache.drill.exec.record.RecordBatch;
 
@@ -779,4 +780,24 @@ public class StringFunctions{
     } 
     
   }
+
+  // Converts a hex encoded string into a varbinary type.
+  // "\xca\xfe\xba\xbe" => (byte[]) {(byte)0xca, (byte)0xfe, (byte)0xba, (byte)0xbe}
+  // The conversion is done in place: the output holder points at the input's
+  // buffer, whose contents are overwritten by parseBinaryString.
+  @FunctionTemplate(name = "binary_string", scope = FunctionScope.SIMPLE, 
nulls = NullHandling.NULL_IF_NULL)
+  public static class BinaryString implements DrillSimpleFunc {
+
+    @Param  VarCharHolder in;
+    @Output VarBinaryHolder out;
+
+    public void setup(RecordBatch incoming) { }
+
+    public void eval() {
+      // The output shares the input's buffer and start offset.
+      out.buffer = in.buffer;
+      out.start = in.start;
+      // parseBinaryString returns the index just after the last byte it wrote.
+      out.end = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.parseBinaryString(in.buffer,
 in.start, in.end);
+      // Reposition the buffer's reader/writer indexes around the decoded bytes.
+      out.buffer.readerIndex(out.start);
+      out.buffer.writerIndex(out.end);
+    }
+  }
+
 }

Reply via email to