Repository: drill Updated Branches: refs/heads/master c2d9959e0 -> 2e2b54af5
DRILL-4573: Fixed issue with string functions when input contains non-ASCII characters. Close apache/drill#512 Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/2e2b54af Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/2e2b54af Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/2e2b54af Branch: refs/heads/master Commit: 2e2b54af5379f7046e84390e70f3862cddd93195 Parents: c2d9959 Author: jean-claude cote <jcc...@gmail.com> Authored: Sat May 14 18:02:57 2016 -0400 Committer: Jinfeng Ni <j...@apache.org> Committed: Wed Jun 15 17:46:56 2016 -0700 ---------------------------------------------------------------------- .../exec/expr/fn/impl/CharSequenceWrapper.java | 204 +++++++++++++++++-- .../exec/expr/fn/impl/StringFunctions.java | 28 ++- .../exec/expr/fn/impl/TestStringFunctions.java | 26 +++ 3 files changed, 234 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java index 6c475ed..d085494 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java @@ -6,9 +6,7 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * * http://www.apache.org/licenses/LICENSE-2.0 - * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,33 +15,203 @@ */ package org.apache.drill.exec.expr.fn.impl; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.util.regex.Matcher; + import io.netty.buffer.DrillBuf; +/** + * A CharSequence is a readable sequence of char values. This interface provides + * uniform, read-only access to many different kinds of char sequences. A char + * value represents a character in the Basic Multilingual Plane (BMP) or a + * surrogate. Refer to Unicode Character Representation for details.<br> + * Specifically this implementation of the CharSequence adapts a Drill + * {@link DrillBuf} to the CharSequence. The implementation is meant to be + * re-used that is allocated once and then passed DrillBuf to adapt. This can be + * handy to exploit API that consume CharSequence avoiding the need to create + * string objects. + * + */ public class CharSequenceWrapper implements CharSequence { - private int start; - private int end; - private DrillBuf buffer; + // The adapted drill buffer (in the case of US-ASCII) + private DrillBuf buffer; + // The converted bytes in the case of non ASCII + private CharBuffer charBuffer; + // initial char buffer capacity + private static final int INITIAL_CHAR_BUF = 1024; + // The decoder to use in the case of non ASCII + private CharsetDecoder decoder; + + // The start offset into the drill buffer + private int start; + // The end offset into the drill buffer + private int end; + // Indicates that the current byte buffer contains only ascii chars + private boolean usAscii; + + public CharSequenceWrapper() { + } + + public CharSequenceWrapper(int start, int end, DrillBuf buffer) { + setBuffer(start, end, buffer); + } - @Override - public int length() { - return end - start; + @Override + public int length() { + return end - start; + } + + @Override + public char charAt(int index) { + if (usAscii) { + // Each byte is a char, the index is relative to the start of the original buffer + return (char) (buffer.getByte(start + index) & 0x00FF); + } else { + // The char buffer is a copy so the index directly corresponds + return charBuffer.charAt(index); } + } + + /** + * When using the Java regex {@link Matcher} the subSequence is only called + * when capturing groups. Drill does not currently use capture groups in the + * UDF so this method is not required.<br> + * It could be implemented by creating a new CharSequenceWrapper however + * this would imply newly allocated objects which is what this wrapper tries + * to avoid. + * + */ + @Override + public CharSequence subSequence(int start, int end) { + throw new UnsupportedOperationException(); + } + + /** + * Set the DrillBuf to adapt to a CharSequence. This method can be used to + * replace any previous DrillBuf thus avoiding recreating the + * CharSequenceWrapper and thus re-using the CharSequenceWrapper object. + * + * @param start + * @param end + * @param buffer + */ + public void setBuffer(int start, int end, DrillBuf buffer) { + // Test if buffer is an ASCII string or not. + usAscii = isAscii(start, end, buffer); - @Override - public char charAt(int index) { - return (char) buffer.getByte(start + index); + if (usAscii) { + // each byte equals one char + this.start = start; + this.end = end; + this.buffer = buffer; + } else { + initCharBuffer(); + // Wrap with java byte buffer + ByteBuffer byteBuf = buffer.nioBuffer(start, end - start); + while (charBuffer.capacity() < Integer.MAX_VALUE) { + byteBuf.mark(); + if (decodeUT8(byteBuf)) { + break; + } + // Failed to convert because the char buffer was not large enough + growCharBuffer(); + // Make sure to reset the byte buffer we need to reprocess it + byteBuf.reset(); + } + this.start = 0; + this.end = charBuffer.position(); + // reset the char buffer so the index are relative to the start of the buffer + charBuffer.rewind(); } + } - @Override - public CharSequence subSequence(int start, int end) { - throw new UnsupportedOperationException("Not implemented."); + /** + * Test if the buffer contains only ASCII bytes. + * @param start + * @param end + * @param buffer + * @return + */ + private boolean isAscii(int start, int end, DrillBuf buffer) { + for (int i = start; i < end; i++) { + byte bb = buffer.getByte(i); + if (bb < 0) { + //System.out.printf("Not a ASCII byte 0x%02X\n", bb); + return false; + } } + return true; + } - public void setBuffer(int start, int end, DrillBuf buffer) { - this.start = start; - this.end = end; - this.buffer = buffer; + /** + * Initialize the charbuffer and decoder if they are not yet initialized. + */ + private void initCharBuffer() { + if (charBuffer == null) { + charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF); } + if (decoder == null) { + decoder = Charset.forName("UTF-8").newDecoder(); + } + } + + /** + * Decode the buffer using the CharsetDecoder. + * @param byteBuf + * @return false if failed because the charbuffer was not big enough + * @throws RuntimeException if it fails for encoding errors + */ + private boolean decodeUT8(ByteBuffer byteBuf) { + // We give it all of the input data in call. + boolean endOfInput = true; + decoder.reset(); + charBuffer.rewind(); + // Convert utf-8 bytes to sequence of chars + CoderResult result = decoder.decode(byteBuf, charBuffer, endOfInput); + if (result.isOverflow()) { + // Not enough space in the charBuffer. + return false; + } else if (result.isError()) { + // Any other error + try { + result.throwException(); + } catch (CharacterCodingException e) { + throw new RuntimeException(e); + } + } + return true; + } + + /** + * Grow the charbuffer making sure not to overflow size integer. Note + * this grows in the same manner as the ArrayList that is it adds 50% + * to the current size. + */ + private void growCharBuffer() { + // overflow-conscious code + int oldCapacity = charBuffer.capacity(); + //System.out.println("old capacity " + oldCapacity); + int newCapacity = oldCapacity + (oldCapacity >> 1); + if (newCapacity < 0) { + newCapacity = Integer.MAX_VALUE; + } + //System.out.println("new capacity " + newCapacity); + charBuffer = CharBuffer.allocate(newCapacity); + } + + /** + * The regexp_replace function is implemented in a way to avoid the call to toString() + * not to uselessly create a string object. + */ + @Override + public String toString() { + throw new UnsupportedOperationException(); + } } http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 524d254..50ff435 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -243,14 +243,30 @@ public class StringFunctions{ public void eval() { out.start = 0; charSequenceWrapper.setBuffer(input.start, input.end, input.buffer); + final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer); // Reusing same charSequenceWrapper, no need to pass it in. - // This saves one method call since reset(CharSequence) calls reset() matcher.reset(); - final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer); - final byte [] bytea = matcher.replaceAll(r).getBytes(java.nio.charset.Charset.forName("UTF-8")); - out.buffer = buffer = buffer.reallocIfNeeded(bytea.length); - out.buffer.setBytes(out.start, bytea); - out.end = bytea.length; + // Implementation of Matcher.replaceAll() in-lined to avoid creating String object + // in cases where we don't actually replace anything. + boolean result = matcher.find(); + if (result) { + StringBuffer sb = new StringBuffer(); + do { + matcher.appendReplacement(sb, r); + result = matcher.find(); + } while (result); + matcher.appendTail(sb); + final byte [] bytea = sb.toString().getBytes(java.nio.charset.Charset.forName("UTF-8")); + out.buffer = buffer = buffer.reallocIfNeeded(bytea.length); + out.buffer.setBytes(out.start, bytea); + out.end = bytea.length; + } + else { + // There is no matches, copy the input bytes into the output buffer + out.buffer = buffer = buffer.reallocIfNeeded(input.end - input.start); + out.buffer.setBytes(0, input.buffer, input.start, input.end - input.start); + out.end = input.end - input.start; + } } } http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java index 612408b..daedd1c 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java @@ -117,6 +117,32 @@ public class TestStringFunctions extends BaseTestQuery { } @Test + public void testRegexpMatchesNonAscii() throws Exception { + testBuilder() + .sqlQuery("select regexp_matches(a, 'München') res1, regexp_matches(b, 'AMünchenA') res2 " + + "from (values('München', 'MünchenA'), ('MünchenA', 'AMünchenA')) as t(a,b)") + .unOrdered() + .baselineColumns("res1", "res2") + .baselineValues(true, false) + .baselineValues(false, true) + .build() + .run(); + } + + @Test + public void testRegexpReplace() throws Exception { + testBuilder() + .sqlQuery("select regexp_replace(a, 'a|c', 'x') res1, regexp_replace(b, 'd', 'zzz') res2 " + + "from (values('abc', 'bcd'), ('bcd', 'abc')) as t(a,b)") + .unOrdered() + .baselineColumns("res1", "res2") + .baselineValues("xbx", "bczzz") + .baselineValues("bxd", "abc") + .build() + .run(); + } + + @Test public void testILike() throws Exception { testBuilder() .sqlQuery("select n_name from cp.`tpch/nation.parquet` where ilike(n_name, '%united%') = true")