Github user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/1001#discussion_r145577808 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java --- @@ -17,36 +17,133 @@ */ package org.apache.drill.exec.expr.fn.impl; -public class SqlPatternContainsMatcher implements SqlPatternMatcher { +public final class SqlPatternContainsMatcher implements SqlPatternMatcher { final String patternString; CharSequence charSequenceWrapper; final int patternLength; public SqlPatternContainsMatcher(String patternString, CharSequence charSequenceWrapper) { - this.patternString = patternString; + this.patternString = patternString; this.charSequenceWrapper = charSequenceWrapper; - patternLength = patternString.length(); + patternLength = patternString.length(); } @Override - public int match() { - final int txtLength = charSequenceWrapper.length(); - int patternIndex = 0; - int txtIndex = 0; + public final int match() { + // The idea is to write loops with simple condition checks to allow the Java Hotspot vectorize + // the generate code. + if (patternLength == 1) { + return match_1(); + } else if (patternLength == 2) { + return match_2(); + } else if (patternLength == 3) { + return match_3(); + } else { + return match_N(); + } + } + + private final int match_1() { + final CharSequence sequenceWrapper = charSequenceWrapper; + final int lengthToProcess = sequenceWrapper.length(); + final char first_patt_char = patternString.charAt(0); + + // simplePattern string has meta characters i.e % and _ and escape characters removed. + // so, we can just directly compare. 
+ for (int idx = 0; idx < lengthToProcess; idx++) { + char input_char = sequenceWrapper.charAt(idx); + + if (first_patt_char != input_char) { + continue; + } + return 1; + } + return 0; + } + + private final int match_2() { + final CharSequence sequenceWrapper = charSequenceWrapper; + final int lengthToProcess = sequenceWrapper.length() - 1; + final char first_patt_char = patternString.charAt(0); + + // simplePattern string has meta characters i.e % and _ and escape characters removed. + // so, we can just directly compare. + for (int idx = 0; idx < lengthToProcess; idx++) { + char input_char = sequenceWrapper.charAt(idx); + + if (first_patt_char != input_char) { + continue; + } else { + char ch2_1 = sequenceWrapper.charAt(idx+1); + char ch2_2 = patternString.charAt(1); --- End diff -- We want speed. Instead of getting the second character multiple times, is it better to get it once up front? I suppose that depends on the hit rate. The average hit rate may be about 2% (roughly 1/64). So if our input is shorter than 64 characters, we'll have, on average, about one hit, so we pay the second-character cost once. At 128 characters or above, we'll pay the cost two or more times. But maybe the JVM can optimize away the second and subsequent accesses? Actually, let's take a step back. The pattern is fixed. We parsed the pattern to decide to use this particular class. Should we instead create separate 1-char, 2-char, and n-char matcher classes, so that we get the second character (for the 2-char case) only once and eliminate the extra per-value if-check?
---