Repository: hive Updated Branches: refs/heads/master 941610f2a -> 0ad4f717a
HIVE-11141 : Improve RuleRegExp when the Expression node stack gets huge (Hari Subramaniyan, reviewed by Laljo John Pullokkaran, Jesus Camacho Rodriguez) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0ad4f717 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0ad4f717 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0ad4f717 Branch: refs/heads/master Commit: 0ad4f717a7c06bd7bbd90d4b3e861ba1e25d14b7 Parents: 941610f Author: Hari Subramaniyan <harisan...@apache.org> Authored: Mon Jul 20 17:17:03 2015 -0700 Committer: Hari Subramaniyan <harisan...@apache.org> Committed: Mon Jul 20 17:17:03 2015 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hive/ql/lib/RuleRegExp.java | 191 ++++++++++++++++++- .../hadoop/hive/ql/lib/TestRuleRegExp.java | 118 ++++++++++++ 2 files changed, 300 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java index ddc96c2..c88ed68 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java @@ -18,6 +18,9 @@ package org.apache.hadoop.hive.ql.lib; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; import java.util.Stack; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -31,7 +34,54 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; public class RuleRegExp implements Rule { private final String ruleName; - private final Pattern pattern; + private final Pattern patternWithWildCardChar; + private final String patternWithoutWildCardChar; + private String[] patternORWildChar; + private static final Set<Character> wildCards = new HashSet<Character>(Arrays.asList( + '[', '^', '$', '*', ']', '+', '|', '(', '\\', '.', '?', ')', '&')); + + /** + * The function iterates through the list of wild card characters and sees if + * this regular expression contains a wild card character. + * + * @param pattern + * pattern expressed as a regular Expression + */ + private static boolean patternHasWildCardChar(String pattern) { + if (pattern == null) { + return false; + } + for (char pc : pattern.toCharArray()) { + if (wildCards.contains(pc)) { + return true; + } + } + return false; + } + + /** + * The function iterates through the list of wild card characters and sees if + * this regular expression contains only the given char as wild card character. + * + * @param pattern + * pattern expressed as a regular Expression + * @param wcc + * wild card character + */ + private static boolean patternHasOnlyWildCardChar(String pattern, char wcc) { + if (pattern == null) { + return false; + } + boolean ret = true; + boolean hasWildCard = false; + for (char pc : pattern.toCharArray()) { + if (wildCards.contains(pc)) { + hasWildCard = true; + ret = ret && (pc == wcc); + } + } + return ret && hasWildCard; + } /** * The rule specified by the regular expression. Note that, the regular @@ -46,33 +96,156 @@ public class RuleRegExp implements Rule { **/ public RuleRegExp(String ruleName, String regExp) { this.ruleName = ruleName; - pattern = Pattern.compile(regExp); + + if (patternHasWildCardChar(regExp)) { + if (patternHasOnlyWildCardChar(regExp, '|')) { + this.patternWithWildCardChar = null; + this.patternWithoutWildCardChar = null; + this.patternORWildChar = regExp.split("\\|"); + } else { + this.patternWithWildCardChar = Pattern.compile(regExp); + this.patternWithoutWildCardChar = null; + this.patternORWildChar = null; + } + } else { + this.patternWithWildCardChar = null; + this.patternWithoutWildCardChar = regExp; + this.patternORWildChar = null; + } } /** - * This function returns the cost of the rule for the specified stack. Lower - * the cost, the better the rule is matched - * + * This function returns the cost of the rule for the specified stack when the pattern + * matched for has no wildcard character in it. The function expects patternWithoutWildCardChar + * to be not null. * @param stack * Node stack encountered so far * @return cost of the function * @throws SemanticException */ - @Override - public int cost(Stack<Node> stack) throws SemanticException { + private int costPatternWithoutWildCardChar(Stack<Node> stack) throws SemanticException { int numElems = (stack != null ? stack.size() : 0); + String name = new String(""); + int patLen = patternWithoutWildCardChar.length(); + + for (int pos = numElems - 1; pos >= 0; pos--) { + name = stack.get(pos).getName() + "%" + name; + if (name.length() >= patLen) { + if (patternWithoutWildCardChar.equals(name)) { + return patLen; + } else { + return -1; + } + } + } + return -1; + } + + /** + * This function returns the cost of the rule for the specified stack when the pattern + * matched for has only OR wildcard character in it. The function expects patternORWildChar + * to be not null. + * @param stack + * Node stack encountered so far + * @return cost of the function + * @throws SemanticException + */ + private int costPatternWithORWildCardChar(Stack<Node> stack) throws SemanticException { + int numElems = (stack != null ? stack.size() : 0); + for (String pattern : patternORWildChar) { + String name = new String(""); + int patLen = pattern.length(); + + for (int pos = numElems - 1; pos >= 0; pos--) { + name = stack.get(pos).getName() + "%" + name; + if (name.length() >= patLen) { + if (pattern.equals(name)) { + return patLen; + } else { + break; + } + } + } + } + return -1; + } + + /** + * This function returns the cost of the rule for the specified stack when the pattern + * matched for has wildcard character in it. The function expects patternWithWildCardChar + * to be not null. + * + * @param stack + * Node stack encountered so far + * @return cost of the function + * @throws SemanticException + */ + private int costPatternWithWildCardChar(Stack<Node> stack) throws SemanticException { + int numElems = (stack != null ? stack.size() : 0); String name = ""; + Matcher m = patternWithWildCardChar.matcher(""); for (int pos = numElems - 1; pos >= 0; pos--) { name = stack.get(pos).getName() + "%" + name; - Matcher m = pattern.matcher(name); + m.reset(name); if (m.matches()) { - return m.group().length(); + return name.length(); } } return -1; } /** + * Returns true if the rule pattern is valid and has wild character in it. + */ + boolean rulePatternIsValidWithWildCardChar() { + return patternWithoutWildCardChar == null && patternWithWildCardChar != null && this.patternORWildChar == null; + } + + /** + * Returns true if the rule pattern is valid and has wild character in it. + */ + boolean rulePatternIsValidWithoutWildCardChar() { + return patternWithWildCardChar == null && patternWithoutWildCardChar != null && this.patternORWildChar == null; + } + + /** + * Returns true if the rule pattern is valid and has wild character in it. + */ + boolean rulePatternIsValidWithORWildCardChar() { + return patternWithoutWildCardChar == null && patternWithWildCardChar == null && this.patternORWildChar != null; + } + + /** + * This function returns the cost of the rule for the specified stack. Lower + * the cost, the better the rule is matched + * + * @param stack + * Node stack encountered so far + * @return cost of the function + * @throws SemanticException + */ + @Override + public int cost(Stack<Node> stack) throws SemanticException { + if (rulePatternIsValidWithoutWildCardChar()) { + return costPatternWithoutWildCardChar(stack); + } + if (rulePatternIsValidWithWildCardChar()) { + return costPatternWithWildCardChar(stack); + } + if (rulePatternIsValidWithORWildCardChar()) { + return costPatternWithORWildCardChar(stack); + } + // If we reached here, either : + // 1. patternWithWildCardChar and patternWithoutWildCardChar are both nulls. + // 2. patternWithWildCardChar and patternWithoutWildCardChar are both not nulls. + // This is an internal error and we should not let this happen, so throw an exception. + throw new SemanticException ( + "Rule pattern is invalid for " + getName() + " : patternWithWildCardChar = " + + patternWithWildCardChar + " patternWithoutWildCardChar = " + + patternWithoutWildCardChar); + } + + /** * @return the name of the Node **/ @Override http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java new file mode 100644 index 0000000..f06d0df --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.lib; + +import static org.junit.Assert.*; + +import java.util.List; +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.junit.Test; + +public class TestRuleRegExp { + + public class TestNode implements Node { + private String name; + + TestNode (String name) { + this.name = name; + } + + @Override + public List<? extends Node> getChildren() { + return null; + } + + @Override + public String getName() { + return name; + } + } + + @Test + public void testPatternWithoutWildCardChar() { + String patternStr = + ReduceSinkOperator.getOperatorName() + "%" + + SelectOperator.getOperatorName() + "%" + + FileSinkOperator.getOperatorName() + "%"; + RuleRegExp rule1 = new RuleRegExp("R1", patternStr); + assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), true); + assertEquals(rule1.rulePatternIsValidWithWildCardChar(), false); + // positive test + Stack<Node> ns1 = new Stack<Node>(); + ns1.push(new TestNode(ReduceSinkOperator.getOperatorName())); + ns1.push(new TestNode(SelectOperator.getOperatorName())); + ns1.push(new TestNode(FileSinkOperator.getOperatorName())); + try { + assertEquals(rule1.cost(ns1), patternStr.length()); + } catch (SemanticException e) { + fail(e.getMessage()); + } + // negative test + Stack<Node> ns2 = new Stack<Node>(); + ns2.push(new TestNode(ReduceSinkOperator.getOperatorName())); + ns1.push(new TestNode(TableScanOperator.getOperatorName())); + ns1.push(new TestNode(FileSinkOperator.getOperatorName())); + try { + assertEquals(rule1.cost(ns2), -1); + } catch (SemanticException e) { + fail(e.getMessage()); + } + } + + @Test + public void testPatternWithWildCardChar() { + RuleRegExp rule1 = new RuleRegExp("R1", + "(" + TableScanOperator.getOperatorName() + "%" + + FilterOperator.getOperatorName() + "%)|(" + + TableScanOperator.getOperatorName() + "%" + + FileSinkOperator.getOperatorName() + "%)"); + assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), false); + assertEquals(rule1.rulePatternIsValidWithWildCardChar(), true); + // positive test + Stack<Node> ns1 = new Stack<Node>(); + ns1.push(new TestNode(TableScanOperator.getOperatorName())); + ns1.push(new TestNode(FilterOperator.getOperatorName())); + Stack<Node> ns2 = new Stack<Node>(); + ns2.push(new TestNode(TableScanOperator.getOperatorName())); + ns2.push(new TestNode(FileSinkOperator.getOperatorName())); + try { + assertNotEquals(rule1.cost(ns1), -1); + assertNotEquals(rule1.cost(ns2), -1); + } catch (SemanticException e) { + fail(e.getMessage()); + } + // negative test + Stack<Node> ns3 = new Stack<Node>(); + ns3.push(new TestNode(ReduceSinkOperator.getOperatorName())); + ns3.push(new TestNode(ReduceSinkOperator.getOperatorName())); + ns3.push(new TestNode(FileSinkOperator.getOperatorName())); + try { + assertEquals(rule1.cost(ns3), -1); + } catch (SemanticException e) { + fail(e.getMessage()); + } + } + +}