HIVE-2327 Optimize REGEX UDFs with constant parameter information (Alexander Pivovarov, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/312711b7 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/312711b7 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/312711b7 Branch: refs/heads/parquet Commit: 312711b705b2af950c475572785fc19660ec1c38 Parents: c567a77 Author: Alexander Pivovarov <apivova...@gmail.com> Authored: Thu Apr 2 22:35:38 2015 -0700 Committer: Alexander Pivovarov <apivova...@gmail.com> Committed: Tue May 19 15:29:20 2015 -0700 ---------------------------------------------------------------------- .../hadoop/hive/ql/exec/FunctionRegistry.java | 5 +- .../hive/ql/optimizer/physical/Vectorizer.java | 3 +- .../apache/hadoop/hive/ql/udf/UDFRegExp.java | 76 ----------- .../hive/ql/udf/generic/GenericUDFRegExp.java | 133 ++++++++++++++++++ .../ql/udf/generic/TestGenericUDFRegexp.java | 135 +++++++++++++++++++ .../spark/vectorization_short_regress.q.out | 8 +- .../tez/vectorization_short_regress.q.out | 8 +- .../vectorization_short_regress.q.out | 8 +- 8 files changed, 283 insertions(+), 93 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 7ce0a1c..9abe15e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -81,7 +81,6 @@ import org.apache.hadoop.hive.ql.udf.UDFPI; import org.apache.hadoop.hive.ql.udf.UDFParseUrl; import org.apache.hadoop.hive.ql.udf.UDFRadians; import org.apache.hadoop.hive.ql.udf.UDFRand; -import org.apache.hadoop.hive.ql.udf.UDFRegExp; import 
org.apache.hadoop.hive.ql.udf.UDFRegExpExtract; import org.apache.hadoop.hive.ql.udf.UDFRegExpReplace; import org.apache.hadoop.hive.ql.udf.UDFRepeat; @@ -249,8 +248,8 @@ public final class FunctionRegistry { system.registerGenericUDF("initcap", GenericUDFInitCap.class); system.registerUDF("like", UDFLike.class, true); - system.registerUDF("rlike", UDFRegExp.class, true); - system.registerUDF("regexp", UDFRegExp.class, true); + system.registerGenericUDF("rlike", GenericUDFRegExp.class); + system.registerGenericUDF("regexp", GenericUDFRegExp.class); system.registerUDF("regexp_replace", UDFRegExpReplace.class, false); system.registerUDF("regexp_extract", UDFRegExpExtract.class, false); system.registerUDF("parse_url", UDFParseUrl.class, false); http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 656a5e3..705b185 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -119,7 +119,6 @@ import org.apache.hadoop.hive.ql.udf.UDFMinute; import org.apache.hadoop.hive.ql.udf.UDFMonth; import org.apache.hadoop.hive.ql.udf.UDFRadians; import org.apache.hadoop.hive.ql.udf.UDFRand; -import org.apache.hadoop.hive.ql.udf.UDFRegExp; import org.apache.hadoop.hive.ql.udf.UDFSecond; import org.apache.hadoop.hive.ql.udf.UDFSign; import org.apache.hadoop.hive.ql.udf.UDFSin; @@ -227,7 +226,7 @@ public class Vectorizer implements PhysicalPlanResolver { supportedGenericUDFs.add(GenericUDFDateDiff.class); supportedGenericUDFs.add(UDFLike.class); - supportedGenericUDFs.add(UDFRegExp.class); + supportedGenericUDFs.add(GenericUDFRegExp.class); 
supportedGenericUDFs.add(UDFSubstr.class); supportedGenericUDFs.add(GenericUDFLTrim.class); supportedGenericUDFs.add(GenericUDFRTrim.class); http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java deleted file mode 100755 index 76e1d2e..0000000 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.udf; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDF; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; -import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.Text; - -/** - * UDFRegExp. 
- * - */ -@Description(name = "rlike,regexp", - value = "str _FUNC_ regexp - Returns true if str matches regexp and " - + "false otherwise", extended = "Example:\n" - + " > SELECT 'fb' _FUNC_ '.*' FROM src LIMIT 1;\n" + " true") -@VectorizedExpressions({FilterStringColRegExpStringScalar.class}) -public class UDFRegExp extends UDF { - static final Log LOG = LogFactory.getLog(UDFRegExp.class.getName()); - - private final Text lastRegex = new Text(); - private Pattern p = null; - private boolean warned = false; - - private final BooleanWritable result = new BooleanWritable(); - - public UDFRegExp() { - } - - public BooleanWritable evaluate(Text s, Text regex) { - if (s == null || regex == null) { - return null; - } - if (regex.getLength() == 0) { - if (!warned) { - warned = true; - LOG.warn(getClass().getSimpleName() + " regex is empty. Additional " - + "warnings for an empty regex will be suppressed."); - } - result.set(false); - return result; - } - if (!regex.equals(lastRegex) || p == null) { - lastRegex.set(regex); - p = Pattern.compile(regex.toString()); - } - Matcher m = p.matcher(s.toString()); - result.set(m.find(0)); - return result; - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java new file mode 100644 index 0000000..0a9dd7b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; +import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.BooleanWritable; + +/** + * UDF that returns true if a string contains a match for a java regex. 
Note that if a + * regexp has a backslash ('\'), then you need to specify '\\'. For example, + * '100-200' regexp '(\\d+)-(\\d+)' will return true. + */ +@Description(name = "rlike,regexp", + value = "str _FUNC_ regexp - Returns true if str matches regexp and " + + "false otherwise", extended = "Example:\n" + + " > SELECT 'fb' _FUNC_ '.*' FROM src LIMIT 1;\n" + " true") +@VectorizedExpressions({FilterStringColRegExpStringScalar.class}) +public class GenericUDFRegExp extends GenericUDF { + static final Log LOG = LogFactory.getLog(GenericUDFRegExp.class.getName()); + private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[2]; + private transient Converter[] converters = new Converter[2]; + private final BooleanWritable output = new BooleanWritable(); + private transient boolean isRegexConst; + private transient String regexConst; + private transient Pattern patternConst; + private transient boolean warned; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + checkArgsSize(arguments, 2, 2); + + checkArgPrimitive(arguments, 0); + checkArgPrimitive(arguments, 1); + + checkArgGroups(arguments, 0, inputTypes, STRING_GROUP); + checkArgGroups(arguments, 1, inputTypes, STRING_GROUP); + + obtainStringConverter(arguments, 0, inputTypes, converters); + obtainStringConverter(arguments, 1, inputTypes, converters); + + if (arguments[1] instanceof ConstantObjectInspector) { + regexConst = getConstantStringValue(arguments, 1); + if (regexConst != null) { + patternConst = Pattern.compile(regexConst); + } + isRegexConst = true; + } + + ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + return outputOI; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String s = getStringValue(arguments, 0, converters); + if (s == null) { + return null; + } + + String regex; + if (isRegexConst) { + regex = regexConst; + } else { + 
regex = getStringValue(arguments, 1, converters); + } + if (regex == null) { + return null; + } + + if (regex.length() == 0) { + if (!warned) { + warned = true; + LOG.warn(getClass().getSimpleName() + " regex is empty. Additional " + + "warnings for an empty regex will be suppressed."); + } + output.set(false); + return output; + } + + Pattern p; + if (isRegexConst) { + p = patternConst; + } else { + p = Pattern.compile(regex); + } + + Matcher m = p.matcher(s); + output.set(m.find(0)); + return output; + } + + @Override + public String getDisplayString(String[] children) { + return children[0] + " regexp " + children[1]; + } + + @Override + protected String getFuncName() { + return "regexp"; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java new file mode 100644 index 0000000..4e3be90 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.Text; + +public class TestGenericUDFRegexp extends TestCase { + + public void testConstant() throws HiveException { + GenericUDFRegExp udf = new GenericUDFRegExp(); + ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + Text regexText = new Text("^fo"); + ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory + .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText); + ObjectInspector[] arguments = { valueOI0, valueOI1 }; + + udf.initialize(arguments); + + runAndVerifyConst("fofo", regexText, true, udf); + runAndVerifyConst("fofofo", regexText, true, udf); + runAndVerifyConst("fobar", regexText, true, udf); + runAndVerifyConst("barfobar", regexText, false, udf); + // null + runAndVerifyConst(null, regexText, null, udf); + } + + public void testEmptyConstant() throws HiveException { + GenericUDFRegExp udf = new GenericUDFRegExp(); + ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + Text regexText = new Text(""); + ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory + .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText); + ObjectInspector[] arguments = { valueOI0, valueOI1 }; + + udf.initialize(arguments); + + 
// empty regex (should be one WARN message) + runAndVerifyConst("foo", regexText, false, udf); + runAndVerifyConst("bar", regexText, false, udf); + // null + runAndVerifyConst(null, regexText, null, udf); + } + + public void testNullConstant() throws HiveException { + GenericUDFRegExp udf = new GenericUDFRegExp(); + ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + Text regexText = null; + ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory + .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText); + ObjectInspector[] arguments = { valueOI0, valueOI1 }; + + udf.initialize(arguments); + // null + runAndVerifyConst("fofo", regexText, null, udf); + runAndVerifyConst("fofofo", regexText, null, udf); + runAndVerifyConst("fobar", regexText, null, udf); + runAndVerifyConst(null, regexText, null, udf); + } + + public void testNonConstant() throws HiveException { + GenericUDFRegExp udf = new GenericUDFRegExp(); + ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + ObjectInspector[] arguments = { valueOI0, valueOI1 }; + + udf.initialize(arguments); + + runAndVerify("fofo", "^fo", true, udf); + runAndVerify("fo\no", "^fo\no$", true, udf); + runAndVerify("Bn", "^Ba*n", true, udf); + runAndVerify("afofo", "fo", true, udf); + runAndVerify("afofo", "^fo", false, udf); + runAndVerify("Baan", "^Ba?n", false, udf); + runAndVerify("axe", "pi|apa", false, udf); + runAndVerify("pip", "^(pi)*$", false, udf); + // empty regex (should be one WARN message) + runAndVerify("bar", "", false, udf); + runAndVerify("foo", "", false, udf); + // null + runAndVerify(null, "^fo", null, udf); + runAndVerify("fofo", null, null, udf); + } + + private void runAndVerifyConst(String str, Text regexText, Boolean expResult, GenericUDF udf) + throws HiveException { + DeferredObject valueObj0 = new 
DeferredJavaObject(str != null ? new Text(str) : null); + DeferredObject valueObj1 = new DeferredJavaObject(regexText); + DeferredObject[] args = { valueObj0, valueObj1 }; + BooleanWritable output = (BooleanWritable) udf.evaluate(args); + if (expResult == null) { + assertNull(output); + } else { + assertNotNull(output); + assertEquals("regexp() const test ", expResult.booleanValue(), output.get()); + } + } + + private void runAndVerify(String str, String regex, Boolean expResult, GenericUDF udf) + throws HiveException { + DeferredObject valueObj0 = new DeferredJavaObject(str != null ? new Text(str) : null); + DeferredObject valueObj1 = new DeferredJavaObject(regex != null ? new Text(regex) : null); + DeferredObject[] args = { valueObj0, valueObj1 }; + BooleanWritable output = (BooleanWritable) udf.evaluate(args); + if (expResult == null) { + assertNull(output); + } else { + assertNotNull(output); + assertEquals("regexp() test ", expResult.booleanValue(), output.get()); + } + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out index a4b8e05..25eb161 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out @@ -360,7 +360,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((((cbigint <= 197) and (cint < cbigint)) or ((cdouble >= -26.28) and (csmallint > cdouble))) or ((ctinyint > cfloat) and (cstring1 rlike '.*ss.*'))) or ((cfloat > 79.553) and (cstring2 like '10%'))) (type: boolean) + predicate: (((((cbigint <= 197) and (cint < cbigint)) or 
((cdouble >= -26.28) and (csmallint > cdouble))) or ((ctinyint > cfloat) and cstring1 regexp '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))) (type: boolean) Statistics: Num rows: 6826 Data size: 209555 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint) @@ -935,7 +935,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or ((1 <> cboolean2) and ((csmallint < 79.553) and (-257 <> ctinyint)))) or ((cdouble > ctinyint) and (cfloat >= cint))) or ((cint < cbigint) and (ctinyint > cbigint))) (type: boolean) + predicate: ((((cstring1 regexp 'a.*' and (cstring2 like '%ss%')) or ((1 <> cboolean2) and ((csmallint < 79.553) and (-257 <> ctinyint)))) or ((cdouble > ctinyint) and (cfloat >= cint))) or ((cint < cbigint) and (ctinyint > cbigint))) (type: boolean) Statistics: Num rows: 9898 Data size: 303864 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - cint) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (cint / cint) (type: double), ((-863.257 - cint) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010) (type: double), (ctinyint / 988888) (type: double), (- ctinyint) (type: tinyint), (79.553 / ctinyint) (type: double) @@ -2339,7 +2339,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 
377237 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((ctimestamp1 <> 0) and (((((((-257 <> ctinyint) and cboolean2 is not null) and ((cstring1 rlike '.*ss') and (-3 < ctimestamp1))) or (ctimestamp2 = -5)) or ((ctimestamp1 < 0) and (cstring2 like '%b%'))) or (cdouble = cint)) or (cboolean1 is null and (cfloat < cint)))) (type: boolean) + predicate: ((ctimestamp1 <> 0) and (((((((-257 <> ctinyint) and cboolean2 is not null) and (cstring1 regexp '.*ss' and (-3 < ctimestamp1))) or (ctimestamp2 = -5)) or ((ctimestamp1 < 0) and (cstring2 like '%b%'))) or (cdouble = cint)) or (cboolean1 is null and (cfloat < cint)))) (type: boolean) Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), cfloat (type: float), cdouble (type: double) @@ -2672,7 +2672,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (cboolean1 is not null and (((((cdouble < csmallint) and ((cboolean2 = cboolean1) and (cbigint <= -863.257))) or ((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1)))) or (cstring2 rlike 'b')) or ((csmallint >= ctinyint) and ctimestamp2 is null))) (type: boolean) + predicate: (cboolean1 is not null and (((((cdouble < csmallint) and ((cboolean2 = cboolean1) and (cbigint <= -863.257))) or ((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1)))) or cstring2 regexp 'b') or ((csmallint >= ctinyint) and ctimestamp2 is null))) (type: boolean) Statistics: Num rows: 4778 Data size: 146682 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint) 
http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out index a3c723d..bf01f78 100644 --- a/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out @@ -360,7 +360,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and (cstring1 rlike '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean) + predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and cstring1 regexp '.*ss.*') or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean) Statistics: Num rows: 6826 Data size: 1467614 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint) @@ -935,7 +935,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean) + predicate: ((cstring1 regexp 'a.*' and 
(cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean) Statistics: Num rows: 9898 Data size: 2128105 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - UDFToDouble(cint)) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (UDFToDouble(cint) / UDFToDouble(cint)) (type: double), ((-863.257 - UDFToDouble(cint)) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010.0) (type: double), (UDFToDouble(ctinyint) / 988888.0) (type: double), (- ctinyint) (type: tinyint), (79.553 / UDFToDouble(ctinyint)) (type: double) @@ -2339,7 +2339,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and ((cstring1 rlike '.*ss') and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean) + predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and (cstring1 regexp '.*ss' and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or 
(((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), cfloat (type: float), cdouble (type: double) @@ -2672,7 +2672,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or ((cstring2 rlike 'b') or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean) + predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or (cstring2 regexp 'b' or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean) Statistics: Num rows: 4778 Data size: 1027287 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint) http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/vectorization_short_regress.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out index b9ab174..b823d4b 100644 --- 
a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out @@ -349,7 +349,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and (cstring1 rlike '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean) + predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and cstring1 regexp '.*ss.*') or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean) Statistics: Num rows: 6826 Data size: 1467614 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint) @@ -906,7 +906,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean) + predicate: ((cstring1 regexp 'a.*' and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean) Statistics: Num rows: 9898 Data size: 2128105 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint 
(type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - UDFToDouble(cint)) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (UDFToDouble(cint) / UDFToDouble(cint)) (type: double), ((-863.257 - UDFToDouble(cint)) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010.0) (type: double), (UDFToDouble(ctinyint) / 988888.0) (type: double), (- ctinyint) (type: tinyint), (79.553 / UDFToDouble(ctinyint)) (type: double) @@ -2288,7 +2288,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and ((cstring1 rlike '.*ss') and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean) + predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and (cstring1 regexp '.*ss' and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), 
cfloat (type: float), cdouble (type: double) @@ -2624,7 +2624,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or ((cstring2 rlike 'b') or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean) + predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or (cstring2 regexp 'b' or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean) Statistics: Num rows: 4778 Data size: 1027287 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint)