HIVE-12230 custom UDF configure() not called in Vectorization mode (Matt McCline, reviewed by Jason Dere)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/95fcdb55 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/95fcdb55 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/95fcdb55 Branch: refs/heads/spark Commit: 95fcdb55513e4771f7b387f714043870ef41ce66 Parents: d33ddef Author: Matt McCline <mmccl...@hortonworks.com> Authored: Thu Nov 5 13:16:14 2015 -0800 Committer: Matt McCline <mmccl...@hortonworks.com> Committed: Thu Nov 5 13:16:14 2015 -0800 ---------------------------------------------------------------------- .../hadoop/hive/ql/exec/MapredContext.java | 2 +- .../ql/exec/vector/udf/VectorUDFAdaptor.java | 5 ++ .../hive/ql/exec/vector/UDFHelloTest.java | 69 +++++++++++++++++++ .../vector_custom_udf_configure.q | 11 +++ .../vector_custom_udf_configure.q.out | 70 ++++++++++++++++++++ 5 files changed, 156 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/95fcdb55/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java index 6ce84ac..b7ed0c1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java @@ -116,7 +116,7 @@ public class MapredContext { udfs.clear(); } - void setup(GenericUDF genericUDF) { + public void setup(GenericUDF genericUDF) { if (needConfigure(genericUDF)) { genericUDF.configure(this); } http://git-wip-us.apache.org/repos/asf/hive/blob/95fcdb55/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFAdaptor.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFAdaptor.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFAdaptor.java index b397398..d3a0f9f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFAdaptor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFAdaptor.java @@ -21,6 +21,7 @@ import java.sql.Date; import java.sql.Timestamp; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.vector.*; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; @@ -84,6 +85,10 @@ public class VectorUDFAdaptor extends VectorExpression { for (int i = 0; i < childrenOIs.length; i++) { childrenOIs[i] = writers[i].getObjectInspector(); } + MapredContext context = MapredContext.get(); + if (context != null) { + context.setup(genericUDF); + } outputOI = VectorExpressionWriterFactory.genVectorExpressionWritable(expr) .getObjectInspector(); http://git-wip-us.apache.org/repos/asf/hive/blob/95fcdb55/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/UDFHelloTest.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/UDFHelloTest.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/UDFHelloTest.java new file mode 100644 index 0000000..48fb59a --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/UDFHelloTest.java @@ -0,0 +1,69 @@ +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.ql.exec.MapredContext; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * UDF to obfuscate input data appending "Hello " + */ +public class UDFHelloTest extends GenericUDF { + private static final Logger LOG = LoggerFactory.getLogger(UDFHelloTest.class); + + private Text result = new Text(); + + private static String greeting = ""; + + private ObjectInspectorConverters.Converter[] converters; + + @Override + public Object evaluate(DeferredObject[] arg0) throws HiveException { + + if (arg0.length != 1) { + LOG.error("UDFHelloTest expects exactly 1 argument"); + throw new HiveException("UDFHelloTest expects exactly 1 argument"); + } + + if (arg0[0].get() == null) { + LOG.warn("Empty input"); + return null; + } + + Text data = (Text) converters[0].convert(arg0[0].get()); + + String dataString = data.toString(); + + result.set(greeting + dataString); + + return result; + } + + @Override + public String getDisplayString(String[] arg0) { + return "Hello..."; + } + + @Override + public void configure(MapredContext context) { + greeting = "Hello "; + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException { + converters = new ObjectInspectorConverters.Converter[arg0.length]; + for (int i = 0; i < arg0.length; i++) { + converters[i] = ObjectInspectorConverters.getConverter(arg0[i], + PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } + + // evaluate will return a Text object + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/95fcdb55/ql/src/test/queries/clientpositive/vector_custom_udf_configure.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_custom_udf_configure.q b/ql/src/test/queries/clientpositive/vector_custom_udf_configure.q new 
file mode 100644 index 0000000..eb19f3a --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_custom_udf_configure.q @@ -0,0 +1,11 @@ +set hive.fetch.task.conversion=none; + +create temporary function UDFHelloTest as 'org.apache.hadoop.hive.ql.exec.vector.UDFHelloTest'; + +create table testorc1(id int, name string) stored as orc; +insert into table testorc1 values(1, 'a1'), (2,'a2'); + +set hive.vectorized.execution.enabled=true; +explain +select id, UDFHelloTest(name) from testorc1; +select id, UDFHelloTest(name) from testorc1; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/95fcdb55/ql/src/test/results/clientpositive/vector_custom_udf_configure.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_custom_udf_configure.q.out b/ql/src/test/results/clientpositive/vector_custom_udf_configure.q.out new file mode 100644 index 0000000..d529873 --- /dev/null +++ b/ql/src/test/results/clientpositive/vector_custom_udf_configure.q.out @@ -0,0 +1,70 @@ +PREHOOK: query: create temporary function UDFHelloTest as 'org.apache.hadoop.hive.ql.exec.vector.UDFHelloTest' +PREHOOK: type: CREATEFUNCTION +PREHOOK: Output: udfhellotest +POSTHOOK: query: create temporary function UDFHelloTest as 'org.apache.hadoop.hive.ql.exec.vector.UDFHelloTest' +POSTHOOK: type: CREATEFUNCTION +POSTHOOK: Output: udfhellotest +PREHOOK: query: create table testorc1(id int, name string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@testorc1 +POSTHOOK: query: create table testorc1(id int, name string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@testorc1 +PREHOOK: query: insert into table testorc1 values(1, 'a1'), (2,'a2') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@testorc1 +POSTHOOK: query: insert into table testorc1 values(1, 
'a1'), (2,'a2') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@testorc1 +POSTHOOK: Lineage: testorc1.id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: testorc1.name SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain +select id, UDFHelloTest(name) from testorc1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select id, UDFHelloTest(name) from testorc1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: testorc1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), Hello... (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select id, UDFHelloTest(name) from testorc1 +PREHOOK: type: QUERY +PREHOOK: Input: default@testorc1 +#### A masked pattern was here #### +POSTHOOK: query: select id, UDFHelloTest(name) from testorc1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testorc1 +#### A masked pattern was here #### +1 Hello a1 +2 Hello a2