This is an automated email from the ASF dual-hosted git repository. sbadhya pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 855e4055675 HIVE-28148: Implement array_compact UDF to remove all nulls from an array (#5161) (Taraka Rama Rao Lethavadla reviewed by Sourabh Badhya) 855e4055675 is described below commit 855e4055675e3c993a61f59501f783e641abaaa6 Author: tarak271 <ta...@cloudera.com> AuthorDate: Fri Apr 5 16:45:17 2024 +0530 HIVE-28148: Implement array_compact UDF to remove all nulls from an array (#5161) (Taraka Rama Rao Lethavadla reviewed by Sourabh Badhya) --- .../hadoop/hive/ql/exec/FunctionRegistry.java | 1 + .../ql/udf/generic/GenericUDFArrayCompact.java | 56 +++++++++ .../ql/udf/generic/TestGenericUDFArrayCompact.java | 127 +++++++++++++++++++++ .../queries/clientnegative/udf_array_compact_1.q | 1 + .../queries/clientpositive/udf_array_compact.q | 38 ++++++ .../clientnegative/udf_array_compact_1.q.out | 1 + .../clientpositive/llap/show_functions.q.out | 2 + .../clientpositive/llap/udf_array_compact.q.out | 112 ++++++++++++++++++ 8 files changed, 338 insertions(+) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 28f35c4a15f..c54a59f9516 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -617,6 +617,7 @@ public final class FunctionRegistry { system.registerGenericUDF("array_remove", GenericUDFArrayRemove.class); system.registerGenericUDF("array_position", GenericUDFArrayPosition.class); system.registerGenericUDF("array_append", GenericUDFArrayAppend.class); + system.registerGenericUDF("array_compact", GenericUDFArrayCompact.class); system.registerGenericUDF("deserialize", GenericUDFDeserialize.class); system.registerGenericUDF("sentences", GenericUDFSentences.class); system.registerGenericUDF("map_keys", GenericUDFMapKeys.class); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayCompact.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayCompact.java new file mode 100644 index 00000000000..71f5526e126 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayCompact.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * GenericUDFArrayCompact. + */ +@Description(name = "array_compact", value = "_FUNC_(array) - Removes NULL elements from array.", + extended = "Example:\n" + " > SELECT _FUNC_(array(1,NULL,3,NULL,4)) FROM src;\n" + " [1,3,4]") +public class GenericUDFArrayCompact extends AbstractGenericUDFArrayBase { + private static final String FUNC_NAME = "ARRAY_COMPACT"; + + public GenericUDFArrayCompact() { + super(FUNC_NAME, 1, 1, ObjectInspector.Category.LIST); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object array = arguments[ARRAY_IDX].get(); + int arrayLength = arrayOI.getListLength(array); + if (arrayLength == 0) { + return Collections.emptyList(); + } else if (arrayLength < 0) { + return null; + } + + List resultArray = new ArrayList<>(((ListObjectInspector) argumentOIs[ARRAY_IDX]).getList(array)); + return resultArray.stream().filter(Objects::nonNull).map(o -> converter.convert(o)).collect(Collectors.toList()); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFArrayCompact.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFArrayCompact.java new file mode 100644 index 00000000000..fb0ec6ed307 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFArrayCompact.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +import static java.util.Arrays.asList; + +public class TestGenericUDFArrayCompact extends TestGenericUDFArray { + + public TestGenericUDFArrayCompact() { + super.udf = new GenericUDFArrayCompact(); + } + + @Test public void testPrimitive() throws HiveException { + ObjectInspector[] inputOIs = { ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableIntObjectInspector) }; + udf.initialize(inputOIs); + + Object i1 = new IntWritable(3); + Object i2 = new IntWritable(1); + Object i3 = new IntWritable(2); + Object i4 = new IntWritable(4); + runAndVerify(asList(i1, i2, null, i3, i4), asList(i1, i2, i3, i4)); + + i1 = new FloatWritable(13.3f); + i2 = new FloatWritable(1.1f); + i3 = new FloatWritable(3.3f); + i4 = new FloatWritable(2.20f); + runAndVerify(asList(i1, i2, i3, i4, null), asList(i1, i2, i3, i4)); + } + + @Test public void testList() throws HiveException { + ObjectInspector[] inputOIs = { ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector)) }; + udf.initialize(inputOIs); + + Object i1 = asList(new Text("aa1"), new Text("dd"), new Text("cc"), new Text("bb")); + Object i2 = asList(new Text("aa2"), new Text("cc"), new Text("ba"), new Text("dd")); + Object i3 = asList(new Text("aa3"), new Text("cc"), new Text("dd"), new Text("ee"), new Text("bb")); + Object i4 = asList(new Text("aa4"), new Text("cc"), new Text("ddd"), new Text("bb")); + runAndVerify(asList(i1, i2, null, i3, null, i4), asList(i1, i2, i3, i4)); + } + + @Test public void testStruct() throws HiveException { + ObjectInspector[] inputOIs = { ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(asList("f1", "f2", "f3", "f4"), + asList(PrimitiveObjectInspectorFactory.writableStringObjectInspector, + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector, + PrimitiveObjectInspectorFactory.writableDateObjectInspector, + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableIntObjectInspector)))) }; + udf.initialize(inputOIs); + + Object i1 = asList(new Text("a"), new DoubleWritable(3.1415), new DateWritableV2(Date.of(2015, 5, 26)), + asList(new IntWritable(1), new IntWritable(3), new IntWritable(2), new IntWritable(4))); + + Object i2 = asList(new Text("b"), new DoubleWritable(3.14), new DateWritableV2(Date.of(2015, 5, 26)), + asList(new IntWritable(1), new IntWritable(3), new IntWritable(2), new IntWritable(4))); + + Object i3 = asList(new Text("a"), new DoubleWritable(3.1415), new DateWritableV2(Date.of(2015, 5, 25)), + asList(new IntWritable(1), new IntWritable(3), new IntWritable(2), new IntWritable(5))); + + Object i4 = asList(new Text("a"), new DoubleWritable(3.1415), new DateWritableV2(Date.of(2015, 5, 25)), + asList(new IntWritable(1), new IntWritable(3), new IntWritable(2), new IntWritable(4))); + + runAndVerify(asList(i1, null, null, i3, i4, i2), asList(i1, i3, i4, i2)); + } + + @Test public void testMap() throws HiveException { + ObjectInspector[] inputOIs = { ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector)) }; + udf.initialize(inputOIs); + + Map<Text, IntWritable> m1 = new HashMap<>(); + m1.put(new Text("a"), new IntWritable(4)); + m1.put(new Text("b"), new IntWritable(3)); + m1.put(new Text("c"), new IntWritable(1)); + m1.put(new Text("d"), new IntWritable(2)); + + Map<Text, IntWritable> m2 = new HashMap<>(); + m2.put(new Text("d"), new IntWritable(4)); + m2.put(new Text("b"), new IntWritable(3)); + m2.put(new Text("a"), new IntWritable(1)); + m2.put(new Text("c"), new IntWritable(2)); + + Map<Text, IntWritable> m3 = new HashMap<>(); + m3.put(new Text("d"), new IntWritable(4)); + m3.put(new Text("b"), new IntWritable(3)); + m3.put(new Text("a"), new IntWritable(1)); + + runAndVerify(asList(m1, m3, m2, null, null), asList(m1, m3, m2)); + } + +} diff --git a/ql/src/test/queries/clientnegative/udf_array_compact_1.q b/ql/src/test/queries/clientnegative/udf_array_compact_1.q new file mode 100644 index 00000000000..fc5fb9e2096 --- /dev/null +++ b/ql/src/test/queries/clientnegative/udf_array_compact_1.q @@ -0,0 +1 @@ +SELECT array_compact(3); \ No newline at end of file diff --git a/ql/src/test/queries/clientpositive/udf_array_compact.q b/ql/src/test/queries/clientpositive/udf_array_compact.q new file mode 100644 index 00000000000..f8b6a42aadb --- /dev/null +++ b/ql/src/test/queries/clientpositive/udf_array_compact.q @@ -0,0 +1,38 @@ +--! qt:dataset:src + +-- SORT_QUERY_RESULTS + +set hive.fetch.task.conversion=more; + +DESCRIBE FUNCTION array_compact; +DESCRIBE FUNCTION EXTENDED array_compact; + +-- evalutes function for array of primitives +SELECT array_compact(array(1, 2, 3, null,3,4)) FROM src tablesample (1 rows); + +SELECT array_compact(array()) FROM src tablesample (1 rows); + +SELECT array_compact(array(null)) FROM src tablesample (1 rows); + +SELECT array_compact(array(1.12, 2.23, 3.34, null,1.11,1.12,2.9)) FROM src tablesample (1 rows); + +SELECT array_compact(array(1.1234567890, 2.234567890, 3.34567890, null, 3.3456789, 2.234567,1.1234567890)) FROM src tablesample (1 rows); + +SELECT array_compact(array(11234567890, 2234567890, 334567890, null, 11234567890, 2234567890, 334567890, null)) FROM src tablesample (1 rows); + +SELECT array_compact(array(array("a","b","c","d"),array("a","b","c","d"),array("a","b","c","d","e"),null,array("e","a","b","c","d"))) FROM src tablesample (1 rows); + +# handle null array cases + +dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/test_null_array; + +dfs -copyFromLocal ../../data/files/test_null_array.csv ${system:test.tmp.dir}/test_null_array/; + +create external table test_null_array (id int, value Array<String>) ROW FORMAT DELIMITED + FIELDS TERMINATED BY ':' collection items terminated by ',' location '${system:test.tmp.dir}/test_null_array'; + +select value from test_null_array; + +select array_compact(value) from test_null_array; + +dfs -rm -r ${system:test.tmp.dir}/test_null_array; \ No newline at end of file diff --git a/ql/src/test/results/clientnegative/udf_array_compact_1.q.out b/ql/src/test/results/clientnegative/udf_array_compact_1.q.out new file mode 100644 index 00000000000..30933b86378 --- /dev/null +++ b/ql/src/test/results/clientnegative/udf_array_compact_1.q.out @@ -0,0 +1 @@ +FAILED: SemanticException [Error 10016]: Line 1:21 Argument type mismatch '3': "array" expected at function ARRAY_COMPACT, but "int" is found diff --git a/ql/src/test/results/clientpositive/llap/show_functions.q.out b/ql/src/test/results/clientpositive/llap/show_functions.q.out index b764c89e94d..08f85658227 100644 --- a/ql/src/test/results/clientpositive/llap/show_functions.q.out +++ b/ql/src/test/results/clientpositive/llap/show_functions.q.out @@ -47,6 +47,7 @@ and approx_distinct array array_append +array_compact array_contains array_distinct array_except @@ -677,6 +678,7 @@ and approx_distinct array array_append +array_compact array_contains array_distinct array_except diff --git a/ql/src/test/results/clientpositive/llap/udf_array_compact.q.out b/ql/src/test/results/clientpositive/llap/udf_array_compact.q.out new file mode 100644 index 00000000000..633e21e30bf --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/udf_array_compact.q.out @@ -0,0 +1,112 @@ +PREHOOK: query: DESCRIBE FUNCTION array_compact +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION array_compact +POSTHOOK: type: DESCFUNCTION +array_compact(array) - Removes NULL elements from array. +PREHOOK: query: DESCRIBE FUNCTION EXTENDED array_compact +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION EXTENDED array_compact +POSTHOOK: type: DESCFUNCTION +array_compact(array) - Removes NULL elements from array. +Example: + > SELECT array_compact(array(1,NULL,3,NULL,4)) FROM src; + [1,3,4] +Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDFArrayCompact +Function type:BUILTIN +PREHOOK: query: SELECT array_compact(array(1, 2, 3, null,3,4)) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(1, 2, 3, null,3,4)) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[1,2,3,3,4] +PREHOOK: query: SELECT array_compact(array()) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array()) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[] +PREHOOK: query: SELECT array_compact(array(null)) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(null)) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[] +PREHOOK: query: SELECT array_compact(array(1.12, 2.23, 3.34, null,1.11,1.12,2.9)) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(1.12, 2.23, 3.34, null,1.11,1.12,2.9)) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[1.12,2.23,3.34,1.11,1.12,2.9] +PREHOOK: query: SELECT array_compact(array(1.1234567890, 2.234567890, 3.34567890, null, 3.3456789, 2.234567,1.1234567890)) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(1.1234567890, 2.234567890, 3.34567890, null, 3.3456789, 2.234567,1.1234567890)) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[1.123456789,2.23456789,3.3456789,3.3456789,2.234567,1.123456789] +PREHOOK: query: SELECT array_compact(array(11234567890, 2234567890, 334567890, null, 11234567890, 2234567890, 334567890, null)) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(11234567890, 2234567890, 334567890, null, 11234567890, 2234567890, 334567890, null)) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[11234567890,2234567890,334567890,11234567890,2234567890,334567890] +PREHOOK: query: SELECT array_compact(array(array("a","b","c","d"),array("a","b","c","d"),array("a","b","c","d","e"),null,array("e","a","b","c","d"))) FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT array_compact(array(array("a","b","c","d"),array("a","b","c","d"),array("a","b","c","d","e"),null,array("e","a","b","c","d"))) FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +[["a","b","c","d"],["a","b","c","d"],["a","b","c","d","e"],["e","a","b","c","d"]] +PREHOOK: query: create external table test_null_array (id int, value Array<String>) ROW FORMAT DELIMITED +#### A masked pattern was here #### +PREHOOK: type: CREATETABLE +#### A masked pattern was here #### +PREHOOK: Output: database:default +PREHOOK: Output: default@test_null_array +POSTHOOK: query: create external table test_null_array (id int, value Array<String>) ROW FORMAT DELIMITED +#### A masked pattern was here #### +POSTHOOK: type: CREATETABLE +#### A masked pattern was here #### +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_null_array +PREHOOK: query: select value from test_null_array +PREHOOK: type: QUERY +PREHOOK: Input: default@test_null_array +#### A masked pattern was here #### +POSTHOOK: query: select value from test_null_array +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_null_array +#### A masked pattern was here #### +["NULL"] +["null","null"] +[] +PREHOOK: query: select array_compact(value) from test_null_array +PREHOOK: type: QUERY +PREHOOK: Input: default@test_null_array +#### A masked pattern was here #### +POSTHOOK: query: select array_compact(value) from test_null_array +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_null_array +#### A masked pattern was here #### +["NULL"] +["null","null"] +[]