[spark] branch master updated (348fd53 -> 75da050)
This is an automated email from the ASF dual-hosted git repository.

lixiao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from 348fd53  [SPARK-31307][ML][EXAMPLES] Add examples for ml.fvalue
     add 75da050  [MINOR][SQL][DOCS] Remove two leading spaces from sql tables

No new revisions were added by this update.

Summary of changes:
 docs/sql-ref-ansi-compliance.md | 40 +-
 docs/sql-ref-functions-udf-hive.md | 82 ++--
 docs/sql-ref-null-semantics.md | 512 +++---
 docs/sql-ref-syntax-aux-analyze-table.md | 88 ++--
 docs/sql-ref-syntax-aux-conf-mgmt-set.md | 10 +-
 docs/sql-ref-syntax-aux-describe-database.md | 44 +-
 docs/sql-ref-syntax-aux-describe-function.md | 84 ++--
 docs/sql-ref-syntax-aux-describe-query.md | 60 +--
 docs/sql-ref-syntax-aux-describe-table.md | 164 +++
 docs/sql-ref-syntax-aux-show-columns.md | 42 +-
 docs/sql-ref-syntax-aux-show-create-table.md | 20 +-
 docs/sql-ref-syntax-aux-show-databases.md | 40 +-
 docs/sql-ref-syntax-aux-show-functions.md | 96 ++--
 docs/sql-ref-syntax-aux-show-partitions.md | 60 +--
 docs/sql-ref-syntax-aux-show-table.md | 178
 docs/sql-ref-syntax-aux-show-tables.md | 64 +--
 docs/sql-ref-syntax-aux-show-tblproperties.md | 48 +-
 docs/sql-ref-syntax-aux-show-views.md | 68 +--
 docs/sql-ref-syntax-ddl-alter-database.md | 16 +-
 docs/sql-ref-syntax-ddl-alter-table.md | 252 +--
 docs/sql-ref-syntax-ddl-alter-view.md | 112 ++---
 docs/sql-ref-syntax-ddl-create-database.md | 16 +-
 docs/sql-ref-syntax-ddl-create-function.md | 46 +-
 docs/sql-ref-syntax-ddl-drop-function.md | 32 +-
 docs/sql-ref-syntax-ddl-repair-table.md | 18 +-
 docs/sql-ref-syntax-ddl-truncate-table.md | 32 +-
 docs/sql-ref-syntax-dml-insert-into.md | 164 +++
 docs/sql-ref-syntax-dml-insert-overwrite-table.md | 124 +++---
 docs/sql-ref-syntax-dml-load.md | 44 +-
 docs/sql-ref-syntax-qry-aggregation.md | 22 -
 docs/sql-ref-syntax-qry-explain.md | 100 ++---
 docs/sql-ref-syntax-qry-sampling.md | 82 ++--
 docs/sql-ref-syntax-qry-select-clusterby.md | 40 +-
 docs/sql-ref-syntax-qry-select-cte.md | 60 +--
 docs/sql-ref-syntax-qry-select-distinct.md | 22 -
 docs/sql-ref-syntax-qry-select-distribute-by.md | 40 +-
 docs/sql-ref-syntax-qry-select-groupby.md | 216 -
 docs/sql-ref-syntax-qry-select-having.md | 68 +--
 docs/sql-ref-syntax-qry-select-inline-table.md | 36 +-
 docs/sql-ref-syntax-qry-select-join.md | 175
 docs/sql-ref-syntax-qry-select-limit.md | 50 +--
 docs/sql-ref-syntax-qry-select-orderby.md | 90 ++--
 docs/sql-ref-syntax-qry-select-setops.md | 190
 docs/sql-ref-syntax-qry-select-sortby.md | 132 +++---
 docs/sql-ref-syntax-qry-select-tvf.md | 68 +--
 docs/sql-ref-syntax-qry-select-where.md | 82 ++--
 docs/sql-ref-syntax-qry-window.md | 168 +++
 47 files changed, 2076 insertions(+), 2121 deletions(-)
 delete mode 100644 docs/sql-ref-syntax-qry-aggregation.md
 delete mode 100644 docs/sql-ref-syntax-qry-select-distinct.md
[spark] branch master updated: [SPARK-31307][ML][EXAMPLES] Add examples for ml.fvalue
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 348fd53  [SPARK-31307][ML][EXAMPLES] Add examples for ml.fvalue
348fd53 is described below

commit 348fd53214ccc476bee37e3ddd6b075a53886104
Author: Qianyang Yu
AuthorDate: Fri May 1 09:16:08 2020 -0500

    [SPARK-31307][ML][EXAMPLES] Add examples for ml.fvalue

    ### What changes were proposed in this pull request?

    Add an FValue example for ml.stat.FValueTest in Python, Java, and Scala.

    ### Why are the changes needed?

    Improve the ML examples.

    ### Does this PR introduce any user-facing change?

    No

    ### How was this patch tested?

    Manually ran the examples.

    Closes #28400 from kevinyu98/spark-26111-fvalue-examples.

    Authored-by: Qianyang Yu
    Signed-off-by: Sean Owen
---
 .../spark/examples/ml/JavaFValueTestExample.java | 75 ++
 examples/src/main/python/ml/fvalue_test_example.py | 52 +++
 .../spark/examples/ml/FVlaueTestExample.scala | 63 ++
 3 files changed, 190 insertions(+)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaFValueTestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaFValueTestExample.java
new file mode 100644
index 000..11861ac
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaFValueTestExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.stat.FValueTest;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+/**
+ * An example for FValue testing.
+ * Run with
+ *
+ *   bin/run-example ml.JavaFValueTestExample
+ *
+ */
+public class JavaFValueTestExample {
+
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaFValueTestExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> data = Arrays.asList(
+      RowFactory.create(4.6, Vectors.dense(6.0, 7.0, 0.0, 7.0, 6.0, 0.0)),
+      RowFactory.create(6.6, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0)),
+      RowFactory.create(5.1, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0)),
+      RowFactory.create(7.6, Vectors.dense(0.0, 9.0, 8.0, 5.0, 6.0, 4.0)),
+      RowFactory.create(9.0, Vectors.dense(8.0, 9.0, 6.0, 5.0, 4.0, 4.0)),
+      RowFactory.create(9.0, Vectors.dense(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty()),
+    });
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Row r = FValueTest.test(df, "features", "label").head();
+    System.out.println("pValues: " + r.get(0).toString());
+    System.out.println("degreesOfFreedom: " + r.getList(1).toString());
+    System.out.println("fvalue: " + r.get(2).toString());
+
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/python/ml/fvalue_test_example.py b/examples/src/main/python/ml/fvalue_test_example.py
new file mode 100644
index 000..4a97bcd
--- /dev/null
+++ b/examples/src/main/python/ml/fvalue_test_example.py
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#
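The Scala counterpart added by this commit (FVlaueTestExample.scala) is listed in the summary above, but its body is not shown. As a rough sketch only — mirroring the Java example above, with a made-up object name, and not the committed file — the Scala version would look something like this:

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.FValueTest
import org.apache.spark.sql.SparkSession

// Hypothetical object name; the committed example may differ in structure.
object FValueTestSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("FValueTestSketch").getOrCreate()
    import spark.implicits._

    // Same (label, features) rows as in the Java example above.
    val df = Seq(
      (4.6, Vectors.dense(6.0, 7.0, 0.0, 7.0, 6.0, 0.0)),
      (6.6, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0)),
      (5.1, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0)),
      (7.6, Vectors.dense(0.0, 9.0, 8.0, 5.0, 6.0, 4.0)),
      (9.0, Vectors.dense(8.0, 9.0, 6.0, 5.0, 4.0, 4.0)),
      (9.0, Vectors.dense(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))
    ).toDF("label", "features")

    // FValueTest.test returns a one-row DataFrame holding, per feature,
    // the p-values, degrees of freedom, and F-values.
    val row = FValueTest.test(df, "features", "label").head()
    println(s"pValues: ${row.getAs[Vector](0)}")
    println(s"degreesOfFreedom: ${row.getSeq[Long](1).mkString("[", ",", "]")}")
    println(s"fvalues: ${row.getAs[Vector](2)}")

    spark.stop()
  }
}
```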
[spark] branch branch-2.4 updated: [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 1222ce0  [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements
1222ce0 is described below

commit 1222ce064f97ed9ad34e2fca4d270762592a1854
Author: Pablo Langa
AuthorDate: Fri May 1 22:09:04 2020 +0900

    [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements

    ### What changes were proposed in this pull request?

    The collect_set() aggregate function should produce a set of distinct elements. When the column argument's type is BinaryType, this is not the case.

    Example:
    ```scala
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.expressions.Window

    case class R(id: String, value: String, bytes: Array[Byte])
    def makeR(id: String, value: String) = R(id, value, value.getBytes)
    val df = Seq(makeR("a", "dog"), makeR("a", "cat"), makeR("a", "cat"), makeR("b", "fish")).toDF()

    // In the example below "bytesSet" erroneously has duplicates but "stringSet" does not (as expected).
    df.agg(collect_set('value) as "stringSet", collect_set('bytes) as "byteSet").show(truncate=false)

    // The same problem appears when using window functions.
    val win = Window.partitionBy('id).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    val result = df.select(
      collect_set('value).over(win) as "stringSet",
      collect_set('bytes).over(win) as "bytesSet"
    )
    .select('stringSet, 'bytesSet, size('stringSet) as "stringSetSize", size('bytesSet) as "bytesSetSize")
    .show()
    ```

    We use a HashSet buffer to accumulate the results. The problem is that array equality in Scala does not behave as expected: arrays are plain Java arrays, and `==` does not compare their contents, so `Array(1, 2, 3) == Array(1, 2, 3)` evaluates to false. As a result, duplicates are not removed from the HashSet.

    The proposed solution is that in the last stage, when all the data is in the HashSet buffer, we remove duplicates by changing the type of the elements and then converting back to the original type. This conversion is only applied when the element type is BinaryType.

    ### Why are the changes needed?

    Fix the bug explained above.

    ### Does this PR introduce any user-facing change?

    Yes. Now `collect_set()` correctly deduplicates arrays of bytes.

    ### How was this patch tested?

    Unit testing.

    Closes #28351 from planga82/feature/SPARK-31500_COLLECT_SET_bug.

    Authored-by: Pablo Langa
    Signed-off-by: Takeshi Yamamuro
    (cherry picked from commit 4fecc20f6ecdfe642890cf0a368a85558c40a47c)
    Signed-off-by: Takeshi Yamamuro
---
 .../catalyst/expressions/aggregate/collect.scala | 45 +++---
 .../apache/spark/sql/DataFrameAggregateSuite.scala | 16
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index be972f0..8dc3171 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -23,6 +23,7 @@ import scala.collection.mutable
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.ArrayData
 import org.apache.spark.sql.catalyst.util.GenericArrayData
 import org.apache.spark.sql.types._
@@ -46,13 +47,15 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper
   // actual order of input rows.
   override lazy val deterministic: Boolean = false

+  protected def convertToBufferElement(value: Any): Any
+
   override def update(buffer: T, input: InternalRow): T = {
     val value = child.eval(input)

     // Do not allow null values. We follow the semantics of Hive's collect_list/collect_set here.
     // See: org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMkCollectionEvaluator
     if (value != null) {
-      buffer += InternalRow.copyValue(value)
+      buffer += convertToBufferElement(value)
     }
     buffer
   }
@@ -61,12 +64,10 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper
     buffer ++= other
   }

-  override def eval(buffer: T): Any = {
-    new GenericArrayData(buffer.toArray)
-  }
+  protected val bufferElementType: DataType

   private lazy val projection = UnsafeProjection.create(
-    Array[DataType](ArrayType(elementType = child.dataType, containsNull =
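To see the array-equality issue the description refers to in isolation, here is a small standalone Scala sketch (not part of the patch): `Array[Byte]` relies on reference equality and identity hash codes, so a HashSet of raw arrays keeps both copies, whereas buffering a structurally comparable form deduplicates correctly. The actual fix converts elements to Spark's internal array representation rather than to `Seq`, which is used here only for illustration.

```scala
import scala.collection.mutable

object ByteArrayDedupSketch {
  def main(args: Array[String]): Unit = {
    val a = "cat".getBytes("UTF-8")
    val b = "cat".getBytes("UTF-8")

    // Arrays compare by reference, not by content.
    println(a == b)            // false
    println(a.sameElements(b)) // true

    // A HashSet of raw arrays therefore keeps both "cat"s -- the reported bug.
    val rawBuffer = mutable.HashSet[Any](a, b)
    println(rawBuffer.size)    // 2

    // Buffering a structurally comparable form removes the duplicate ...
    val dedupBuffer = mutable.HashSet[Seq[Byte]](a.toSeq, b.toSeq)
    println(dedupBuffer.size)  // 1

    // ... and the buffered elements can be converted back to byte arrays at
    // the end, which is the shape of the fix (using Spark's internal types).
    val result: Array[Array[Byte]] = dedupBuffer.map(_.toArray).toArray
    println(result.length)     // 1
  }
}
```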
[spark] branch branch-3.0 updated: [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 1795a70  [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements
1795a70 is described below

commit 1795a70bb04fad1b8cf76271443a448f8d72fc8a
Author: Pablo Langa
AuthorDate: Fri May 1 22:09:04 2020 +0900

    [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements

    ### What changes were proposed in this pull request?

    The collect_set() aggregate function should produce a set of distinct elements. When the column argument's type is BinaryType, this is not the case.

    Example:
    ```scala
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.expressions.Window

    case class R(id: String, value: String, bytes: Array[Byte])
    def makeR(id: String, value: String) = R(id, value, value.getBytes)
    val df = Seq(makeR("a", "dog"), makeR("a", "cat"), makeR("a", "cat"), makeR("b", "fish")).toDF()

    // In the example below "bytesSet" erroneously has duplicates but "stringSet" does not (as expected).
    df.agg(collect_set('value) as "stringSet", collect_set('bytes) as "byteSet").show(truncate=false)

    // The same problem appears when using window functions.
    val win = Window.partitionBy('id).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    val result = df.select(
      collect_set('value).over(win) as "stringSet",
      collect_set('bytes).over(win) as "bytesSet"
    )
    .select('stringSet, 'bytesSet, size('stringSet) as "stringSetSize", size('bytesSet) as "bytesSetSize")
    .show()
    ```

    We use a HashSet buffer to accumulate the results. The problem is that array equality in Scala does not behave as expected: arrays are plain Java arrays, and `==` does not compare their contents, so `Array(1, 2, 3) == Array(1, 2, 3)` evaluates to false. As a result, duplicates are not removed from the HashSet.

    The proposed solution is that in the last stage, when all the data is in the HashSet buffer, we remove duplicates by changing the type of the elements and then converting back to the original type. This conversion is only applied when the element type is BinaryType.

    ### Why are the changes needed?

    Fix the bug explained above.

    ### Does this PR introduce any user-facing change?

    Yes. Now `collect_set()` correctly deduplicates arrays of bytes.

    ### How was this patch tested?

    Unit testing.

    Closes #28351 from planga82/feature/SPARK-31500_COLLECT_SET_bug.

    Authored-by: Pablo Langa
    Signed-off-by: Takeshi Yamamuro
    (cherry picked from commit 4fecc20f6ecdfe642890cf0a368a85558c40a47c)
    Signed-off-by: Takeshi Yamamuro
---
 .../catalyst/expressions/aggregate/collect.scala | 45 +++---
 .../apache/spark/sql/DataFrameAggregateSuite.scala | 16
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index 5848aa3..0a3d876 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -23,6 +23,7 @@ import scala.collection.mutable
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.ArrayData
 import org.apache.spark.sql.catalyst.util.GenericArrayData
 import org.apache.spark.sql.types._
@@ -46,13 +47,15 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper
   // actual order of input rows.
   override lazy val deterministic: Boolean = false

+  protected def convertToBufferElement(value: Any): Any
+
   override def update(buffer: T, input: InternalRow): T = {
     val value = child.eval(input)

     // Do not allow null values. We follow the semantics of Hive's collect_list/collect_set here.
     // See: org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMkCollectionEvaluator
     if (value != null) {
-      buffer += InternalRow.copyValue(value)
+      buffer += convertToBufferElement(value)
     }
     buffer
   }
@@ -61,12 +64,10 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper
     buffer ++= other
   }

-  override def eval(buffer: T): Any = {
-    new GenericArrayData(buffer.toArray)
-  }
+  protected val bufferElementType: DataType

   private lazy val projection = UnsafeProjection.create(
-    Array[DataType](ArrayType(elementType = child.dataType, containsNull =
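The diff above is cut off before the concrete CollectList/CollectSet overrides. As a hedged reconstruction of the idea only — the member names follow the abstract declarations shown in the hunk, but the bodies below are an approximation written as a standalone helper, not the committed code — CollectSet converts BinaryType values into UnsafeArrayData on the way into the buffer (so the HashSet compares contents) and converts them back to byte arrays when the result array is built:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
import org.apache.spark.sql.types.{ArrayType, BinaryType, ByteType, DataType}

// Approximation of the CollectSet side of the patch, kept outside the real
// Collect class hierarchy so it compiles on its own against catalyst.
object CollectSetBinarySketch {
  // Byte arrays are buffered as UnsafeArrayData so that equality and hashing
  // are content-based; every other type keeps the old copy-the-value behavior.
  def convertToBufferElement(dataType: DataType, value: Any): Any = dataType match {
    case BinaryType => UnsafeArrayData.fromPrimitiveArray(value.asInstanceOf[Array[Byte]])
    case _ => InternalRow.copyValue(value)
  }

  // The declared buffer element type changes accordingly.
  def bufferElementType(dataType: DataType): DataType = dataType match {
    case BinaryType => ArrayType(ByteType)
    case other => other
  }

  // At eval time the buffered elements are turned back into Array[Byte].
  def eval(dataType: DataType, buffer: Iterable[Any]): GenericArrayData = {
    val values: Array[Any] = dataType match {
      case BinaryType => buffer.map(e => e.asInstanceOf[ArrayData].toByteArray(): Any).toArray
      case _ => buffer.toArray
    }
    new GenericArrayData(values)
  }
}
```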
[spark] branch master updated (b7cde42 -> 4fecc20)
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from b7cde42  [SPARK-31619][CORE] Rename config "spark.dynamicAllocation.shuffleTimeout" to "spark.dynamicAllocation.shuffleTracking.timeout"
     add 4fecc20  [SPARK-31500][SQL] collect_set() of BinaryType returns duplicate elements

No new revisions were added by this update.

Summary of changes:
 .../catalyst/expressions/aggregate/collect.scala | 45 +++---
 .../apache/spark/sql/DataFrameAggregateSuite.scala | 16
 2 files changed, 55 insertions(+), 6 deletions(-)
[spark] branch branch-3.0 updated: [SPARK-31372][SQL][TEST][FOLLOWUP][3.0] Update the golden file of ExpressionsSchemaSuite
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 7c6b970  [SPARK-31372][SQL][TEST][FOLLOWUP][3.0] Update the golden file of ExpressionsSchemaSuite
7c6b970 is described below

commit 7c6b9708b6fbc81d583081a7b027fe1cce493b6c
Author: Takeshi Yamamuro
AuthorDate: Fri May 1 18:37:41 2020 +0900

    [SPARK-31372][SQL][TEST][FOLLOWUP][3.0] Update the golden file of ExpressionsSchemaSuite

    ### What changes were proposed in this pull request?

    This PR is a follow-up PR to update the golden file of `ExpressionsSchemaSuite`.

    ### Why are the changes needed?

    To recover tests in branch-3.0.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Existing tests.

    Closes #28427 from maropu/SPARK-31372-FOLLOWUP.

    Authored-by: Takeshi Yamamuro
    Signed-off-by: Takeshi Yamamuro
---
 .../src/test/resources/sql-functions/sql-expression-schema.md | 9 ++---
 .../test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala | 7 ++-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 1e22ae2..2091de2 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -1,6 +1,6 @@
 ## Summary
-  - Number of queries: 333
+  - Number of queries: 328
   - Number of expressions that missing example: 34
   - Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,struct,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch
 ## Schema of Built-in Functions
@@ -123,7 +123,7 @@
 | org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual | >= | SELECT 2 >= 1 | struct<(2 >= 1):boolean> |
 | org.apache.spark.sql.catalyst.expressions.Greatest | greatest | SELECT greatest(10, 9, 2, 4, 3) | struct |
 | org.apache.spark.sql.catalyst.expressions.Grouping | grouping | SELECT name, grouping(name), sum(age) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name) | struct |
-| org.apache.spark.sql.catalyst.expressions.GroupingID | grouping_id | SELECT name, grouping_id(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height) | struct |
+| org.apache.spark.sql.catalyst.expressions.GroupingID | grouping_id | SELECT name, grouping_id(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height) | struct |
 | org.apache.spark.sql.catalyst.expressions.Hex | hex | SELECT hex(17) | struct |
 | org.apache.spark.sql.catalyst.expressions.Hour | hour | SELECT hour('2009-07-30 12:58:59') | struct |
 | org.apache.spark.sql.catalyst.expressions.Hypot | hypot | SELECT hypot(3, 4) | struct |
@@ -140,7 +140,6 @@
 | org.apache.spark.sql.catalyst.expressions.IsNaN | isnan | SELECT isnan(cast('NaN' as double)) | struct |
 | org.apache.spark.sql.catalyst.expressions.IsNotNull | isnotnull | SELECT isnotnull(1) | struct<(1 IS NOT NULL):boolean> |
 | org.apache.spark.sql.catalyst.expressions.IsNull | isnull | SELECT isnull(1) | struct<(1 IS NULL):boolean> |
-| org.apache.spark.sql.catalyst.expressions.JsonObjectKeys | json_object_keys | SELECT json_object_keys('{}') | struct> |
 | org.apache.spark.sql.catalyst.expressions.JsonToStructs | from_json | SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE') | struct> |
 | org.apache.spark.sql.catalyst.expressions.JsonTuple | json_tuple | SELECT json_tuple('{"a":1, "b":2}', 'a', 'b') | struct |
 | org.apache.spark.sql.catalyst.expressions.Lag | lag | N/A | N/A |
@@ -151,7 +150,6 @@
 | org.apache.spark.sql.catalyst.expressions.Length | character_length | SELECT character_length('Spark SQL ') | struct |
 | org.apache.spark.sql.catalyst.expressions.Length | char_length | SELECT char_length('Spark SQL ') | struct |
 | org.apache.spark.sql.catalyst.expressions.Length | length | SELECT length('Spark SQL ') | struct |
-| org.apache.spark.sql.catalyst.expressions.LengthOfJsonArray | json_array_length | SELECT json_array_length('[1,2,3,4]') | struct |
 | org.apache.spark.sql.catalyst.expressions.LessThan | < | SELECT 1 < 2 | struct<(1 < 2):boolean> |
 | org.apache.spark.sql.catalyst.expressions.LessThanOrEqual | <= | SELECT 2 <= 2 | struct<(2 <= 2):boolean> |
 | org.apache.spark.sql.catalyst.expressions.Levenshtein | levenshtein | SELECT