[spark] branch branch-3.2 updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 38d818913bb [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
38d818913bb is described below

commit 38d818913bbb84e720cae149a236c61bf8fc4f18
Author: Anton Okolnychyi
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning$identifyRootFields`, because expressions in predicates that referenced attributes in subqueries were not ignored. As a result, attributes from multiple subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception:

```
SELECT name FROM contacts c
WHERE
  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
  AND
  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```

```
[info] org.apache.spark.SparkException: Failed to merge fields 'value' and 'value'. Failed to merge incompatible data types int and string
[info]   at org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.
Authored-by: Anton Okolnychyi
Signed-off-by: Liang-Chi Hsieh
(cherry picked from commit 0c9947dabcb71de414c97c0e60a1067e468f2642)
Signed-off-by: Liang-Chi Hsieh
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index 9aa2766dd3e..1ebe680263f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -140,6 +140,10 @@ object SchemaPruning extends SQLConfHelper {
         RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
       case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
         expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+      case s: SubqueryExpression =>
+        // use subquery references that only include outer attrs and
+        // ignore join conditions as those may include attributes from other tables
+        s.references.toSeq.flatMap(getRootFields)
       case _ =>
         expr.children.flatMap(getRootFields)
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 1175063bfa9..0b745f18768 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -901,4 +901,106 @@ abstract class SchemaPruningSuite
       .count()
     assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS subquery") {
+    import testImplicits._
+
+    withTempView("ids", "first_names") {
+      val df1 = Seq(1, 2, 3).toDF("value")
+      df1.createOrReplaceTempView("ids")
+
+      val df2 = Seq("John", "Bob").toDF("value")
+      df2.createOrReplaceTempView("first_names")
+
+      val query = sql(
+        """SELECT name FROM contacts c
+          |WHERE
+          | EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
+          | AND
+          | EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
+          |""".stripMargin)
+
+      checkScan(query, "struct>")
+
+      checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
+    }
+  }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS subquery") {
+    import testImplicits._
+
+    withTempView("ids", "first_names") {
+      val df1 = Seq(1, 2, 3).toDF("value")
+      df1.createOrReplaceTempView("ids")
+
+      val df2 = Seq("John", "Bob").toDF("value")
+
```
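For readers who want to reproduce the failure outside the suite, here is a minimal, hedged sketch using a local SparkSession. The view names (`contacts`, `ids`, `first_names`) mirror the commit message; writing `contacts` as Parquet is an assumption that matters, since root-field identification only runs when schema pruning applies to a file-based scan.

```scala
import java.nio.file.Files
import org.apache.spark.sql.SparkSession

object Spark38977Repro {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SPARK-38977").getOrCreate()
    import spark.implicits._

    // `contacts` must be file-based so schema pruning (and identifyRootFields) runs on its scan.
    val dir = Files.createTempDirectory("contacts").toString
    Seq((1, ("John", "Y.", "Doe"))).toDF("id", "name")
      .selectExpr("id",
        "named_struct('first', name._1, 'middle', name._2, 'last', name._3) AS name")
      .write.mode("overwrite").parquet(dir)
    spark.read.parquet(dir).createOrReplaceTempView("contacts")

    Seq(1, 2, 3).toDF("value").createOrReplaceTempView("ids")                // int-typed `value`
    Seq("John", "Bob").toDF("value").createOrReplaceTempView("first_names")  // string-typed `value`

    // Without the fix, the int `value` and string `value` from the two subqueries both reach
    // identifyRootFields and fail to merge; with it, the query returns a single row.
    spark.sql(
      """SELECT name FROM contacts c
        |WHERE EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
        |  AND EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
        |""".stripMargin).show(truncate = false)

    spark.stop()
  }
}
```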
[spark] branch branch-3.3 updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 9c5f38d8085 [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
9c5f38d8085 is described below

commit 9c5f38d808573ced34bb52bdf4c5102ff2d1a7e2
Author: Anton Okolnychyi
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning$identifyRootFields`, because expressions in predicates that referenced attributes in subqueries were not ignored. As a result, attributes from multiple subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception:

```
SELECT name FROM contacts c
WHERE
  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
  AND
  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```

```
[info] org.apache.spark.SparkException: Failed to merge fields 'value' and 'value'. Failed to merge incompatible data types int and string
[info]   at org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.
Authored-by: Anton Okolnychyi
Signed-off-by: Liang-Chi Hsieh
(cherry picked from commit 0c9947dabcb71de414c97c0e60a1067e468f2642)
Signed-off-by: Liang-Chi Hsieh
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index fd5b2db61f3..e14bcba0ace 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -152,6 +152,10 @@ object SchemaPruning extends SQLConfHelper {
         RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
       case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
         expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+      case s: SubqueryExpression =>
+        // use subquery references that only include outer attrs and
+        // ignore join conditions as those may include attributes from other tables
+        s.references.toSeq.flatMap(getRootFields)
       case _ =>
         expr.children.flatMap(getRootFields)
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 2c227baa04f..1d62713331f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -935,4 +935,106 @@ abstract class SchemaPruningSuite
       .count()
     assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS subquery") {
+    import testImplicits._
+
+    withTempView("ids", "first_names") {
+      val df1 = Seq(1, 2, 3).toDF("value")
+      df1.createOrReplaceTempView("ids")
+
+      val df2 = Seq("John", "Bob").toDF("value")
+      df2.createOrReplaceTempView("first_names")
+
+      val query = sql(
+        """SELECT name FROM contacts c
+          |WHERE
+          | EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
+          | AND
+          | EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
+          |""".stripMargin)
+
+      checkScan(query, "struct>")
+
+      checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
+    }
+  }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS subquery") {
+    import testImplicits._
+
+    withTempView("ids", "first_names") {
+      val df1 = Seq(1, 2, 3).toDF("value")
+      df1.createOrReplaceTempView("ids")
+
+      val df2 = Seq("John", "Bob").toDF("value")
+
```
[spark] branch master updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 0c9947dabcb [SPARK-38977][SQL] Fix schema pruning with correlated subqueries
0c9947dabcb is described below

commit 0c9947dabcb71de414c97c0e60a1067e468f2642
Author: Anton Okolnychyi
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning$identifyRootFields`, because expressions in predicates that referenced attributes in subqueries were not ignored. As a result, attributes from multiple subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception:

```
SELECT name FROM contacts c
WHERE
  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
  AND
  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```

```
[info] org.apache.spark.SparkException: Failed to merge fields 'value' and 'value'. Failed to merge incompatible data types int and string
[info]   at org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.
Authored-by: Anton Okolnychyi
Signed-off-by: Liang-Chi Hsieh
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index fd5b2db61f3..e14bcba0ace 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -152,6 +152,10 @@ object SchemaPruning extends SQLConfHelper {
         RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
       case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
         expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+      case s: SubqueryExpression =>
+        // use subquery references that only include outer attrs and
+        // ignore join conditions as those may include attributes from other tables
+        s.references.toSeq.flatMap(getRootFields)
       case _ =>
         expr.children.flatMap(getRootFields)
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 4eb8258830c..becace3c69b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -935,4 +935,106 @@ abstract class SchemaPruningSuite
       .count()
     assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS subquery") {
+    import testImplicits._
+
+    withTempView("ids", "first_names") {
+      val df1 = Seq(1, 2, 3).toDF("value")
+      df1.createOrReplaceTempView("ids")
+
+      val df2 = Seq("John", "Bob").toDF("value")
+      df2.createOrReplaceTempView("first_names")
+
+      val query = sql(
+        """SELECT name FROM contacts c
+
```
[spark] branch master updated: [SPARK-38996][SQL] Use double quotes for types in error messages
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 5e494d3de70 [SPARK-38996][SQL] Use double quotes for types in error messages
5e494d3de70 is described below

commit 5e494d3de70c6e46f33addd751a227e6f9d5703f
Author: Max Gekk
AuthorDate: Fri Apr 22 23:07:01 2022 +0300

[SPARK-38996][SQL] Use double quotes for types in error messages

### What changes were proposed in this pull request?

In the PR, I propose to modify the method `QueryErrorsBase.toSQLType()` to use double quotes for types in error messages.

### Why are the changes needed?

1. To highlight types and make them more visible to users.
2. To be able to easily parse types from error text.
3. To be consistent with other outputs of identifiers, SQL statements, etc., where Spark uses quotes or ticks.

### Does this PR introduce _any_ user-facing change?

Yes, the PR changes user-facing errors.

### How was this patch tested?

By running the modified test suites:

```
$ build/sbt "test:testOnly *QueryParsingErrorsSuite"
$ build/sbt "test:testOnly *QueryCompilationErrorsSuite"
$ build/sbt "test:testOnly *QueryExecutionErrorsSuite"
$ build/sbt "testOnly *CastSuite"
$ build/sbt "testOnly *AnsiCastSuiteWithAnsiModeOn"
$ build/sbt "testOnly *EncoderResolutionSuite"
$ build/sbt "test:testOnly *DatasetSuite"
$ build/sbt "test:testOnly *InsertSuite"
```

Closes #36324 from MaxGekk/wrap-types-in-error-classes.

Authored-by: Max Gekk
Signed-off-by: Max Gekk
---
 .../apache/spark/sql/errors/QueryErrorsBase.scala  |  2 +-
 .../catalyst/encoders/EncoderResolutionSuite.scala |  8 +--
 .../catalyst/expressions/AnsiCastSuiteBase.scala   | 36 ++--
 .../spark/sql/catalyst/expressions/CastSuite.scala | 66 +++---
 .../sql/catalyst/util/DateFormatterSuite.scala     |  2 +-
 .../catalyst/util/TimestampFormatterSuite.scala    |  2 +-
 .../org/apache/spark/sql/types/DecimalSuite.scala  |  2 +-
 .../resources/sql-tests/results/ansi/cast.sql.out  | 66 +++---
 .../resources/sql-tests/results/ansi/date.sql.out  |  6 +-
 .../results/ansi/datetime-parsing-invalid.sql.out  |  4 +-
 .../sql-tests/results/ansi/interval.sql.out        | 20 +++
 .../results/ansi/string-functions.sql.out          |  8 +--
 .../sql-tests/results/postgreSQL/float4.sql.out    | 14 ++---
 .../sql-tests/results/postgreSQL/float8.sql.out    | 10 ++--
 .../sql-tests/results/postgreSQL/int8.sql.out      |  8 +--
 .../sql-tests/results/postgreSQL/text.sql.out      |  4 +-
 .../results/postgreSQL/window_part2.sql.out        |  2 +-
 .../results/postgreSQL/window_part3.sql.out        |  2 +-
 .../results/postgreSQL/window_part4.sql.out        |  2 +-
 .../results/timestampNTZ/timestamp-ansi.sql.out    |  2 +-
 .../scala/org/apache/spark/sql/DatasetSuite.scala  |  2 +-
 .../org/apache/spark/sql/SQLInsertTestSuite.scala  |  2 +-
 .../sql/errors/QueryCompilationErrorsSuite.scala   |  4 +-
 .../sql/errors/QueryExecutionAnsiErrorsSuite.scala |  2 +-
 .../sql/errors/QueryExecutionErrorsSuite.scala     | 10 ++--
 .../spark/sql/errors/QueryParsingErrorsSuite.scala |  4 +-
 .../org/apache/spark/sql/sources/InsertSuite.scala |  8 +--
 27 files changed, 149 insertions(+), 149 deletions(-)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
index 7daf8ae7325..4400bedfd5d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
@@ -62,6 +62,6 @@ trait QueryErrorsBase {
   }
 
   def toSQLType(t: DataType): String = {
-    t.sql
+    "\"" + t.sql + "\""
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
index c83e85ef9de..dae7340ac08 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
@@ -88,7 +88,7 @@ class EncoderResolutionSuite extends PlanTest {
     val attrs = Seq($"arr".array(StringType))
     assert(intercept[AnalysisException](encoder.resolveAndBind(attrs)).message ==
       s"""
-        |[CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from STRING to BIGINT.
+        |[CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from "STRING" to "BIGINT".
         |The type path of the target object is:
         |- array element class: "scala.Long"
         |- field (class: "scala.Array",
```
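The core change is small enough to illustrate in isolation. Below is a sketch of the before/after behavior, assuming `spark-catalyst` on the classpath; `DataType.sql` is the existing SQL-name renderer that this helper wraps.

```scala
import org.apache.spark.sql.types.{DataType, LongType, StringType}

// Old rendering: bare SQL type name.
def toSQLTypeOld(t: DataType): String = t.sql

// New rendering: the name wrapped in double quotes, as in the diff above.
def toSQLTypeNew(t: DataType): String = "\"" + t.sql + "\""

// The same message fragment before and after the change:
val before = s"Cannot up cast array element from ${toSQLTypeOld(StringType)} to ${toSQLTypeOld(LongType)}."
// => Cannot up cast array element from STRING to BIGINT.
val after = s"Cannot up cast array element from ${toSQLTypeNew(StringType)} to ${toSQLTypeNew(LongType)}."
// => Cannot up cast array element from "STRING" to "BIGINT".
```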
[spark] branch branch-3.3 updated: [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new ca9138ee8b6 [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
ca9138ee8b6 is described below

commit ca9138ee8b6d8645943b737cc4231fbb0154c8cb
Author: Cheng Su
AuthorDate: Fri Apr 22 10:13:40 2022 -0700

[SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

### What changes were proposed in this pull request?

This is a followup per a comment in https://issues.apache.org/jira/browse/SPARK-34960, to improve the documentation for data source v2 aggregate push down of Parquet and ORC.

* Unify the SQL config docs between Parquet and ORC, and add the note that if statistics are missing from any file footer, an exception is thrown.
* Also add the same note about the exception to the Parquet and ORC methods that aggregate from statistics.

Though in a future Spark release we may improve the behavior to fall back to aggregating from the real data of the file when any statistics are missing, we'd better clearly document the current behavior now.

### Why are the changes needed?

Give users and developers a better idea of when aggregate push down throws an exception, and document the current behavior better.

### Does this PR introduce _any_ user-facing change?

Yes, the documentation change in SQL configs.

### How was this patch tested?

Existing tests, as this is just a documentation change.

Closes #36311 from c21/agg-doc.

Authored-by: Cheng Su
Signed-off-by: huaxingao
(cherry picked from commit 86b8757c2c4bab6a0f7a700cf2c690cdd7f31eba)
Signed-off-by: huaxingao
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++
 .../apache/spark/sql/execution/datasources/orc/OrcUtils.scala  |  2 ++
 .../spark/sql/execution/datasources/parquet/ParquetUtils.scala |  4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f97b7f8f004..76f3d1f5a84 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -974,9 +974,10 @@ object SQLConf {
     .createWithDefault(10)
 
   val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.aggregatePushdown")
-    .doc("If true, MAX/MIN/COUNT without filter and group by will be pushed" +
-      " down to Parquet for optimization. MAX/MIN/COUNT for complex types and timestamp" +
-      " can't be pushed down")
+    .doc("If true, aggregates will be pushed down to Parquet for optimization. Support MIN, MAX " +
+      "and COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
+      "type. For COUNT, support all data types. If statistics is missing from any Parquet file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
@@ -1110,7 +1111,8 @@ object SQLConf {
   val ORC_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.aggregatePushdown")
     .doc("If true, aggregates will be pushed down to ORC for optimization. Support MIN, MAX and " +
       "COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
-      "type. For COUNT, support all data types.")
+      "type. For COUNT, support all data types. If statistics is missing from any ORC file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index a68ce1a8636..9011821e1a7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -408,6 +408,8 @@ object OrcUtils extends Logging {
    * (Max/Min/Count) result using the statistics information from ORC file footer, and then
    * construct an InternalRow from these aggregate results.
    *
+   * NOTE: if statistics is missing from ORC file footer, exception would be thrown.
+   *
    * @return Aggregate results in the format of InternalRow
    */
   def createAggInternalRowFromFooter(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
index
```
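As a usage sketch of the configs being documented here: the config keys and defaults come from the diff, while the table name `events`, the column `event_date`, and the active SparkSession named `spark` are hypothetical.

```scala
// Opt in to DSv2 aggregate push down for Parquet and ORC (both default to false).
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")
spark.conf.set("spark.sql.orc.aggregatePushdown", "true")

// MIN/MAX on boolean/integer/float/date columns and COUNT on any type can then be
// answered from file-footer statistics instead of scanning data. Per the note added
// in this commit, if any file footer lacks statistics the query throws an exception
// rather than falling back to scanning the data.
spark.sql("SELECT MIN(event_date), MAX(event_date), COUNT(*) FROM events").show()
```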
[spark] branch master updated: [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 86b8757c2c4 [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
86b8757c2c4 is described below

commit 86b8757c2c4bab6a0f7a700cf2c690cdd7f31eba
Author: Cheng Su
AuthorDate: Fri Apr 22 10:13:40 2022 -0700

[SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

### What changes were proposed in this pull request?

This is a followup per a comment in https://issues.apache.org/jira/browse/SPARK-34960, to improve the documentation for data source v2 aggregate push down of Parquet and ORC.

* Unify the SQL config docs between Parquet and ORC, and add the note that if statistics are missing from any file footer, an exception is thrown.
* Also add the same note about the exception to the Parquet and ORC methods that aggregate from statistics.

Though in a future Spark release we may improve the behavior to fall back to aggregating from the real data of the file when any statistics are missing, we'd better clearly document the current behavior now.

### Why are the changes needed?

Give users and developers a better idea of when aggregate push down throws an exception, and document the current behavior better.

### Does this PR introduce _any_ user-facing change?

Yes, the documentation change in SQL configs.

### How was this patch tested?

Existing tests, as this is just a documentation change.

Closes #36311 from c21/agg-doc.

Authored-by: Cheng Su
Signed-off-by: huaxingao
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++
 .../apache/spark/sql/execution/datasources/orc/OrcUtils.scala  |  2 ++
 .../spark/sql/execution/datasources/parquet/ParquetUtils.scala |  4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 50d09d046bc..6d3f283fa73 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -974,9 +974,10 @@ object SQLConf {
     .createWithDefault(10)
 
   val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.aggregatePushdown")
-    .doc("If true, MAX/MIN/COUNT without filter and group by will be pushed" +
-      " down to Parquet for optimization. MAX/MIN/COUNT for complex types and timestamp" +
-      " can't be pushed down")
+    .doc("If true, aggregates will be pushed down to Parquet for optimization. Support MIN, MAX " +
+      "and COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
+      "type. For COUNT, support all data types. If statistics is missing from any Parquet file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
@@ -1110,7 +1111,8 @@ object SQLConf {
   val ORC_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.aggregatePushdown")
     .doc("If true, aggregates will be pushed down to ORC for optimization. Support MIN, MAX and " +
       "COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
-      "type. For COUNT, support all data types.")
+      "type. For COUNT, support all data types. If statistics is missing from any ORC file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index 79abdfe4690..f07573beae6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -407,6 +407,8 @@ object OrcUtils extends Logging {
    * (Max/Min/Count) result using the statistics information from ORC file footer, and then
    * construct an InternalRow from these aggregate results.
    *
+   * NOTE: if statistics is missing from ORC file footer, exception would be thrown.
+   *
   * @return Aggregate results in the format of InternalRow
    */
   def createAggInternalRowFromFooter(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
index 9f2e6580ecb..5a291e6a2e5 100644
---
```
[spark] branch master updated (80929d6b549 -> ffaceac4329)
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

from 80929d6b549 [SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for distinct key
 add ffaceac4329 [SPARK-38968][K8S] Remove a variable `hadoopConf` from `KerberosConfDriverFeatureStep`

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStep.scala | 1 -
 1 file changed, 1 deletion(-)
[spark] branch master updated: [SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for distinct key
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 80929d6b549 [SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for distinct key
80929d6b549 is described below

commit 80929d6b549dfc61ade130a9d59dfa1abe72d681
Author: ulysses-you
AuthorDate: Fri Apr 22 18:17:04 2022 +0800

[SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for distinct key

### What changes were proposed in this pull request?

- Improve `DistinctKeyVisitor` so that it supports propagating the empty expression set.
- Small improvement to the alias matching.

### Why are the changes needed?

Let distinct keys optimize more cases; see the comment at https://github.com/apache/spark/pull/36117#discussion_r853755920.

### Does this PR introduce _any_ user-facing change?

Improved performance.

### How was this patch tested?

Added a test.

Closes #36281 from ulysses-you/SPARK-38832-followup.

Authored-by: ulysses-you
Signed-off-by: Wenchen Fan
---
 .../plans/logical/DistinctKeyVisitor.scala          | 25 ++
 .../plans/logical/LogicalPlanDistinctKeys.scala     |  8 +--
 .../plans/logical/DistinctKeyVisitorSuite.scala     |  6 +-
 3 files changed, 17 insertions(+), 22 deletions(-)

```
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
index 5b25a326831..1f495688bc5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
@@ -29,25 +29,21 @@ object DistinctKeyVisitor extends LogicalPlanVisitor[Set[ExpressionSet]] {
   private def projectDistinctKeys(
       keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): Set[ExpressionSet] = {
     val outputSet = ExpressionSet(projectList.map(_.toAttribute))
-    val aliases = projectList.filter(_.isInstanceOf[Alias])
+    val aliases = projectList.collect {
+      // TODO: Expand distinctKeys for redundant aliases on the same expression
+      case alias: Alias if alias.child.deterministic => alias.child.canonicalized -> alias
+    }.toMap
     if (aliases.isEmpty) {
       keys.filter(_.subsetOf(outputSet))
     } else {
-      val aliasedDistinctKeys = keys.map { expressionSet =>
-        expressionSet.map { expression =>
-          expression transform {
-            case expr: Expression =>
-              // TODO: Expand distinctKeys for redundant aliases on the same expression
-              aliases
-                .collectFirst { case a: Alias if a.child.semanticEquals(expr) => a.toAttribute }
-                .getOrElse(expr)
-          }
-        }
-      }
+      val aliasedDistinctKeys = keys.map(_.map(_.transform {
+        case expr: Expression =>
+          aliases.get(expr.canonicalized).map(_.toAttribute).getOrElse(expr)
+      }))
       aliasedDistinctKeys.collect {
         case es: ExpressionSet if es.subsetOf(outputSet) => ExpressionSet(es)
       } ++ keys.filter(_.subsetOf(outputSet))
-    }.filter(_.nonEmpty)
+    }
   }
 
   /**
@@ -69,7 +65,8 @@ object DistinctKeyVisitor extends LogicalPlanVisitor[Set[ExpressionSet]] {
   override def default(p: LogicalPlan): Set[ExpressionSet] = Set.empty[ExpressionSet]
 
   override def visitAggregate(p: Aggregate): Set[ExpressionSet] = {
-    val groupingExps = ExpressionSet(p.groupingExpressions) // handle group by a, a
+    // handle group by a, a and global aggregate
+    val groupingExps = ExpressionSet(p.groupingExpressions)
     projectDistinctKeys(addDistinctKey(p.child.distinctKeys, groupingExps),
       p.aggregateExpressions)
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
index 2ffa5a0e594..1843c2da478 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
@@ -29,12 +29,6 @@ import org.apache.spark.sql.internal.SQLConf.PROPAGATE_DISTINCT_KEYS_ENABLED
  */
 trait LogicalPlanDistinctKeys { self: LogicalPlan =>
   lazy val distinctKeys: Set[ExpressionSet] = {
-    if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) {
-      val keys = DistinctKeyVisitor.visit(self)
-      require(keys.forall(_.nonEmpty))
-      keys
-    } else {
-      Set.empty
-    }
+    if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) DistinctKeyVisitor.visit(self) else
```
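To make the "propagate empty set" part concrete: a global aggregate (no grouping expressions) yields at most one row, and the visitor can now represent that as the empty distinct-key set instead of being forced to drop the information. A hedged sketch of the kind of plan this helps, assuming an active SparkSession named `spark` and a hypothetical table `t`; the exact rewrites the optimizer performs with this information are gated by the `PROPAGATE_DISTINCT_KEYS_ENABLED` flag shown in the diff:

```scala
// Every output of a global aggregate is trivially distinct: there is at most one row.
// With the empty expression set propagated as a distinct key, operators above the
// aggregate can prove that further deduplication is redundant.
val df = spark.sql("SELECT DISTINCT max(c1) AS m, min(c2) AS n FROM t")
// Before this follow-up, require(keys.forall(_.nonEmpty)) rejected the empty set, so the
// plan kept the Distinct; now the Distinct above the global aggregate is provably a no-op.
```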
[spark] branch branch-3.1 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new ee1d0c82311 [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
ee1d0c82311 is described below

commit ee1d0c82311e2caebf665e05f3c10d02cbfae196
Author: Hyukjin Kwon
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, which could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon
Signed-off-by: Hyukjin Kwon
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

```
diff --git a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
     val userGroups = getUnixGroups(username)
     logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-    val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
     // we need to get rid of the trailing "\n" from the result of command execution
-    Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+    Utils.executeAndGetOutput(idPath :: "-Gn" :: username :: Nil).stripLineEnd.split(" ").toSet
   }
 }
```
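A brief sketch of why the argv-style invocation closes the hole, using `scala.sys.process` as a stand-in for Spark's `Utils.executeAndGetOutput` (the malicious username is hypothetical; do not uncomment the unsafe line with untrusted input, and the snippet assumes a Unix-like system with `which` and `id` available):

```scala
import scala.sys.process._

val maliciousUsername = "alice; touch /tmp/pwned" // attacker-controlled value

// Old style: the whole string is re-parsed by a shell, so `;` starts a second command.
// Seq("bash", "-c", "id -Gn " + maliciousUsername).!!  // would also run `touch /tmp/pwned`

// New style: resolve the `id` binary once, then pass each token as its own argv entry.
// The username reaches `id` as a single literal argument and is never shell-parsed;
// a malicious value simply makes `id` fail with "no such user".
val idPath = Seq("which", "id").!!.stripLineEnd
val groups = Seq(idPath, "-Gn", "alice").!!.stripLineEnd.split(" ").toSet
println(groups)
```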
[spark] branch branch-3.0 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new aeea56d7c51 [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
aeea56d7c51 is described below

commit aeea56d7c51032bb4921c16f41ece8bbb3da0eee
Author: Hyukjin Kwon
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, which could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon
Signed-off-by: Hyukjin Kwon
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

```
diff --git a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
     val userGroups = getUnixGroups(username)
     logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-    val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
     // we need to get rid of the trailing "\n" from the result of command execution
-    Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+    Utils.executeAndGetOutput(idPath :: "-Gn" :: username :: Nil).stripLineEnd.split(" ").toSet
   }
 }
```
[spark] branch branch-3.2 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 1d524a88f6e [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
1d524a88f6e is described below

commit 1d524a88f6e93e9971a09f70eb2804dca51d578c
Author: Hyukjin Kwon
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, which could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon
Signed-off-by: Hyukjin Kwon
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

```
diff --git a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
     val userGroups = getUnixGroups(username)
     logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-    val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
     // we need to get rid of the trailing "\n" from the result of command execution
-    Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+    Utils.executeAndGetOutput(idPath :: "-Gn" :: username :: Nil).stripLineEnd.split(" ").toSet
  }
 }
```
[spark] branch branch-3.3 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 9cc2ae78041 [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider
9cc2ae78041 is described below

commit 9cc2ae7804156899850031bd694b1925473fb4cd
Author: Hyukjin Kwon
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, which could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon
Signed-off-by: Hyukjin Kwon
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

```
diff --git a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
     val userGroups = getUnixGroups(username)
     logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-    val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
     // we need to get rid of the trailing "\n" from the result of command execution
-    Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+    Utils.executeAndGetOutput(idPath :: "-Gn" :: username :: Nil).stripLineEnd.split(" ").toSet
   }
 }
```
[spark] branch master updated (8d59fdbacf2 -> c83618e4e5f)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

from 8d59fdbacf2 [SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT
 add c83618e4e5f [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
[spark] branch master updated: [SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 8d59fdbacf2 [SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT
8d59fdbacf2 is described below

commit 8d59fdbacf28427f72dd30e5e7e135644a0f2190
Author: panbingkun
AuthorDate: Fri Apr 22 12:54:48 2022 +0300

[SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT

### What changes were proposed in this pull request?

This PR aims to add tests for the error classes INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT to QueryExecutionAnsiErrorsSuite.

### Why are the changes needed?

The changes improve test coverage and document expected error messages in tests.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

By running the new tests:

```
build/sbt "sql/testOnly *QueryExecutionAnsiErrorsSuite*"
```

Closes #36314 from panbingkun/SPARK-38736.

Authored-by: panbingkun
Signed-off-by: Max Gekk
---
 .../sql/errors/QueryExecutionAnsiErrorsSuite.scala | 25 +-
 1 file changed, 24 insertions(+), 1 deletion(-)

```
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
index 3580f90f1bd..22e6711d74d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.errors
 
-import org.apache.spark.{SparkArithmeticException, SparkConf, SparkDateTimeException}
+import org.apache.spark.{SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkConf, SparkDateTimeException}
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.internal.SQLConf
 
@@ -81,4 +81,27 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest with QueryErrorsSuiteBase
         |""".stripMargin,
       sqlState = Some("22005"))
   }
+
+  test("INVALID_ARRAY_INDEX: get element from array") {
+    checkErrorClass(
+      exception = intercept[SparkArrayIndexOutOfBoundsException] {
+        sql("select array(1, 2, 3, 4, 5)[8]").collect()
+      },
+      errorClass = "INVALID_ARRAY_INDEX",
+      msg = "Invalid index: 8, numElements: 5. " +
+        "If necessary set spark.sql.ansi.enabled to false to bypass this error."
+    )
+  }
+
+  test("INVALID_ARRAY_INDEX_IN_ELEMENT_AT: element_at from array") {
+    checkErrorClass(
+      exception = intercept[SparkArrayIndexOutOfBoundsException] {
+        sql("select element_at(array(1, 2, 3, 4, 5), 8)").collect()
+      },
+      errorClass = "INVALID_ARRAY_INDEX_IN_ELEMENT_AT",
+      msg = "Invalid index: 8, numElements: 5. " +
+        "To return NULL instead, use 'try_element_at'. " +
+        "If necessary set spark.sql.ansi.enabled to false to bypass this error."
+    )
+  }
 }
```
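A usage note on the behavior these tests pin down (a sketch, assuming an active SparkSession named `spark` on a build where `try_element_at` exists, as the quoted error message itself suggests):

```scala
spark.conf.set("spark.sql.ansi.enabled", "true")

// Under ANSI mode an out-of-range index is an error:
// spark.sql("SELECT element_at(array(1, 2, 3, 4, 5), 8)").collect()
//   => SparkArrayIndexOutOfBoundsException [INVALID_ARRAY_INDEX_IN_ELEMENT_AT]

// The try_ variant returns NULL instead of throwing:
spark.sql("SELECT try_element_at(array(1, 2, 3, 4, 5), 8)").show()  // prints NULL
```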