[spark] branch branch-3.2 updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries

2022-04-22 Thread viirya
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 38d818913bb [SPARK-38977][SQL] Fix schema pruning with correlated 
subqueries
38d818913bb is described below

commit 38d818913bbb84e720cae149a236c61bf8fc4f18
Author: Anton Okolnychyi 
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning.identifyRootFields`. That happened because expressions in predicates that referenced attributes from subqueries were not ignored, so attributes coming from different subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception.

```
SELECT name FROM contacts c
WHERE
 EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
 AND
 EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```
```
[info]   org.apache.spark.SparkException: Failed to merge fields 'value' 
and 'value'. Failed to merge incompatible data types int and string
[info]   at 
org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```
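
For reference, a minimal spark-shell sketch of the failing setup, modelled on the new tests added below; the Parquet path `/tmp/contacts_pruning` and the way `contacts` is built (an int `id` plus a nested `name` struct) are illustrative assumptions, not part of the commit:

```scala
import spark.implicits._

// Parquet-backed "contacts" with a nested struct, so nested schema pruning applies.
Seq(("John", "Y.", "Doe", 1)).toDF("first", "middle", "last", "id")
  .selectExpr("id", "struct(first, middle, last) AS name")
  .write.mode("overwrite").parquet("/tmp/contacts_pruning")
spark.read.parquet("/tmp/contacts_pruning").createOrReplaceTempView("contacts")

Seq(1, 2, 3).toDF("value").createOrReplaceTempView("ids")                // int "value"
Seq("John", "Bob").toDF("value").createOrReplaceTempView("first_names")  // string "value"

// Before this fix, identifyRootFields collected i.value (int) and n.value (string)
// from the two subqueries and failed to merge them; with the fix they are ignored.
spark.sql(
  """SELECT name FROM contacts c
    |WHERE
    |  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
    |  AND
    |  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
    |""".stripMargin).show()
```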

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple 
correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.

Authored-by: Anton Okolnychyi 
Signed-off-by: Liang-Chi Hsieh 
(cherry picked from commit 0c9947dabcb71de414c97c0e60a1067e468f2642)
Signed-off-by: Liang-Chi Hsieh 
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index 9aa2766dd3e..1ebe680263f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -140,6 +140,10 @@ object SchemaPruning extends SQLConfHelper {
 RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
   case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
 expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+  case s: SubqueryExpression =>
+// use subquery references that only include outer attrs and
+// ignore join conditions as those may include attributes from other tables
+s.references.toSeq.flatMap(getRootFields)
   case _ =>
 expr.children.flatMap(getRootFields)
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 1175063bfa9..0b745f18768 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -901,4 +901,106 @@ abstract class SchemaPruningSuite
   .count()
 assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  df2.createOrReplaceTempView("first_names")
+
+  val query = sql(
+"""SELECT name FROM contacts c
+  |WHERE
+  |  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
+  |  AND
+  |  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
+  |""".stripMargin)
+
+  checkScan(query, 
"struct>")
+
+  checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
+}
+  }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  

[spark] branch branch-3.3 updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries

2022-04-22 Thread viirya
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 9c5f38d8085 [SPARK-38977][SQL] Fix schema pruning with correlated 
subqueries
9c5f38d8085 is described below

commit 9c5f38d808573ced34bb52bdf4c5102ff2d1a7e2
Author: Anton Okolnychyi 
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning.identifyRootFields`. That happened because expressions in predicates that referenced attributes from subqueries were not ignored, so attributes coming from different subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception.

```
SELECT name FROM contacts c
WHERE
 EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
 AND
 EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```
```
[info]   org.apache.spark.SparkException: Failed to merge fields 'value' 
and 'value'. Failed to merge incompatible data types int and string
[info]   at 
org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple 
correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.

Authored-by: Anton Okolnychyi 
Signed-off-by: Liang-Chi Hsieh 
(cherry picked from commit 0c9947dabcb71de414c97c0e60a1067e468f2642)
Signed-off-by: Liang-Chi Hsieh 
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index fd5b2db61f3..e14bcba0ace 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -152,6 +152,10 @@ object SchemaPruning extends SQLConfHelper {
 RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
   case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
 expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+  case s: SubqueryExpression =>
+// use subquery references that only include outer attrs and
+// ignore join conditions as those may include attributes from other tables
+s.references.toSeq.flatMap(getRootFields)
   case _ =>
 expr.children.flatMap(getRootFields)
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 2c227baa04f..1d62713331f 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -935,4 +935,106 @@ abstract class SchemaPruningSuite
   .count()
 assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  df2.createOrReplaceTempView("first_names")
+
+  val query = sql(
+"""SELECT name FROM contacts c
+  |WHERE
+  |  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
+  |  AND
+  |  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
+  |""".stripMargin)
+
+  checkScan(query, 
"struct>")
+
+  checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
+}
+  }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  

[spark] branch master updated: [SPARK-38977][SQL] Fix schema pruning with correlated subqueries

2022-04-22 Thread viirya
This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 0c9947dabcb [SPARK-38977][SQL] Fix schema pruning with correlated 
subqueries
0c9947dabcb is described below

commit 0c9947dabcb71de414c97c0e60a1067e468f2642
Author: Anton Okolnychyi 
AuthorDate: Fri Apr 22 14:11:47 2022 -0700

[SPARK-38977][SQL] Fix schema pruning with correlated subqueries

### What changes were proposed in this pull request?

This PR fixes schema pruning for queries with multiple correlated subqueries. Previously, Spark would throw an exception while determining root fields in `SchemaPruning.identifyRootFields`. That happened because expressions in predicates that referenced attributes from subqueries were not ignored, so attributes coming from different subqueries could conflict with each other (e.g. have incompatible types) even though they should have been ignored.

For instance, the following query would throw a runtime exception.

```
SELECT name FROM contacts c
WHERE
 EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
 AND
 EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
```
```
[info]   org.apache.spark.SparkException: Failed to merge fields 'value' 
and 'value'. Failed to merge incompatible data types int and string
[info]   at 
org.apache.spark.sql.errors.QueryExecutionErrors$.failedMergingFieldsError(QueryExecutionErrors.scala:936)
```

### Why are the changes needed?

These changes are needed to avoid exceptions for some queries with multiple 
correlated subqueries.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR comes with tests.

Closes #36303 from aokolnychyi/spark-38977.

Authored-by: Anton Okolnychyi 
Signed-off-by: Liang-Chi Hsieh 
---
 .../sql/catalyst/expressions/SchemaPruning.scala   |   4 +
 .../execution/datasources/SchemaPruningSuite.scala | 102 +
 2 files changed, 106 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
index fd5b2db61f3..e14bcba0ace 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SchemaPruning.scala
@@ -152,6 +152,10 @@ object SchemaPruning extends SQLConfHelper {
 RootField(field, derivedFromAtt = false, prunedIfAnyChildAccessed = true) :: Nil
   case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
 expr.children.flatMap(getRootFields).map(_.copy(prunedIfAnyChildAccessed = true))
+  case s: SubqueryExpression =>
+// use subquery references that only include outer attrs and
+// ignore join conditions as those may include attributes from other tables
+s.references.toSeq.flatMap(getRootFields)
   case _ =>
 expr.children.flatMap(getRootFields)
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
index 4eb8258830c..becace3c69b 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
@@ -935,4 +935,106 @@ abstract class SchemaPruningSuite
   .count()
 assert(count == 0)
   }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  df2.createOrReplaceTempView("first_names")
+
+  val query = sql(
+"""SELECT name FROM contacts c
+  |WHERE
+  |  EXISTS (SELECT 1 FROM ids i WHERE i.value = c.id)
+  |  AND
+  |  EXISTS (SELECT 1 FROM first_names n WHERE c.name.first = n.value)
+  |""".stripMargin)
+
+  checkScan(query, 
"struct>")
+
+  checkAnswer(query, Row(Row("John", "Y.", "Doe")) :: Nil)
+}
+  }
+
+  testSchemaPruning("SPARK-38977: schema pruning with correlated NOT EXISTS 
subquery") {
+
+import testImplicits._
+
+withTempView("ids", "first_names") {
+  val df1 = Seq(1, 2, 3).toDF("value")
+  df1.createOrReplaceTempView("ids")
+
+  val df2 = Seq("John", "Bob").toDF("value")
+  df2.createOrReplaceTempView("first_names")
+
+  val query = sql(
+"""SELECT name FROM contacts c
+   

[spark] branch master updated: [SPARK-38996][SQL] Use double quotes for types in error messages

2022-04-22 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 5e494d3de70 [SPARK-38996][SQL] Use double quotes for types in error messages
5e494d3de70 is described below

commit 5e494d3de70c6e46f33addd751a227e6f9d5703f
Author: Max Gekk 
AuthorDate: Fri Apr 22 23:07:01 2022 +0300

[SPARK-38996][SQL] Use double quotes for types in error messages

### What changes were proposed in this pull request?
In the PR, I propose to modify the method `QueryErrorsBase.toSQLType()` to 
use double quotes for types in error messages.
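
A standalone sketch of the idea (not the actual `QueryErrorsBase` trait): wrap the SQL name of the type in double quotes so it stands out in error text.

```scala
import org.apache.spark.sql.types.{DataType, LongType, StringType}

// Same idea as the one-line change in the diff below: quote DataType.sql.
def toSQLType(t: DataType): String = "\"" + t.sql + "\""

// Before: Cannot up cast array element from STRING to BIGINT.
// After:  Cannot up cast array element from "STRING" to "BIGINT".
println(s"Cannot up cast array element from ${toSQLType(StringType)} to ${toSQLType(LongType)}.")
```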

### Why are the changes needed?
1. To highlight types and make them more visible for users.
2. To be able to easily parse types from error text.
3. To be consistent with other outputs such as identifiers and SQL statements, where Spark uses quotes or ticks.

### Does this PR introduce _any_ user-facing change?
Yes, the PR changes user-facing errors.

### How was this patch tested?
By running the modified test suites:
```
$ build/sbt "test:testOnly *QueryParsingErrorsSuite"
$ build/sbt "test:testOnly *QueryCompilationErrorsSuite"
$ build/sbt "test:testOnly *QueryExecutionErrorsSuite"
$ build/sbt "testOnly *CastSuite"
$ build/sbt "testOnly *AnsiCastSuiteWithAnsiModeOn"
$ build/sbt "testOnly *EncoderResolutionSuite"
$ build/sbt "test:testOnly *DatasetSuite"
$ build/sbt "test:testOnly *InsertSuite"
```

Closes #36324 from MaxGekk/wrap-types-in-error-classes.

Authored-by: Max Gekk 
Signed-off-by: Max Gekk 
---
 .../apache/spark/sql/errors/QueryErrorsBase.scala  |  2 +-
 .../catalyst/encoders/EncoderResolutionSuite.scala |  8 +--
 .../catalyst/expressions/AnsiCastSuiteBase.scala   | 36 ++--
 .../spark/sql/catalyst/expressions/CastSuite.scala | 66 +++---
 .../sql/catalyst/util/DateFormatterSuite.scala |  2 +-
 .../catalyst/util/TimestampFormatterSuite.scala|  2 +-
 .../org/apache/spark/sql/types/DecimalSuite.scala  |  2 +-
 .../resources/sql-tests/results/ansi/cast.sql.out  | 66 +++---
 .../resources/sql-tests/results/ansi/date.sql.out  |  6 +-
 .../results/ansi/datetime-parsing-invalid.sql.out  |  4 +-
 .../sql-tests/results/ansi/interval.sql.out| 20 +++
 .../results/ansi/string-functions.sql.out  |  8 +--
 .../sql-tests/results/postgreSQL/float4.sql.out| 14 ++---
 .../sql-tests/results/postgreSQL/float8.sql.out| 10 ++--
 .../sql-tests/results/postgreSQL/int8.sql.out  |  8 +--
 .../sql-tests/results/postgreSQL/text.sql.out  |  4 +-
 .../results/postgreSQL/window_part2.sql.out|  2 +-
 .../results/postgreSQL/window_part3.sql.out|  2 +-
 .../results/postgreSQL/window_part4.sql.out|  2 +-
 .../results/timestampNTZ/timestamp-ansi.sql.out|  2 +-
 .../scala/org/apache/spark/sql/DatasetSuite.scala  |  2 +-
 .../org/apache/spark/sql/SQLInsertTestSuite.scala  |  2 +-
 .../sql/errors/QueryCompilationErrorsSuite.scala   |  4 +-
 .../sql/errors/QueryExecutionAnsiErrorsSuite.scala |  2 +-
 .../sql/errors/QueryExecutionErrorsSuite.scala | 10 ++--
 .../spark/sql/errors/QueryParsingErrorsSuite.scala |  4 +-
 .../org/apache/spark/sql/sources/InsertSuite.scala |  8 +--
 27 files changed, 149 insertions(+), 149 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
index 7daf8ae7325..4400bedfd5d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
@@ -62,6 +62,6 @@ trait QueryErrorsBase {
   }
 
   def toSQLType(t: DataType): String = {
-t.sql
+"\"" + t.sql + "\""
   }
 }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
index c83e85ef9de..dae7340ac08 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala
@@ -88,7 +88,7 @@ class EncoderResolutionSuite extends PlanTest {
 val attrs = Seq($"arr".array(StringType))
 assert(intercept[AnalysisException](encoder.resolveAndBind(attrs)).message 
==
   s"""
- |[CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from STRING 
to BIGINT.
+ |[CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from "STRING" 
to "BIGINT".
  |The type path of the target object is:
  |- array element class: "scala.Long"
  |- field (class: "scala.Array", 

[spark] branch branch-3.3 updated: [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

2022-04-22 Thread huaxingao
This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new ca9138ee8b6 [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 
aggregate push down
ca9138ee8b6 is described below

commit ca9138ee8b6d8645943b737cc4231fbb0154c8cb
Author: Cheng Su 
AuthorDate: Fri Apr 22 10:13:40 2022 -0700

[SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

### What changes were proposed in this pull request?

This is a follow-up, per a comment in https://issues.apache.org/jira/browse/SPARK-34960, to improve the documentation for data source v2 aggregate push down for Parquet and ORC.

* Unify the SQL config docs between Parquet and ORC, and add a note that if statistics are missing from any file footer, an exception is thrown.
* Also add the same note about the exception to the Parquet and ORC methods that aggregate from statistics.

Though a future Spark release may improve the behavior to fall back to aggregating over the file's actual data when statistics are missing, we should clearly document the current behavior now.
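
A hedged illustration of the documented behavior; the demo path, table name, and the extra `spark.sql.sources.useV1SourceList` setting (needed to get a DSv2 Parquet scan) are assumptions, not part of this change:

```scala
// Aggregate push down only applies to DSv2 scans, so drop parquet from the v1 list.
spark.conf.set("spark.sql.sources.useV1SourceList", "")
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")

spark.range(0, 1000).selectExpr("id", "CAST(id % 7 AS INT) AS bucket")
  .write.mode("overwrite").parquet("/tmp/agg_pushdown_demo")
spark.read.parquet("/tmp/agg_pushdown_demo").createOrReplaceTempView("demo")

// MIN/MAX/COUNT without filters or grouping can be answered from footer statistics;
// per the note added here, a footer with missing statistics makes the scan throw
// rather than fall back to reading the data.
spark.sql("SELECT MIN(bucket), MAX(bucket), COUNT(*) FROM demo").show()
```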

### Why are the changes needed?

Give users and developers a better idea of when aggregate push down will throw an exception, and document the current behavior more clearly.

### Does this PR introduce _any_ user-facing change?

Yes, the documentation of the SQL configs changes.

### How was this patch tested?

Existing tests, as this is just a documentation change.

Closes #36311 from c21/agg-doc.

Authored-by: Cheng Su 
Signed-off-by: huaxingao 
(cherry picked from commit 86b8757c2c4bab6a0f7a700cf2c690cdd7f31eba)
Signed-off-by: huaxingao 
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++
 .../apache/spark/sql/execution/datasources/orc/OrcUtils.scala  |  2 ++
 .../spark/sql/execution/datasources/parquet/ParquetUtils.scala |  4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f97b7f8f004..76f3d1f5a84 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -974,9 +974,10 @@ object SQLConf {
   .createWithDefault(10)
 
   val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = 
buildConf("spark.sql.parquet.aggregatePushdown")
-.doc("If true, MAX/MIN/COUNT without filter and group by will be pushed" +
-  " down to Parquet for optimization. MAX/MIN/COUNT for complex types and 
timestamp" +
-  " can't be pushed down")
+.doc("If true, aggregates will be pushed down to Parquet for optimization. 
Support MIN, MAX " +
+  "and COUNT as aggregate expression. For MIN/MAX, support boolean, 
integer, float and date " +
+  "type. For COUNT, support all data types. If statistics is missing from 
any Parquet file " +
+  "footer, exception would be thrown.")
 .version("3.3.0")
 .booleanConf
 .createWithDefault(false)
@@ -1110,7 +1111,8 @@ object SQLConf {
   val ORC_AGGREGATE_PUSHDOWN_ENABLED = 
buildConf("spark.sql.orc.aggregatePushdown")
 .doc("If true, aggregates will be pushed down to ORC for optimization. 
Support MIN, MAX and " +
   "COUNT as aggregate expression. For MIN/MAX, support boolean, integer, 
float and date " +
-  "type. For COUNT, support all data types.")
+  "type. For COUNT, support all data types. If statistics is missing from 
any ORC file " +
+  "footer, exception would be thrown.")
 .version("3.3.0")
 .booleanConf
 .createWithDefault(false)
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index a68ce1a8636..9011821e1a7 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -408,6 +408,8 @@ object OrcUtils extends Logging {
* (Max/Min/Count) result using the statistics information from ORC file 
footer, and then
* construct an InternalRow from these aggregate results.
*
+   * NOTE: if statistics is missing from ORC file footer, exception would be 
thrown.
+   *
* @return Aggregate results in the format of InternalRow
*/
   def createAggInternalRowFromFooter(
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
index 

[spark] branch master updated: [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

2022-04-22 Thread huaxingao
This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 86b8757c2c4 [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 
aggregate push down
86b8757c2c4 is described below

commit 86b8757c2c4bab6a0f7a700cf2c690cdd7f31eba
Author: Cheng Su 
AuthorDate: Fri Apr 22 10:13:40 2022 -0700

[SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down

### What changes were proposed in this pull request?

This is a follow-up, per a comment in https://issues.apache.org/jira/browse/SPARK-34960, to improve the documentation for data source v2 aggregate push down for Parquet and ORC.

* Unify the SQL config docs between Parquet and ORC, and add a note that if statistics are missing from any file footer, an exception is thrown.
* Also add the same note about the exception to the Parquet and ORC methods that aggregate from statistics.

Though a future Spark release may improve the behavior to fall back to aggregating over the file's actual data when statistics are missing, we should clearly document the current behavior now.

### Why are the changes needed?

Give users and developers a better idea of when aggregate push down will throw an exception, and document the current behavior more clearly.

### Does this PR introduce _any_ user-facing change?

Yes, the documentation of the SQL configs changes.

### How was this patch tested?

Existing tests, as this is just a documentation change.

Closes #36311 from c21/agg-doc.

Authored-by: Cheng Su 
Signed-off-by: huaxingao 
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++
 .../apache/spark/sql/execution/datasources/orc/OrcUtils.scala  |  2 ++
 .../spark/sql/execution/datasources/parquet/ParquetUtils.scala |  4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 50d09d046bc..6d3f283fa73 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -974,9 +974,10 @@ object SQLConf {
   .createWithDefault(10)
 
   val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = 
buildConf("spark.sql.parquet.aggregatePushdown")
-.doc("If true, MAX/MIN/COUNT without filter and group by will be pushed" +
-  " down to Parquet for optimization. MAX/MIN/COUNT for complex types and 
timestamp" +
-  " can't be pushed down")
+.doc("If true, aggregates will be pushed down to Parquet for optimization. 
Support MIN, MAX " +
+  "and COUNT as aggregate expression. For MIN/MAX, support boolean, 
integer, float and date " +
+  "type. For COUNT, support all data types. If statistics is missing from 
any Parquet file " +
+  "footer, exception would be thrown.")
 .version("3.3.0")
 .booleanConf
 .createWithDefault(false)
@@ -1110,7 +1111,8 @@ object SQLConf {
   val ORC_AGGREGATE_PUSHDOWN_ENABLED = 
buildConf("spark.sql.orc.aggregatePushdown")
 .doc("If true, aggregates will be pushed down to ORC for optimization. 
Support MIN, MAX and " +
   "COUNT as aggregate expression. For MIN/MAX, support boolean, integer, 
float and date " +
-  "type. For COUNT, support all data types.")
+  "type. For COUNT, support all data types. If statistics is missing from 
any ORC file " +
+  "footer, exception would be thrown.")
 .version("3.3.0")
 .booleanConf
 .createWithDefault(false)
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index 79abdfe4690..f07573beae6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -407,6 +407,8 @@ object OrcUtils extends Logging {
* (Max/Min/Count) result using the statistics information from ORC file 
footer, and then
* construct an InternalRow from these aggregate results.
*
+   * NOTE: if statistics is missing from ORC file footer, exception would be 
thrown.
+   *
* @return Aggregate results in the format of InternalRow
*/
   def createAggInternalRowFromFooter(
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
index 9f2e6580ecb..5a291e6a2e5 100644
--- 

[spark] branch master updated (80929d6b549 -> ffaceac4329)

2022-04-22 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 80929d6b549 [SPARK-38832][SQL][FOLLOWUP] Support propagate empty 
expression set for distinct key
 add ffaceac4329 [SPARK-38968][K8S] Remove a variable `hadoopConf` from 
`KerberosConfDriverFeatureStep`

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStep.scala | 1 -
 1 file changed, 1 deletion(-)





[spark] branch master updated: [SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for distinct key

2022-04-22 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 80929d6b549 [SPARK-38832][SQL][FOLLOWUP] Support propagate empty 
expression set for distinct key
80929d6b549 is described below

commit 80929d6b549dfc61ade130a9d59dfa1abe72d681
Author: ulysses-you 
AuthorDate: Fri Apr 22 18:17:04 2022 +0800

[SPARK-38832][SQL][FOLLOWUP] Support propagate empty expression set for 
distinct key

### What changes were proposed in this pull request?

- Improve `DistinctKeyVisitor` so it supports propagating the empty expression set
- Small improvement to alias matching

### Why are the changes needed?

Make distinct keys usable to optimize more cases; see the comment at https://github.com/apache/spark/pull/36117#discussion_r853755920 and the sketch below.
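
A hedged sketch of one case this helps (table name illustrative): a global aggregate returns at most one row, so propagating the empty distinct-key set lets the optimizer treat a later `DISTINCT` over it as redundant.

```scala
spark.range(100).createOrReplaceTempView("t")

// With distinct-key propagation enabled (PROPAGATE_DISTINCT_KEYS_ENABLED, on by default),
// the outer Aggregate introduced by DISTINCT should be recognized as redundant,
// leaving a single global aggregate in the optimized plan.
val optimized = spark.sql("SELECT DISTINCT max(id) AS m FROM t").queryExecution.optimizedPlan
println(optimized)
```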

### Does this PR introduce _any_ user-facing change?

Improve performance

### How was this patch tested?

add test

Closes #36281 from ulysses-you/SPARK-38832-followup.

Authored-by: ulysses-you 
Signed-off-by: Wenchen Fan 
---
 .../plans/logical/DistinctKeyVisitor.scala | 25 ++
 .../plans/logical/LogicalPlanDistinctKeys.scala|  8 +--
 .../plans/logical/DistinctKeyVisitorSuite.scala|  6 +-
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
index 5b25a326831..1f495688bc5 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
@@ -29,25 +29,21 @@ object DistinctKeyVisitor extends 
LogicalPlanVisitor[Set[ExpressionSet]] {
   private def projectDistinctKeys(
   keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): 
Set[ExpressionSet] = {
 val outputSet = ExpressionSet(projectList.map(_.toAttribute))
-val aliases = projectList.filter(_.isInstanceOf[Alias])
+val aliases = projectList.collect {
+  // TODO: Expand distinctKeys for redundant aliases on the same expression
+  case alias: Alias if alias.child.deterministic => 
alias.child.canonicalized -> alias
+}.toMap
 if (aliases.isEmpty) {
   keys.filter(_.subsetOf(outputSet))
 } else {
-  val aliasedDistinctKeys = keys.map { expressionSet =>
-expressionSet.map { expression =>
-  expression transform {
-case expr: Expression =>
-  // TODO: Expand distinctKeys for redundant aliases on the same 
expression
-  aliases
-.collectFirst { case a: Alias if a.child.semanticEquals(expr) 
=> a.toAttribute }
-.getOrElse(expr)
-  }
-}
-  }
+  val aliasedDistinctKeys = keys.map(_.map(_.transform {
+case expr: Expression =>
+  aliases.get(expr.canonicalized).map(_.toAttribute).getOrElse(expr)
+  }))
   aliasedDistinctKeys.collect {
 case es: ExpressionSet if es.subsetOf(outputSet) => ExpressionSet(es)
   } ++ keys.filter(_.subsetOf(outputSet))
-}.filter(_.nonEmpty)
+}
   }
 
   /**
@@ -69,7 +65,8 @@ object DistinctKeyVisitor extends 
LogicalPlanVisitor[Set[ExpressionSet]] {
   override def default(p: LogicalPlan): Set[ExpressionSet] = 
Set.empty[ExpressionSet]
 
   override def visitAggregate(p: Aggregate): Set[ExpressionSet] = {
-val groupingExps = ExpressionSet(p.groupingExpressions) // handle group by 
a, a
+// handle group by a, a and global aggregate
+val groupingExps = ExpressionSet(p.groupingExpressions)
 projectDistinctKeys(addDistinctKey(p.child.distinctKeys, groupingExps), 
p.aggregateExpressions)
   }
 
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
index 2ffa5a0e594..1843c2da478 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
@@ -29,12 +29,6 @@ import 
org.apache.spark.sql.internal.SQLConf.PROPAGATE_DISTINCT_KEYS_ENABLED
  */
 trait LogicalPlanDistinctKeys { self: LogicalPlan =>
   lazy val distinctKeys: Set[ExpressionSet] = {
-if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) {
-  val keys = DistinctKeyVisitor.visit(self)
-  require(keys.forall(_.nonEmpty))
-  keys
-} else {
-  Set.empty
-}
+if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) 
DistinctKeyVisitor.visit(self) else 

[spark] branch branch-3.1 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

2022-04-22 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
 new ee1d0c82311 [SPARK-38992][CORE] Avoid using bash -c in 
ShellBasedGroupsMappingProvider
ee1d0c82311 is described below

commit ee1d0c82311e2caebf665e05f3c10d02cbfae196
Author: Hyukjin Kwon 
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, since it could allow command injection by users.
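
A standalone, hedged illustration of the risk (not Spark code; the malicious username is hypothetical): with `bash -c` and string concatenation the shell interprets metacharacters in the username, while passing the arguments as a vector does not.

```scala
import scala.sys.process._

val username = "alice; touch /tmp/pwned"   // hypothetical attacker-controlled input

// Vulnerable pattern: the shell parses the concatenated string, so ";" starts a new command.
// Seq("bash", "-c", "id -Gn " + username).!!

// Pattern the fix moves towards: resolve the binary once, then pass arguments verbatim.
val idPath = Seq("which", "id").!!.stripLineEnd
val groups = Seq(idPath, "-Gn", System.getProperty("user.name")).!!
  .stripLineEnd.split(" ").toSet
println(groups)
```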

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon 
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: 
Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
 val userGroups = getUnixGroups(username)
 logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
 // we need to get rid of the trailing "\n" from the result of command 
execution
-Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+Utils.executeAndGetOutput(idPath ::  "-Gn" :: username :: 
Nil).stripLineEnd.split(" ").toSet
   }
 }





[spark] branch branch-3.0 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

2022-04-22 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new aeea56d7c51 [SPARK-38992][CORE] Avoid using bash -c in 
ShellBasedGroupsMappingProvider
aeea56d7c51 is described below

commit aeea56d7c51032bb4921c16f41ece8bbb3da0eee
Author: Hyukjin Kwon 
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, since it could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon 
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: 
Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
 val userGroups = getUnixGroups(username)
 logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
 // we need to get rid of the trailing "\n" from the result of command 
execution
-Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+Utils.executeAndGetOutput(idPath ::  "-Gn" :: username :: 
Nil).stripLineEnd.split(" ").toSet
   }
 }





[spark] branch branch-3.2 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

2022-04-22 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 1d524a88f6e [SPARK-38992][CORE] Avoid using bash -c in 
ShellBasedGroupsMappingProvider
1d524a88f6e is described below

commit 1d524a88f6e93e9971a09f70eb2804dca51d578c
Author: Hyukjin Kwon 
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, since it could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon 
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: 
Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
 val userGroups = getUnixGroups(username)
 logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
 // we need to get rid of the trailing "\n" from the result of command 
execution
-Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+Utils.executeAndGetOutput(idPath ::  "-Gn" :: username :: 
Nil).stripLineEnd.split(" ").toSet
   }
 }





[spark] branch branch-3.3 updated: [SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

2022-04-22 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 9cc2ae78041 [SPARK-38992][CORE] Avoid using bash -c in 
ShellBasedGroupsMappingProvider
9cc2ae78041 is described below

commit 9cc2ae7804156899850031bd694b1925473fb4cd
Author: Hyukjin Kwon 
AuthorDate: Fri Apr 22 19:01:05 2022 +0900

[SPARK-38992][CORE] Avoid using bash -c in ShellBasedGroupsMappingProvider

### What changes were proposed in this pull request?

This PR proposes to avoid using `bash -c` in `ShellBasedGroupsMappingProvider`, since it could allow command injection by users.

### Why are the changes needed?

For security purposes.

### Does this PR introduce _any_ user-facing change?

Virtually no.

### How was this patch tested?

Manually tested.

Closes #36315 from HyukjinKwon/SPARK-38992.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit c83618e4e5fc092829a1f2a726f12fb832e802cc)
Signed-off-by: Hyukjin Kwon 
---
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
index f71dd08246b..7ef8ef165e3 100644
--- 
a/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
+++ 
b/core/src/main/scala/org/apache/spark/security/ShellBasedGroupsMappingProvider.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.Utils
 private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingServiceProvider
   with Logging {
 
+  private lazy val idPath = Utils.executeAndGetOutput("which" :: "id" :: 
Nil).stripLineEnd
+
   override def getGroups(username: String): Set[String] = {
 val userGroups = getUnixGroups(username)
 logDebug("User: " + username + " Groups: " + userGroups.mkString(","))
@@ -38,8 +40,7 @@ private[spark] class ShellBasedGroupsMappingProvider extends 
GroupMappingService
 
   // shells out a "bash -c id -Gn username" to get user groups
   private def getUnixGroups(username: String): Set[String] = {
-val cmdSeq = Seq("bash", "-c", "id -Gn " + username)
 // we need to get rid of the trailing "\n" from the result of command 
execution
-Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet
+Utils.executeAndGetOutput(idPath ::  "-Gn" :: username :: 
Nil).stripLineEnd.split(" ").toSet
   }
 }





[spark] branch master updated (8d59fdbacf2 -> c83618e4e5f)

2022-04-22 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 8d59fdbacf2 [SPARK-38736][SQL][TESTS] Test the error classes: 
INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT
 add c83618e4e5f [SPARK-38992][CORE] Avoid using bash -c in 
ShellBasedGroupsMappingProvider

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/security/ShellBasedGroupsMappingProvider.scala  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)





[spark] branch master updated: [SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT

2022-04-22 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 8d59fdbacf2 [SPARK-38736][SQL][TESTS] Test the error classes: 
INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT
8d59fdbacf2 is described below

commit 8d59fdbacf28427f72dd30e5e7e135644a0f2190
Author: panbingkun 
AuthorDate: Fri Apr 22 12:54:48 2022 +0300

[SPARK-38736][SQL][TESTS] Test the error classes: INVALID_ARRAY_INDEX & 
INVALID_ARRAY_INDEX_IN_ELEMENT_AT

### What changes were proposed in this pull request?
This PR aims to add tests for the error classes INVALID_ARRAY_INDEX & INVALID_ARRAY_INDEX_IN_ELEMENT_AT to QueryExecutionAnsiErrorsSuite.
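
For context, a hedged spark-shell sketch of what these error classes cover under ANSI mode:

```scala
spark.conf.set("spark.sql.ansi.enabled", "true")

// INVALID_ARRAY_INDEX: an out-of-range subscript fails instead of returning NULL.
// org.apache.spark.SparkArrayIndexOutOfBoundsException: Invalid index: 8, numElements: 5. ...
try spark.sql("SELECT array(1, 2, 3, 4, 5)[8]").collect()
catch { case e: Exception => println(e.getMessage) }

// INVALID_ARRAY_INDEX_IN_ELEMENT_AT suggests try_element_at as the non-failing alternative.
spark.sql("SELECT try_element_at(array(1, 2, 3, 4, 5), 8)").show()   // NULL
```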

### Why are the changes needed?
The changes improve test coverage, and document expected error messages in 
tests.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
By running the new tests:
```
build/sbt "sql/testOnly *QueryExecutionAnsiErrorsSuite*"
```

Closes #36314 from panbingkun/SPARK-38736.

Authored-by: panbingkun 
Signed-off-by: Max Gekk 
---
 .../sql/errors/QueryExecutionAnsiErrorsSuite.scala | 25 +-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
index 3580f90f1bd..22e6711d74d 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.errors
 
-import org.apache.spark.{SparkArithmeticException, SparkConf, 
SparkDateTimeException}
+import org.apache.spark.{SparkArithmeticException, 
SparkArrayIndexOutOfBoundsException, SparkConf, SparkDateTimeException}
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.internal.SQLConf
 
@@ -81,4 +81,27 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest with 
QueryErrorsSuiteBase
   |""".stripMargin,
   sqlState = Some("22005"))
   }
+
+  test("INVALID_ARRAY_INDEX: get element from array") {
+checkErrorClass(
+  exception = intercept[SparkArrayIndexOutOfBoundsException] {
+sql("select array(1, 2, 3, 4, 5)[8]").collect()
+  },
+  errorClass = "INVALID_ARRAY_INDEX",
+  msg = "Invalid index: 8, numElements: 5. " +
+"If necessary set spark.sql.ansi.enabled to false to bypass this 
error."
+)
+  }
+
+  test("INVALID_ARRAY_INDEX_IN_ELEMENT_AT: element_at from array") {
+checkErrorClass(
+  exception = intercept[SparkArrayIndexOutOfBoundsException] {
+sql("select element_at(array(1, 2, 3, 4, 5), 8)").collect()
+  },
+  errorClass = "INVALID_ARRAY_INDEX_IN_ELEMENT_AT",
+  msg = "Invalid index: 8, numElements: 5. " +
+"To return NULL instead, use 'try_element_at'. " +
+"If necessary set spark.sql.ansi.enabled to false to bypass this 
error."
+)
+  }
 }

