[spark] branch master updated (14622fc -> 1a42aa5)

2021-08-25 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 14622fc  [SPARK-36488][SQL] Improve error message with 
quotedRegexColumnNames
 add 1a42aa5  [SPARK-36457][DOCS] Review and fix issues in Scala/Java API 
docs

No new revisions were added by this update.

Summary of changes:
 core/src/main/scala/org/apache/spark/SparkException.scala| 2 +-
 core/src/main/scala/org/apache/spark/errors/SparkCoreErrors.scala| 2 +-
 .../main/scala/org/apache/spark/io/MutableCheckedOutputStream.scala  | 2 +-
 .../org/apache/spark/scheduler/MiscellaneousProcessDetails.scala | 3 ++-
 core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala   | 1 +
 core/src/main/scala/org/apache/spark/util/DependencyUtils.scala  | 2 +-
 project/SparkBuild.scala | 1 +
 .../java/org/apache/spark/sql/connector/catalog/FunctionCatalog.java | 4 
 .../org/apache/spark/sql/connector/catalog/TruncatableTable.java | 2 ++
 .../spark/sql/connector/catalog/functions/AggregateFunction.java | 4 
 .../apache/spark/sql/connector/catalog/functions/BoundFunction.java  | 4 
 .../org/apache/spark/sql/connector/catalog/functions/Function.java   | 5 +
 .../apache/spark/sql/connector/catalog/functions/ScalarFunction.java | 4 
 .../spark/sql/connector/catalog/functions/UnboundFunction.java   | 4 
 .../spark/sql/connector/read/streaming/ReportsSourceMetrics.java | 2 ++
 .../scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala   | 2 +-
 .../src/main/scala/org/apache/spark/sql/types/UDTRegistration.scala  | 3 ++-
 .../src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala  | 3 ++-
 .../src/main/java/org/apache/spark/sql/connector/write/V1Write.java  | 2 ++
 19 files changed, 44 insertions(+), 8 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-36488][SQL] Improve error message with quotedRegexColumnNames

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 14622fc  [SPARK-36488][SQL] Improve error message with 
quotedRegexColumnNames
14622fc is described below

commit 14622fcec8b977e2c2f7b3860797cc0b544bad3b
Author: Pablo Langa 
AuthorDate: Thu Aug 26 11:33:40 2021 +0800

[SPARK-36488][SQL] Improve error message with quotedRegexColumnNames

### What changes were proposed in this pull request?

When `spark.sql.parser.quotedRegexColumnNames=true` and a regex column pattern is used in a place where it is not allowed, the error message is a little confusing:

```
scala> spark.sql("set spark.sql.parser.quotedRegexColumnNames=true")

scala> spark.sql("SELECT `col_.?`/col_b FROM (SELECT 3 AS col_a, 1 as col_b)")
org.apache.spark.sql.AnalysisException: Invalid usage of '*' in expression 'divide'
```
This PR attempts to improve the error message:
```
scala> spark.sql("SELECT `col_.?`/col_b FROM (SELECT 3 AS col_a, 1 as col_b)")
org.apache.spark.sql.AnalysisException: Invalid usage of regular expression in expression 'divide'
```
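
As a rough illustration of the change (a standalone sketch, not the actual Catalyst code; `Star` and `UnresolvedRegex` below are hypothetical stand-ins for the analyzer's expression classes), the error helper can pick its wording based on whether the offending expressions came from a quoted regex pattern:
```scala
// Standalone sketch only; names are hypothetical stand-ins for Catalyst classes.
sealed trait Star
case object UnresolvedStar extends Star                         // a plain `*`
final case class UnresolvedRegex(pattern: String) extends Star  // `col_.?` under quotedRegexColumnNames

def invalidStarUsageMessage(context: String, stars: Seq[Star]): String = {
  val hasRegex = stars.exists(_.isInstanceOf[UnresolvedRegex])
  val hasPlain = stars.exists(_ == UnresolvedStar)
  val subject = (hasPlain, hasRegex) match {
    case (true, true)  => "'*' and regular expression"
    case (false, true) => "regular expression"
    case _             => "'*'"
  }
  s"Invalid usage of $subject in $context"
}

// invalidStarUsageMessage("expression 'divide'", Seq(UnresolvedRegex("col_.?")))
//   => "Invalid usage of regular expression in expression 'divide'"
```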

### Why are the changes needed?

To clarify the error message when this option is active

### Does this PR introduce _any_ user-facing change?

Yes, it changes the error message

### How was this patch tested?

Unit testing and manual testing

Closes #33802 from planga82/feature/spark36488_improve_error_message.

Authored-by: Pablo Langa 
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/catalyst/analysis/Analyzer.scala |  9 ++--
 .../sql/catalyst/analysis/CheckAnalysis.scala  |  2 +-
 .../spark/sql/errors/QueryCompilationErrors.scala  | 17 ---
 .../sql/catalyst/analysis/AnalysisErrorSuite.scala | 24 ++
 4 files changed, 46 insertions(+), 6 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index af9ff0d..a26f6b6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1443,7 +1443,8 @@ class Analyzer(override val catalogManager: 
CatalogManager)
   a.copy(aggregateExpressions = 
buildExpandedProjectList(a.aggregateExpressions, a.child))
 }
   case g: Generate if containsStar(g.generator.children) =>
-throw 
QueryCompilationErrors.invalidStarUsageError("explode/json_tuple/UDTF")
+throw 
QueryCompilationErrors.invalidStarUsageError("explode/json_tuple/UDTF",
+  extractStar(g.generator.children))
 
   // When resolve `SortOrder`s in Sort based on child, don't report errors 
as
   // we still have chance to resolve it based on its descendants
@@ -1657,6 +1658,9 @@ class Analyzer(override val catalogManager: 
CatalogManager)
 def containsStar(exprs: Seq[Expression]): Boolean =
   exprs.exists(_.collect { case _: Star => true }.nonEmpty)
 
+private def extractStar(exprs: Seq[Expression]): Seq[Star] =
+  exprs.map(_.collect { case s: Star => s }).flatten
+
 /**
  * Expands the matching attribute.*'s in `child`'s output.
  */
@@ -1704,7 +1708,8 @@ class Analyzer(override val catalogManager: 
CatalogManager)
   })
 // count(*) has been replaced by count(1)
 case o if containsStar(o.children) =>
-  throw QueryCompilationErrors.invalidStarUsageError(s"expression 
'${o.prettyName}'")
+  throw QueryCompilationErrors.invalidStarUsageError(s"expression 
'${o.prettyName}'",
+extractStar(o.children))
   }
 }
   }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 932414e..2adf110 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -173,7 +173,7 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
 
   case s: Star =>
 withPosition(s) {
-  throw 
QueryCompilationErrors.invalidStarUsageError(operator.nodeName)
+  throw 
QueryCompilationErrors.invalidStarUsageError(operator.nodeName, Seq(s))
 }
 
   case e: Expression if e.checkInputDataTypes().isFailure =>
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 2cbca6f..0c7b322 100644
--- 

[spark] branch master updated (159ff9f -> 3bb8cd9)

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 159ff9f  [SPARK-36590][SQL] Convert special timestamp_ntz values in 
the session time zone
 add 3bb8cd9  [SPARK-36568][SQL] Better FileScan statistics estimation

No new revisions were added by this update.

Summary of changes:
 .../sql/execution/datasources/v2/FileScan.scala|  7 ++-
 .../execution/datasources/v2/text/TextScan.scala   |  1 +
 .../datasources/v2/text/TextScanBuilder.scala  |  2 +-
 .../spark/sql/FileBasedDataSourceSuite.scala   | 22 ++
 .../scala/org/apache/spark/sql/FileScanSuite.scala |  2 +-
 .../org/apache/spark/sql/test/SQLTestUtils.scala   |  5 -
 6 files changed, 35 insertions(+), 4 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.2 updated: [SPARK-36590][SQL] Convert special timestamp_ntz values in the session time zone

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 0c364e6  [SPARK-36590][SQL] Convert special timestamp_ntz values in 
the session time zone
0c364e6 is described below

commit 0c364e607dae344a3eb88443ae87fd1d32819def
Author: Max Gekk 
AuthorDate: Thu Aug 26 10:09:18 2021 +0800

[SPARK-36590][SQL] Convert special timestamp_ntz values in the session time 
zone

### What changes were proposed in this pull request?
In the PR, I propose to use the session time zone (see the SQL config `spark.sql.session.timeZone`) instead of the JVM default time zone when converting special timestamp_ntz strings such as "today", "tomorrow" and so on.

### Why are the changes needed?
The current implementation is based on the system time zone, which contradicts other functions/classes that use the session time zone. For example, Spark doesn't respect the user's settings:
```sql
$ export TZ="Europe/Amsterdam"
$ ./bin/spark-sql -S
spark-sql> select timestamp_ntz'now';
2021-08-25 18:12:36.233

spark-sql> set spark.sql.session.timeZone=America/Los_Angeles;
spark.sql.session.timeZone  America/Los_Angeles
spark-sql> select timestamp_ntz'now';
2021-08-25 18:14:40.547
```

### Does this PR introduce _any_ user-facing change?
Yes. For the example above, after the changes:
```sql
spark-sql> select timestamp_ntz'now';
2021-08-25 18:47:46.832

spark-sql> set spark.sql.session.timeZone=America/Los_Angeles;
spark.sql.session.timeZone  America/Los_Angeles
spark-sql> select timestamp_ntz'now';
2021-08-25 09:48:05.211
```

### How was this patch tested?
By running the affected test suites:
```
$ build/sbt "test:testOnly *DateTimeUtilsSuite"
```

Closes #33838 from MaxGekk/fix-ts_ntz-special-values.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
(cherry picked from commit 159ff9fd14f7e0581833428c495c0e2c34f7e320)
Signed-off-by: Wenchen Fan 
---
 .../sql/catalyst/optimizer/finishAnalysis.scala|  3 +--
 .../spark/sql/catalyst/parser/AstBuilder.scala | 28 --
 .../spark/sql/catalyst/util/DateTimeUtils.scala| 13 +-
 .../optimizer/SpecialDatetimeValuesSuite.scala |  6 ++---
 .../sql/catalyst/util/DateTimeUtilsSuite.scala | 22 +
 5 files changed, 38 insertions(+), 34 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index daf4c5e..802e0b4 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -129,8 +129,7 @@ object SpecialDatetimeValues extends Rule[LogicalPlan] {
   private val conv = Map[DataType, (String, java.time.ZoneId) => Option[Any]](
 DateType -> convertSpecialDate,
 TimestampType -> convertSpecialTimestamp,
-TimestampNTZType -> ((s: String, _: java.time.ZoneId) => 
convertSpecialTimestampNTZ(s))
-  )
+TimestampNTZType -> convertSpecialTimestampNTZ)
   def apply(plan: LogicalPlan): LogicalPlan = {
 plan.transformAllExpressionsWithPruning(_.containsPattern(CAST)) {
   case cast @ Cast(e, dt @ (DateType | TimestampType | TimestampNTZType), 
_, _)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index b447dc3..51ef5df 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2134,25 +2134,27 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] 
with SQLConfHelper with Logg
   specialDate.getOrElse(toLiteral(stringToDate, DateType))
 // SPARK-36227: Remove TimestampNTZ type support in Spark 3.2 with 
minimal code changes.
 case "TIMESTAMP_NTZ" if isTesting =>
-  val specialTs = convertSpecialTimestampNTZ(value).map(Literal(_, 
TimestampNTZType))
-  specialTs.getOrElse(toLiteral(stringToTimestampWithoutTimeZone, 
TimestampNTZType))
+  convertSpecialTimestampNTZ(value, 
getZoneId(conf.sessionLocalTimeZone))
+.map(Literal(_, TimestampNTZType))
+.getOrElse(toLiteral(stringToTimestampWithoutTimeZone, 
TimestampNTZType))
 case "TIMESTAMP_LTZ" if isTesting =>
   constructTimestampLTZLiteral(value)
 case "TIMESTAMP" =>
   SQLConf.get.timestampType match {
 case TimestampNTZType =>
-  val specialTs = convertSpecialTimestampNTZ(value).map(Literal(_, 
TimestampNTZType))
-  specialTs.getOrElse {
-val containsTimeZonePart =
-  

[spark] branch master updated: [SPARK-36590][SQL] Convert special timestamp_ntz values in the session time zone

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 159ff9f  [SPARK-36590][SQL] Convert special timestamp_ntz values in 
the session time zone
159ff9f is described below

commit 159ff9fd14f7e0581833428c495c0e2c34f7e320
Author: Max Gekk 
AuthorDate: Thu Aug 26 10:09:18 2021 +0800

[SPARK-36590][SQL] Convert special timestamp_ntz values in the session time 
zone

### What changes were proposed in this pull request?
In the PR, I propose to use the session time zone (see the SQL config `spark.sql.session.timeZone`) instead of the JVM default time zone when converting special timestamp_ntz strings such as "today", "tomorrow" and so on.

### Why are the changes needed?
The current implementation is based on the system time zone, which contradicts other functions/classes that use the session time zone. For example, Spark doesn't respect the user's settings:
```sql
$ export TZ="Europe/Amsterdam"
$ ./bin/spark-sql -S
spark-sql> select timestamp_ntz'now';
2021-08-25 18:12:36.233

spark-sql> set spark.sql.session.timeZone=America/Los_Angeles;
spark.sql.session.timeZone  America/Los_Angeles
spark-sql> select timestamp_ntz'now';
2021-08-25 18:14:40.547
```

### Does this PR introduce _any_ user-facing change?
Yes. For the example above, after the changes:
```sql
spark-sql> select timestamp_ntz'now';
2021-08-25 18:47:46.832

spark-sql> set spark.sql.session.timeZone=America/Los_Angeles;
spark.sql.session.timeZone  America/Los_Angeles
spark-sql> select timestamp_ntz'now';
2021-08-25 09:48:05.211
```
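
The core of the fix is threading an explicit zone through the special-value conversion (the diff below passes `getZoneId(conf.sessionLocalTimeZone)` into `convertSpecialTimestampNTZ`). A minimal standalone sketch of the idea with plain `java.time`, not Spark's internal implementation:
```scala
// Resolve "now"/"today"/... against an explicitly passed session zone
// instead of the JVM default (ZoneId.systemDefault()).
import java.time.{LocalDateTime, ZoneId}

def specialTimestampNtz(s: String, sessionZone: ZoneId): Option[LocalDateTime] = {
  def today = LocalDateTime.now(sessionZone).toLocalDate   // "today" in the *session* zone
  s.trim.toLowerCase match {
    case "now"       => Some(LocalDateTime.now(sessionZone))
    case "today"     => Some(today.atStartOfDay)
    case "tomorrow"  => Some(today.plusDays(1).atStartOfDay)
    case "yesterday" => Some(today.minusDays(1).atStartOfDay)
    case "epoch"     => Some(LocalDateTime.of(1970, 1, 1, 0, 0))
    case _           => None
  }
}

// specialTimestampNtz("now", ZoneId.of("America/Los_Angeles")) tracks a
// spark.sql.session.timeZone-style setting rather than the JVM default.
```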

### How was this patch tested?
By running the affected test suites:
```
$ build/sbt "test:testOnly *DateTimeUtilsSuite"
```

Closes #33838 from MaxGekk/fix-ts_ntz-special-values.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
---
 .../sql/catalyst/optimizer/finishAnalysis.scala|  3 +--
 .../spark/sql/catalyst/parser/AstBuilder.scala | 28 --
 .../spark/sql/catalyst/util/DateTimeUtils.scala| 13 +-
 .../optimizer/SpecialDatetimeValuesSuite.scala |  6 ++---
 .../sql/catalyst/util/DateTimeUtilsSuite.scala | 22 +
 5 files changed, 38 insertions(+), 34 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index daf4c5e..802e0b4 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -129,8 +129,7 @@ object SpecialDatetimeValues extends Rule[LogicalPlan] {
   private val conv = Map[DataType, (String, java.time.ZoneId) => Option[Any]](
 DateType -> convertSpecialDate,
 TimestampType -> convertSpecialTimestamp,
-TimestampNTZType -> ((s: String, _: java.time.ZoneId) => 
convertSpecialTimestampNTZ(s))
-  )
+TimestampNTZType -> convertSpecialTimestampNTZ)
   def apply(plan: LogicalPlan): LogicalPlan = {
 plan.transformAllExpressionsWithPruning(_.containsPattern(CAST)) {
   case cast @ Cast(e, dt @ (DateType | TimestampType | TimestampNTZType), 
_, _)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 3c3dfd3..fcbc6d2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2133,25 +2133,27 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] 
with SQLConfHelper with Logg
   val specialDate = convertSpecialDate(value, zoneId).map(Literal(_, 
DateType))
   specialDate.getOrElse(toLiteral(stringToDate, DateType))
 case "TIMESTAMP_NTZ" =>
-  val specialTs = convertSpecialTimestampNTZ(value).map(Literal(_, 
TimestampNTZType))
-  specialTs.getOrElse(toLiteral(stringToTimestampWithoutTimeZone, 
TimestampNTZType))
+  convertSpecialTimestampNTZ(value, 
getZoneId(conf.sessionLocalTimeZone))
+.map(Literal(_, TimestampNTZType))
+.getOrElse(toLiteral(stringToTimestampWithoutTimeZone, 
TimestampNTZType))
 case "TIMESTAMP_LTZ" =>
   constructTimestampLTZLiteral(value)
 case "TIMESTAMP" =>
   SQLConf.get.timestampType match {
 case TimestampNTZType =>
-  val specialTs = convertSpecialTimestampNTZ(value).map(Literal(_, 
TimestampNTZType))
-  specialTs.getOrElse {
-val containsTimeZonePart 

[spark] branch branch-3.2 updated: [SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the SQL migration guide about foldable special datetime values

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 5198c0c  [SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the 
SQL migration guide about foldable special datetime values
5198c0c is described below

commit 5198c0c3163dadac1a8eab5c4dc46fd5bb508e35
Author: Max Gekk 
AuthorDate: Thu Aug 26 10:02:00 2021 +0800

[SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the SQL migration 
guide about foldable special datetime values

### What changes were proposed in this pull request?
In the PR, I propose to update an existing item in the SQL migration guide, 
and mention that Spark 3.2 supports foldable special datetime values as well.
https://user-images.githubusercontent.com/1580697/130860184-27f0ba56-6c2d-4a5a-91a8-195f2f8aa5da.png

### Why are the changes needed?
To inform users about actual Spark SQL behavior introduced by 
https://github.com/apache/spark/pull/33816

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By generating docs, and checking results manually.

Closes #33840 from MaxGekk/special-datetime-cast-migr-guide.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
(cherry picked from commit c4e739fb4bba6a764062a756035a27fdca4f72e5)
Signed-off-by: Wenchen Fan 
---
 docs/sql-migration-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index d2813a8..bdb4fe0 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -93,7 +93,7 @@ license: |
 
   - In Spark 3.2, `CREATE TABLE AS SELECT` with non-empty `LOCATION` will 
throw `AnalysisException`. To restore the behavior before Spark 3.2, you can 
set `spark.sql.legacy.allowNonEmptyLocationInCTAS` to `true`.
 
-  - In Spark 3.2, special datetime values such as `epoch`, `today`, 
`yesterday`, `tomorrow`, and `now` are supported in typed literals only, for 
instance, `select timestamp'now'`. In Spark 3.1 and 3.0, such special values 
are supported in any casts of strings to dates/timestamps. To keep these 
special values as dates/timestamps in Spark 3.1 and 3.0, you should replace 
them manually, e.g. `if (c in ('now', 'today'), current_date(), cast(c as 
date))`.
+  - In Spark 3.2, special datetime values such as `epoch`, `today`, 
`yesterday`, `tomorrow`, and `now` are supported in typed literals or in cast 
of foldable strings only, for instance, `select timestamp'now'` or `select 
cast('today' as date)`. In Spark 3.1 and 3.0, such special values are supported 
in any casts of strings to dates/timestamps. To keep these special values as 
dates/timestamps in Spark 3.1 and 3.0, you should replace them manually, e.g. 
`if (c in ('now', 'today'), current_ [...]
   
   - In Spark 3.2, `FloatType` is mapped to `FLOAT` in MySQL. Prior to this, it 
used to be mapped to `REAL`, which is by default a synonym to `DOUBLE 
PRECISION` in MySQL. 
 

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the SQL migration guide about foldable special datetime values

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new c4e739f  [SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the 
SQL migration guide about foldable special datetime values
c4e739f is described below

commit c4e739fb4bba6a764062a756035a27fdca4f72e5
Author: Max Gekk 
AuthorDate: Thu Aug 26 10:02:00 2021 +0800

[SPARK-35581][SPARK-36567][SQL][DOCS][FOLLOWUP] Update the SQL migration 
guide about foldable special datetime values

### What changes were proposed in this pull request?
In the PR, I propose to update an existing item in the SQL migration guide, 
and mention that Spark 3.2 supports foldable special datetime values as well.
https://user-images.githubusercontent.com/1580697/130860184-27f0ba56-6c2d-4a5a-91a8-195f2f8aa5da.png

### Why are the changes needed?
To inform users about actual Spark SQL behavior introduced by 
https://github.com/apache/spark/pull/33816

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By generating docs, and checking results manually.

Closes #33840 from MaxGekk/special-datetime-cast-migr-guide.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
---
 docs/sql-migration-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 47e7921..e0b18a3 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -117,7 +117,7 @@ license: |
 
   - In Spark 3.2, `CREATE TABLE AS SELECT` with non-empty `LOCATION` will 
throw `AnalysisException`. To restore the behavior before Spark 3.2, you can 
set `spark.sql.legacy.allowNonEmptyLocationInCTAS` to `true`.
 
-  - In Spark 3.2, special datetime values such as `epoch`, `today`, 
`yesterday`, `tomorrow`, and `now` are supported in typed literals only, for 
instance, `select timestamp'now'`. In Spark 3.1 and 3.0, such special values 
are supported in any casts of strings to dates/timestamps. To keep these 
special values as dates/timestamps in Spark 3.1 and 3.0, you should replace 
them manually, e.g. `if (c in ('now', 'today'), current_date(), cast(c as 
date))`.
+  - In Spark 3.2, special datetime values such as `epoch`, `today`, 
`yesterday`, `tomorrow`, and `now` are supported in typed literals or in cast 
of foldable strings only, for instance, `select timestamp'now'` or `select 
cast('today' as date)`. In Spark 3.1 and 3.0, such special values are supported 
in any casts of strings to dates/timestamps. To keep these special values as 
dates/timestamps in Spark 3.1 and 3.0, you should replace them manually, e.g. 
`if (c in ('now', 'today'), current_ [...]
   
   - In Spark 3.2, `FloatType` is mapped to `FLOAT` in MySQL. Prior to this, it 
used to be mapped to `REAL`, which is by default a synonym to `DOUBLE 
PRECISION` in MySQL. 
 

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.2 updated: [MINOR][3.2] Remove unused `numpy` import

2021-08-25 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new d841679  [MINOR][3.2] Remove unused `numpy` import
d841679 is described below

commit d841679ecce180d20090b8e15c129741a6175f87
Author: Dongjoon Hyun 
AuthorDate: Thu Aug 26 09:52:54 2021 +0900

[MINOR][3.2] Remove unused `numpy` import

### What changes were proposed in this pull request?

This fixes a Python linter failure.

### Why are the changes needed?

```
flake8 checks failed:
./python/pyspark/ml/tests/test_tuning.py:21:1: F401 'numpy as np' imported but unused
import numpy as np
F401 'numpy as np' imported but unused
Error: Process completed with exit code 1.
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Pass the GitHub Action Linter job.

Closes #33841 from dongjoon-hyun/unused_import.

Authored-by: Dongjoon Hyun 
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/ml/tests/test_tuning.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/pyspark/ml/tests/test_tuning.py 
b/python/pyspark/ml/tests/test_tuning.py
index c685dcc..3cde34f 100644
--- a/python/pyspark/ml/tests/test_tuning.py
+++ b/python/pyspark/ml/tests/test_tuning.py
@@ -18,7 +18,6 @@
 import tempfile
 import unittest
 
-import numpy as np
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.ml import Estimator, Pipeline, Model
 from pyspark.ml.classification import LogisticRegression, 
LogisticRegressionModel, OneVsRest

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-36058][K8S] Add support for statefulset APIs in K8s

2021-08-25 Thread holden
This is an automated email from the ASF dual-hosted git repository.

holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new ff3f3c4  [SPARK-36058][K8S] Add support for statefulset APIs in K8s
ff3f3c4 is described below

commit ff3f3c45668364da9bd10992791b5ee9a46fea21
Author: Holden Karau 
AuthorDate: Wed Aug 25 17:38:57 2021 -0700

[SPARK-36058][K8S] Add support for statefulset APIs in K8s

### What changes were proposed in this pull request?

Generalize the pod allocator and add support for statefulsets.
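
A rough sketch of what that generalization looks like (the interface and method names below are hypothetical, not the real API; only the idea of one allocator abstraction with a per-pod and a statefulset-backed implementation, selected by configuration, mirrors the new `AbstractPodsAllocator`/`StatefulsetPodsAllocator` split listed in the changes):
```scala
// Hypothetical sketch of the shape of the change, not Spark's actual classes.
trait PodsAllocator {
  def setTargetExecutors(n: Int): Unit
}

final class PerPodAllocator extends PodsAllocator {
  def setTargetExecutors(n: Int): Unit =
    println(s"request $n individual executor pods")           // stands in for per-pod requests
}

final class StatefulSetAllocator extends PodsAllocator {
  def setTargetExecutors(n: Int): Unit =
    println(s"scale the executor statefulset to $n replicas") // stands in for a replica update
}

def allocatorFor(kind: String): PodsAllocator = kind match {
  case "statefulset" => new StatefulSetAllocator
  case _             => new PerPodAllocator
}
```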

### Why are the changes needed?

Allocating individual pods in Spark can be not ideal for some clusters and 
using higher level operators like statefulsets and replicasets can be useful.

### Does this PR introduce _any_ user-facing change?

Yes, new config options.

### How was this patch tested?

Completed: New unit & basic integration test
PV integration tests

Closes #33508 from 
holdenk/SPARK-36058-support-replicasets-or-job-api-like-things.

Lead-authored-by: Holden Karau 
Co-authored-by: Holden Karau 
Signed-off-by: Holden Karau 
---
 .../executor/CoarseGrainedExecutorBackend.scala|   2 +-
 .../scala/org/apache/spark/storage/DiskStore.scala |   9 +-
 .../apache/spark/examples/MiniReadWriteTest.scala  | 139 +
 .../scala/org/apache/spark/deploy/k8s/Config.scala |  10 +
 .../org/apache/spark/deploy/k8s/Constants.scala|   1 +
 .../k8s/features/BasicExecutorFeatureStep.scala|  15 +-
 .../cluster/k8s/AbstractPodsAllocator.scala|  59 ++
 .../cluster/k8s/ExecutorPodsAllocator.scala|  18 +-
 .../cluster/k8s/ExecutorPodsSnapshot.scala |  11 +-
 .../cluster/k8s/KubernetesClusterManager.scala |  38 +++-
 .../cluster/k8s/KubernetesClusterMessage.scala |  21 ++
 .../k8s/KubernetesClusterSchedulerBackend.scala|  41 +++-
 .../cluster/k8s/KubernetesExecutorBackend.scala| 228 +
 .../cluster/k8s/StatefulsetPodsAllocator.scala | 201 ++
 .../features/BasicExecutorFeatureStepSuite.scala   |   6 +-
 .../cluster/k8s/ExecutorPodsAllocatorSuite.scala   |  31 ++-
 .../k8s/KubernetesClusterManagerSuite.scala|  58 ++
 .../KubernetesClusterSchedulerBackendSuite.scala   |  13 +-
 .../cluster/k8s/StatefulsetAllocatorSuite.scala| 153 ++
 .../src/main/dockerfiles/spark/entrypoint.sh   |   3 +-
 .../k8s/integrationtest/BasicTestsSuite.scala  |  17 ++
 .../k8s/integrationtest/KubernetesSuite.scala  |  33 ++-
 .../deploy/k8s/integrationtest/PVTestsSuite.scala  |  88 +++-
 23 files changed, 1148 insertions(+), 47 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
 
b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
index 76d01f8..c87e61a 100644
--- 
a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
+++ 
b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -61,7 +61,7 @@ private[spark] class CoarseGrainedExecutorBackend(
 
   private implicit val formats = DefaultFormats
 
-  private[executor] val stopping = new AtomicBoolean(false)
+  private[spark] val stopping = new AtomicBoolean(false)
   var executor: Executor = null
   @volatile var driver: Option[RpcEndpointRef] = None
 
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala 
b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
index 7269913..f0334c5 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
@@ -59,7 +59,14 @@ private[spark] class DiskStore(
*/
   def put(blockId: BlockId)(writeFunc: WritableByteChannel => Unit): Unit = {
 if (contains(blockId)) {
-  throw new IllegalStateException(s"Block $blockId is already present in 
the disk store")
+  logWarning(s"Block $blockId is already present in the disk store")
+  try {
+diskManager.getFile(blockId).delete()
+  } catch {
+case e: Exception =>
+  throw new IllegalStateException(
+s"Block $blockId is already present in the disk store and could 
not delete it $e")
+  }
 }
 logDebug(s"Attempting to put block $blockId")
 val startTimeNs = System.nanoTime()
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/MiniReadWriteTest.scala 
b/examples/src/main/scala/org/apache/spark/examples/MiniReadWriteTest.scala
new file mode 100644
index 000..5a74e1c
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/MiniReadWriteTest.scala
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ 

[spark] branch branch-3.2 updated: [SPARK-36585][SQL][DOCS] Support setting "since" version in FunctionRegistry

2021-08-25 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 4648412  [SPARK-36585][SQL][DOCS] Support setting "since" version in 
FunctionRegistry
4648412 is described below

commit 464841224ccf14b8fd8a1358e4e6b219f36fd1be
Author: Gengliang Wang 
AuthorDate: Wed Aug 25 22:32:20 2021 +0800

[SPARK-36585][SQL][DOCS] Support setting "since" version in FunctionRegistry

### What changes were proposed in this pull request?

Spark 3.2.0 includes two new functions `regexp` and `regexp_like`, which are identical to `rlike`. However, in the generated documentation, the since versions of both functions are `1.0.0` since they are based on the expression `RLike`:

- 
https://dist.apache.org/repos/dist/dev/spark/v3.2.0-rc1-docs/_site/api/sql/index.html#regexp
- 
https://dist.apache.org/repos/dist/dev/spark/v3.2.0-rc1-docs/_site/api/sql/index.html#regexp_like

This PR is to:
* Support setting `since` version in FunctionRegistry
* Correct the `since` version of `regexp` and `regexp_like`

### Why are the changes needed?

Correct the SQL doc
### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Run
```
sh sql/create-docs.sh
```
and check the SQL doc manually
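
Conceptually, the change lets a per-registration `since` override the version taken from the expression class's `ExpressionDescription` annotation. A toy sketch of that precedence only, not the `FunctionRegistryBase` API itself:
```scala
// Toy model: an explicit registration-site "since" wins over the class-level one.
final case class FunctionDoc(name: String, classSince: String, registrationSince: Option[String]) {
  def since: String = registrationSince.getOrElse(classSince)
}

// FunctionDoc("rlike",       "1.0.0", None).since          == "1.0.0"
// FunctionDoc("regexp_like", "1.0.0", Some("3.2.0")).since == "3.2.0"
```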

Closes #33834 from gengliangwang/allowSQLFunVersion.

Authored-by: Gengliang Wang 
Signed-off-by: Gengliang Wang 
(cherry picked from commit 18143fb42604a3ad7ed4bfd89084f7f0a6bce4d6)
Signed-off-by: Gengliang Wang 
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   | 33 ++
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 63b1525..00ece4d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -107,7 +107,9 @@ object FunctionRegistryBase {
* Return an expression info and a function builder for the function as 
defined by
* T using the given name.
*/
-  def build[T : ClassTag](name: String): (ExpressionInfo, Seq[Expression] => 
T) = {
+  def build[T : ClassTag](
+  name: String,
+  since: Option[String]): (ExpressionInfo, Seq[Expression] => T) = {
 val runtimeClass = scala.reflect.classTag[T].runtimeClass
 // For `RuntimeReplaceable`, skip the constructor with most arguments, 
which is the main
 // constructor and contains non-parameter `child` and should not be used 
as function builder.
@@ -150,13 +152,13 @@ object FunctionRegistryBase {
   }
 }
 
-(expressionInfo(name), builder)
+(expressionInfo(name, since), builder)
   }
 
   /**
* Creates an [[ExpressionInfo]] for the function as defined by T using the 
given name.
*/
-  def expressionInfo[T : ClassTag](name: String): ExpressionInfo = {
+  def expressionInfo[T : ClassTag](name: String, since: Option[String]): 
ExpressionInfo = {
 val clazz = scala.reflect.classTag[T].runtimeClass
 val df = clazz.getAnnotation(classOf[ExpressionDescription])
 if (df != null) {
@@ -170,7 +172,7 @@ object FunctionRegistryBase {
   df.examples(),
   df.note(),
   df.group(),
-  df.since(),
+  since.getOrElse(df.since()),
   df.deprecated(),
   df.source())
   } else {
@@ -495,12 +497,12 @@ object FunctionRegistry {
 expression[RegExpExtract]("regexp_extract"),
 expression[RegExpExtractAll]("regexp_extract_all"),
 expression[RegExpReplace]("regexp_replace"),
-expression[RLike]("regexp_like", true),
-expression[RLike]("regexp", true),
 expression[StringRepeat]("repeat"),
 expression[StringReplace]("replace"),
 expression[Overlay]("overlay"),
 expression[RLike]("rlike"),
+expression[RLike]("regexp_like", true, Some("3.2.0")),
+expression[RLike]("regexp", true, Some("3.2.0")),
 expression[StringRPad]("rpad"),
 expression[StringTrimRight]("rtrim"),
 expression[Sentences]("sentences"),
@@ -735,10 +737,19 @@ object FunctionRegistry {
 
   val functionSet: Set[FunctionIdentifier] = builtin.listFunction().toSet
 
-  /** See usage above. */
-  private def expression[T <: Expression : ClassTag](name: String, setAlias: 
Boolean = false)
-  : (String, (ExpressionInfo, FunctionBuilder)) = {
-val (expressionInfo, builder) = FunctionRegistryBase.build[T](name)
+  /**
+   * Create a SQL function builder and corresponding `ExpressionInfo`.
+   * @param name The function name.
+   * @param setAlias The alias name used in 

[spark] branch master updated (b2ff016 -> 18143fb)

2021-08-25 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from b2ff016  [SPARK-36398][SQL] Redact sensitive information in Spark 
Thrift Server log
 add 18143fb  [SPARK-36585][SQL][DOCS] Support setting "since" version in 
FunctionRegistry

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/FunctionRegistry.scala   | 33 ++
 1 file changed, 22 insertions(+), 11 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.1 updated: [SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

2021-08-25 Thread sarutak
This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
 new 8c0852c  [SPARK-36398][SQL] Redact sensitive information in Spark 
Thrift Server log
8c0852c is described below

commit 8c0852ca805a918cebe9f22166887128a03b3222
Author: Kousuke Saruta 
AuthorDate: Wed Aug 25 21:30:43 2021 +0900

[SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

### What changes were proposed in this pull request?

This PR fixes an issue where there is no way to redact sensitive information in the Spark Thrift Server log.
For example, a JDBC password can be exposed in the log:
```
21/08/25 18:52:37 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password="abcde")' with ca14ae38-1aaf-4bf4-a099-06b8e5337613
```

### Why are the changes needed?

Bug fix.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Ran ThriftServer, connected to it, and executed `CREATE TABLE mytbl2(a int) 
OPTIONS(url="jdbc:mysql//example.com:3306", driver="com.mysql.jdbc.Driver", 
dbtable="test_tbl", user="test_usr", password="abcde");` with 
`spark.sql.redaction.string.regex=((?i)(?<=password=))(".*")|('.*')`
Then, confirmed the log.
```
21/08/25 18:54:11 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password=*(redacted))' with ffc627e2-b1a8-4d83-ab6d-d819b3ccd909
```

Closes #33832 from sarutak/fix-SPARK-36398.

Authored-by: Kousuke Saruta 
Signed-off-by: Kousuke Saruta 
(cherry picked from commit b2ff01608f5ecdba19630e12478bd370f9766f7b)
Signed-off-by: Kousuke Saruta 
---
 .../spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index acb00e4..bb55bb0 100644
--- 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -219,8 +219,8 @@ private[hive] class SparkExecuteStatementOperation(
 
   override def runInternal(): Unit = {
 setState(OperationState.PENDING)
-logInfo(s"Submitting query '$statement' with $statementId")
 val redactedStatement = 
SparkUtils.redact(sqlContext.conf.stringRedactionPattern, statement)
+logInfo(s"Submitting query '$redactedStatement' with $statementId")
 HiveThriftServer2.eventManager.onStatementStart(
   statementId,
   parentSession.getSessionHandle.getSessionId.toString,

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.2 updated: [SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

2021-08-25 Thread sarutak
This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new fb38887  [SPARK-36398][SQL] Redact sensitive information in Spark 
Thrift Server log
fb38887 is described below

commit fb38887e001d33adef519d0288bd0844dcfe2bd5
Author: Kousuke Saruta 
AuthorDate: Wed Aug 25 21:30:43 2021 +0900

[SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

### What changes were proposed in this pull request?

This PR fixes an issue where there is no way to redact sensitive information in the Spark Thrift Server log.
For example, a JDBC password can be exposed in the log:
```
21/08/25 18:52:37 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password="abcde")' with ca14ae38-1aaf-4bf4-a099-06b8e5337613
```

### Why are the changes needed?

Bug fix.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Ran ThriftServer, connected to it, and executed `CREATE TABLE mytbl2(a int) 
OPTIONS(url="jdbc:mysql//example.com:3306", driver="com.mysql.jdbc.Driver", 
dbtable="test_tbl", user="test_usr", password="abcde");` with 
`spark.sql.redaction.string.regex=((?i)(?<=password=))(".*")|('.*')`
Then, confirmed the log.
```
21/08/25 18:54:11 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password=*(redacted))' with ffc627e2-b1a8-4d83-ab6d-d819b3ccd909
```

Closes #33832 from sarutak/fix-SPARK-36398.

Authored-by: Kousuke Saruta 
Signed-off-by: Kousuke Saruta 
(cherry picked from commit b2ff01608f5ecdba19630e12478bd370f9766f7b)
Signed-off-by: Kousuke Saruta 
---
 .../spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index 0df5885..4f40889 100644
--- 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -185,8 +185,8 @@ private[hive] class SparkExecuteStatementOperation(
 
   override def runInternal(): Unit = {
 setState(OperationState.PENDING)
-logInfo(s"Submitting query '$statement' with $statementId")
 val redactedStatement = 
SparkUtils.redact(sqlContext.conf.stringRedactionPattern, statement)
+logInfo(s"Submitting query '$redactedStatement' with $statementId")
 HiveThriftServer2.eventManager.onStatementStart(
   statementId,
   parentSession.getSessionHandle.getSessionId.toString,

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

2021-08-25 Thread sarutak
This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b2ff016  [SPARK-36398][SQL] Redact sensitive information in Spark 
Thrift Server log
b2ff016 is described below

commit b2ff01608f5ecdba19630e12478bd370f9766f7b
Author: Kousuke Saruta 
AuthorDate: Wed Aug 25 21:30:43 2021 +0900

[SPARK-36398][SQL] Redact sensitive information in Spark Thrift Server log

### What changes were proposed in this pull request?

This PR fixes an issue where there is no way to redact sensitive information in the Spark Thrift Server log.
For example, a JDBC password can be exposed in the log:
```
21/08/25 18:52:37 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password="abcde")' with ca14ae38-1aaf-4bf4-a099-06b8e5337613
```

### Why are the changes needed?

Bug fix.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Ran ThriftServer, connected to it, and executed `CREATE TABLE mytbl2(a int) 
OPTIONS(url="jdbc:mysql//example.com:3306", driver="com.mysql.jdbc.Driver", 
dbtable="test_tbl", user="test_usr", password="abcde");` with 
`spark.sql.redaction.string.regex=((?i)(?<=password=))(".*")|('.*')`
Then, confirmed the log.
```
21/08/25 18:54:11 INFO SparkExecuteStatementOperation: Submitting query 
'CREATE TABLE mytbl2(a int) OPTIONS(url="jdbc:mysql//example.com:3306", 
driver="com.mysql.jdbc.Driver", dbtable="test_tbl", user="test_usr", 
password=*(redacted))' with ffc627e2-b1a8-4d83-ab6d-d819b3ccd909
```

Closes #33832 from sarutak/fix-SPARK-36398.

Authored-by: Kousuke Saruta 
Signed-off-by: Kousuke Saruta 
---
 .../spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index 0df5885..4f40889 100644
--- 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -185,8 +185,8 @@ private[hive] class SparkExecuteStatementOperation(
 
   override def runInternal(): Unit = {
 setState(OperationState.PENDING)
-logInfo(s"Submitting query '$statement' with $statementId")
 val redactedStatement = 
SparkUtils.redact(sqlContext.conf.stringRedactionPattern, statement)
+logInfo(s"Submitting query '$redactedStatement' with $statementId")
 HiveThriftServer2.eventManager.onStatementStart(
   statementId,
   parentSession.getSessionHandle.getSessionId.toString,

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.2 updated: [SPARK-36567][SQL] Support foldable special datetime strings by `CAST`

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new a4c5140  [SPARK-36567][SQL] Support foldable special datetime strings 
by `CAST`
a4c5140 is described below

commit a4c514024270265fd5e7e7f7e0108727c900de41
Author: Max Gekk 
AuthorDate: Wed Aug 25 14:08:59 2021 +0800

[SPARK-36567][SQL] Support foldable special datetime strings by `CAST`

### What changes were proposed in this pull request?
In the PR, I propose to add a new correctness rule, `SpecialDatetimeValues`, to the final analysis phase. It replaces casts of strings to date/timestamp_ltz/timestamp_ntz with literals of those types if the strings contain special datetime values like `today`, `yesterday` and `tomorrow`, and the input strings are foldable.

### Why are the changes needed?
1. To avoid a breaking change.
2. To improve user experience with Spark SQL. After the PR 
https://github.com/apache/spark/pull/32714, users have to use typed literals 
instead of implicit casts. For instance,
in Spark 3.1:
```sql
select ts_col > 'now';
```
but the query fails at the moment, and users have to use a typed timestamp literal:
```sql
select ts_col > timestamp'now';
```

### Does this PR introduce _any_ user-facing change?
No. The previous release, 3.1, already supported the feature until it was removed by https://github.com/apache/spark/pull/32714.

### How was this patch tested?
1. Manually tested via the sql command line:
```sql
spark-sql> select cast('today' as date);
2021-08-24
spark-sql> select timestamp('today');
2021-08-24 00:00:00
spark-sql> select timestamp'tomorrow' > 'today';
true
```
2. By running new test suite:
```
$ build/sbt "sql/testOnly 
org.apache.spark.sql.catalyst.optimizer.SpecialDatetimeValuesSuite"
```

Closes #33816 from MaxGekk/foldable-datetime-special-values.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
(cherry picked from commit df0ec56723f0b47c3629055fa7a8c63bb4285147)
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |   4 +-
 .../sql/catalyst/optimizer/finishAnalysis.scala|  23 +
 .../optimizer/SpecialDatetimeValuesSuite.scala | 101 +
 3 files changed, 127 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 40b9d65..aeb236e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -158,7 +158,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
   RewriteNonCorrelatedExists,
   PullOutGroupingExpressions,
   ComputeCurrentTime,
-  ReplaceCurrentLike(catalogManager)) ::
+  ReplaceCurrentLike(catalogManager),
+  SpecialDatetimeValues) ::
 
//
 // Optimizer rules start here
 
//
@@ -265,6 +266,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
   EliminateView.ruleName ::
   ReplaceExpressions.ruleName ::
   ComputeCurrentTime.ruleName ::
+  SpecialDatetimeValues.ruleName ::
   ReplaceCurrentLike(catalogManager).ruleName ::
   RewriteDistinctAggregates.ruleName ::
   ReplaceDeduplicateWithAggregate.ruleName ::
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index deacc3b..daf4c5e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.TreePattern._
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, 
convertSpecialTimestamp, convertSpecialTimestampNTZ}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -119,3 +120,25 @@ case class ReplaceCurrentLike(catalogManager: 
CatalogManager) extends Rule[Logic
 }
   }
 }
+
+/**
+ * Replaces casts of special datetime strings by its date/timestamp values
+ * if the input strings are 

[spark] branch master updated: [SPARK-36567][SQL] Support foldable special datetime strings by `CAST`

2021-08-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new df0ec56  [SPARK-36567][SQL] Support foldable special datetime strings 
by `CAST`
df0ec56 is described below

commit df0ec56723f0b47c3629055fa7a8c63bb4285147
Author: Max Gekk 
AuthorDate: Wed Aug 25 14:08:59 2021 +0800

[SPARK-36567][SQL] Support foldable special datetime strings by `CAST`

### What changes were proposed in this pull request?
In the PR, I propose to add a new correctness rule, `SpecialDatetimeValues`, to the final analysis phase. It replaces casts of strings to date/timestamp_ltz/timestamp_ntz with literals of those types if the strings contain special datetime values like `today`, `yesterday` and `tomorrow`, and the input strings are foldable.

### Why are the changes needed?
1. To avoid a breaking change.
2. To improve user experience with Spark SQL. After the PR 
https://github.com/apache/spark/pull/32714, users have to use typed literals 
instead of implicit casts. For instance,
in Spark 3.1:
```sql
select ts_col > 'now';
```
but the query fails at the moment, and users have to use a typed timestamp literal:
```sql
select ts_col > timestamp'now';
```

### Does this PR introduce _any_ user-facing change?
No. The previous release, 3.1, already supported the feature until it was removed by https://github.com/apache/spark/pull/32714.

### How was this patch tested?
1. Manually tested via the sql command line:
```sql
spark-sql> select cast('today' as date);
2021-08-24
spark-sql> select timestamp('today');
2021-08-24 00:00:00
spark-sql> select timestamp'tomorrow' > 'today';
true
```
2. By running new test suite:
```
$ build/sbt "sql/testOnly 
org.apache.spark.sql.catalyst.optimizer.SpecialDatetimeValuesSuite"
```
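
The rule's idea, sketched on a hypothetical mini-AST rather than Catalyst's (only to illustrate why foldability matters: a literal string can be rewritten at analysis time, while a column reference cannot):
```scala
// Hypothetical mini-AST, not Catalyst: only a *foldable* string (a literal)
// is replaced by a date literal; non-foldable expressions stay as runtime casts.
import java.time.LocalDate

sealed trait Expr
final case class StrLit(value: String) extends Expr       // foldable string
final case class DateLit(value: LocalDate) extends Expr
final case class Col(name: String) extends Expr           // not foldable
final case class CastToDate(child: Expr) extends Expr

def specialDatetimeValues(e: Expr): Expr = e match {
  case CastToDate(StrLit("today"))     => DateLit(LocalDate.now())
  case CastToDate(StrLit("tomorrow"))  => DateLit(LocalDate.now().plusDays(1))
  case CastToDate(StrLit("yesterday")) => DateLit(LocalDate.now().minusDays(1))
  case other                           => other            // e.g. CastToDate(Col("c")) is untouched
}
```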

Closes #33816 from MaxGekk/foldable-datetime-special-values.

Authored-by: Max Gekk 
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |   4 +-
 .../sql/catalyst/optimizer/finishAnalysis.scala|  23 +
 .../optimizer/SpecialDatetimeValuesSuite.scala | 101 +
 3 files changed, 127 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index ea37cbb..ed4ab9c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -158,7 +158,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
   RewriteNonCorrelatedExists,
   PullOutGroupingExpressions,
   ComputeCurrentTime,
-  ReplaceCurrentLike(catalogManager)) ::
+  ReplaceCurrentLike(catalogManager),
+  SpecialDatetimeValues) ::
 
//
 // Optimizer rules start here
 
//
@@ -266,6 +267,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
   EliminateView.ruleName ::
   ReplaceExpressions.ruleName ::
   ComputeCurrentTime.ruleName ::
+  SpecialDatetimeValues.ruleName ::
   ReplaceCurrentLike(catalogManager).ruleName ::
   RewriteDistinctAggregates.ruleName ::
   ReplaceDeduplicateWithAggregate.ruleName ::
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index deacc3b..daf4c5e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.TreePattern._
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, 
convertSpecialTimestamp, convertSpecialTimestampNTZ}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -119,3 +120,25 @@ case class ReplaceCurrentLike(catalogManager: 
CatalogManager) extends Rule[Logic
 }
   }
 }
+
+/**
+ * Replaces casts of special datetime strings by its date/timestamp values
+ * if the input strings are foldable.
+ */
+object SpecialDatetimeValues extends Rule[LogicalPlan] {
+  private val conv = Map[DataType,