[spark] branch master updated (21c02ee -> e736c62)

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 21c02ee  [SPARK-30864][SQL][DOC] add the user guide for Adaptive Query 
Execution
 add e736c62  [SPARK-31116][SQL] Fix nested schema case-sensitivity in 
ParquetRowConverter

No new revisions were added by this update.

Summary of changes:
 .../datasources/parquet/ParquetRowConverter.scala  | 12 +--
 .../spark/sql/FileBasedDataSourceSuite.scala   | 40 ++
 2 files changed, 50 insertions(+), 2 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31116][SQL] Fix nested schema case-sensitivity in ParquetRowConverter

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new da1f95b  [SPARK-31116][SQL] Fix nested schema case-sensitivity in 
ParquetRowConverter
da1f95b is described below

commit da1f95be6b9af59a91a14e01613bdc4e8ac35374
Author: Tae-kyeom, Kim 
AuthorDate: Mon Mar 16 10:31:56 2020 -0700

[SPARK-31116][SQL] Fix nested schema case-sensitivity in ParquetRowConverter

### What changes were proposed in this pull request?

This PR (SPARK-31116) adds a caseSensitive parameter to ParquetRowConverter so
that it materializes Parquet data properly with respect to case sensitivity.

### Why are the changes needed?

Since Spark 3.0.0, the statement below throws an IllegalArgumentException in
case-insensitive mode because ParquetRowConverter looks up field indexes by exact
(case-sensitive) name. Since the Parquet requested schema and the Catalyst
requested schema are already constructed during schema clipping in
ParquetReadSupport, the converter should simply follow that resolution.

```scala
val path = "/some/temp/path"

spark
  .range(1L)
  .selectExpr("NAMED_STRUCT('lowercase', id, 'camelCase', id + 1) AS StructColumn")
  .write.parquet(path)

val caseInsensitiveSchema = new StructType()
  .add(
    "StructColumn",
    new StructType()
      .add("LowerCase", LongType)
      .add("camelcase", LongType))

spark.read.schema(caseInsensitiveSchema).parquet(path).show()
```
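
For illustration only (not Spark's actual code; field names are taken from the snippet above), here is a minimal, self-contained sketch of the lookup strategy the fix adopts: when `spark.sql.caseSensitive` is false, the name-to-index map is resolved case-insensitively, in the spirit of `CaseInsensitiveMap`, so `LowerCase` finds an index instead of throwing:

```scala
object FieldIndexSketch extends App {
  // Field names as declared by the struct written above; note the casing.
  val catalystFieldNames = Seq("lowercase", "camelCase")
  val caseSensitive = false // stands in for spark.sql.caseSensitive=false

  // The patch builds a name -> index map and wraps it in CaseInsensitiveMap when
  // case sensitivity is off; here that wrapper is emulated by lower-casing keys.
  val nameToIndex: Map[String, Int] =
    if (caseSensitive) catalystFieldNames.zipWithIndex.toMap
    else catalystFieldNames.zipWithIndex.map { case (n, i) => n.toLowerCase -> i }.toMap

  def fieldIndex(name: String): Int =
    nameToIndex(if (caseSensitive) name else name.toLowerCase)

  println(fieldIndex("LowerCase")) // 0; a strict, case-sensitive lookup would throw here
  println(fieldIndex("camelcase")) // 1
}
```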

### Does this PR introduce any user-facing change?

No. The changes are only in unreleased branches (`master` and `branch-3.0`).

### How was this patch tested?

Passed new test cases that check Parquet column selection with respect to
schemas and case sensitivity.

Closes #27888 from kimtkyeom/parquet_row_converter_case_sensitivity.

Authored-by: Tae-kyeom, Kim 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit e736c62764137b2c3af90d2dc8a77e391891200a)
Signed-off-by: Dongjoon Hyun 
---
 .../datasources/parquet/ParquetRowConverter.scala  | 12 +--
 .../spark/sql/FileBasedDataSourceSuite.scala   | 40 ++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
index 850adae..22422c0 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
@@ -33,8 +33,9 @@ import 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, DOUBLE
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, 
GenericArrayData}
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, 
CaseInsensitiveMap, DateTimeUtils, GenericArrayData}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLTimestamp
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -178,8 +179,15 @@ private[parquet] class ParquetRowConverter(
 
   // Converters for each field.
   private[this] val fieldConverters: Array[Converter with 
HasParentContainerUpdater] = {
+// (SPARK-31116) Use case insensitive map if spark.sql.caseSensitive is 
false
+// to prevent throwing IllegalArgumentException when searching catalyst 
type's field index
+val catalystFieldNameToIndex = if (SQLConf.get.caseSensitiveAnalysis) {
+  catalystType.fieldNames.zipWithIndex.toMap
+} else {
+  CaseInsensitiveMap(catalystType.fieldNames.zipWithIndex.toMap)
+}
 parquetType.getFields.asScala.map { parquetField =>
-  val fieldIndex = catalystType.fieldIndex(parquetField.getName)
+  val fieldIndex = catalystFieldNameToIndex(parquetField.getName)
   val catalystField = catalystType(fieldIndex)
   // Converted field value should be set to the `fieldIndex`-th cell of 
`currentRow`
   newConverter(parquetField, catalystField.dataType, new 
RowUpdater(currentRow, fieldIndex))
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index c870958..cb410b4 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSui

[spark] branch branch-3.0 updated: [SPARK-31146][SQL] Leverage the helper method for aliasing in built-in SQL expressions

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 9cf5d17  [SPARK-31146][SQL] Leverage the helper method for aliasing in 
built-in SQL expressions
9cf5d17 is described below

commit 9cf5d170910a65792d894d129a976ec486b4abc6
Author: HyukjinKwon 
AuthorDate: Mon Mar 16 11:22:34 2020 -0700

[SPARK-31146][SQL] Leverage the helper method for aliasing in built-in SQL 
expressions

### What changes were proposed in this pull request?

This PR is a followup of #26808. It leverages the helper method for aliasing in
built-in SQL expressions so that the alias is used as the output column name
where applicable.

- `Expression`, `UnaryMathExpression` and `BinaryMathExpression` search the 
alias in the tags by default.
- When the name differs in the implementation, it has to be overridden for that
expression specifically, e.g. `CallMethodViaReflection`, `Remainder`,
`CurrentTimestamp`, `FormatString` and `XPathDouble`.

This PR fixes the aliases of the functions below:

| class| alias|
|--|--|
|`Rand`|`random`  |
|`Ceil`|`ceiling` |
|`Remainder`   |`mod` |
|`Pow` |`pow` |
|`Signum`  |`sign`|
|`Chr` |`char`|
|`Length`  |`char_length` |
|`Length`  |`character_length`|
|`FormatString`|`printf`  |
|`Substring`   |`substr`  |
|`Upper`   |`ucase`   |
|`XPathDouble` |`xpath_number`|
|`DayOfMonth`  |`day` |
|`CurrentTimestamp`|`now` |
|`Size`|`cardinality` |
|`Sha1`|`sha` |
|`CallMethodViaReflection` |`java_method` |

Note: the `=` and `==` aliases of `EqualTo` were excluded because they cannot
leverage this helper method; the parser itself would need to be fixed.

Note: this PR also excludes some instances, such as `ToDegrees`, `ToRadians`,
`UnaryMinus` and `UnaryPositive`, that need their names explicitly overridden,
to keep the scope of this PR small.
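
As a hedged usage sketch of the effect (not taken from the PR itself; the exact column strings depend on the Spark build running it):

```scala
// Assumes a local SparkSession named `spark`. Per the PR description, the alias
// used at the call site should now appear in the output column name instead of
// the primary expression's name; the expectation below is illustrative only.
val df = spark.sql("SELECT ceiling(1.5), sign(-2), now()")
println(df.columns.toSeq) // expect names built from ceiling/sign/now, not ceil/signum/current_timestamp
```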

### Why are the changes needed?

To respect each expression's registered name.

### Does this PR introduce any user-facing change?

Yes, it will change the output column name.

### How was this patch tested?

Manually tested, and unit tests were added.

Closes #27901 from HyukjinKwon/31146.

Authored-by: HyukjinKwon 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 6704103499d2003b1879ff0b4b8e29141e401b9f)
Signed-off-by: Dongjoon Hyun 
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   | 38 --
 .../expressions/CallMethodViaReflection.scala  |  4 +--
 .../sql/catalyst/expressions/Expression.scala  |  5 +--
 .../sql/catalyst/expressions/aggregate/First.scala |  2 --
 .../sql/catalyst/expressions/aggregate/Last.scala  |  2 --
 .../sql/catalyst/expressions/arithmetic.scala  | 14 +++-
 .../catalyst/expressions/datetimeExpressions.scala |  4 ++-
 .../sql/catalyst/expressions/mathExpressions.scala | 10 +++---
 .../catalyst/expressions/stringExpressions.scala   |  7 ++--
 .../spark/sql/catalyst/expressions/xml/xpath.scala |  5 +--
 .../resources/sql-tests/results/operators.sql.out  | 14 
 .../sql-tests/results/postgreSQL/insert.sql.out|  2 +-
 .../sql-tests/results/postgreSQL/numeric.sql.out   |  2 +-
 .../sql-tests/results/postgreSQL/strings.sql.out   |  4 +--
 .../sql-tests/results/string-functions.sql.out |  6 ++--
 .../typeCoercion/native/implicitTypeCasts.sql.out  |  2 +-
 .../scala/org/apache/spark/sql/ExplainSuite.scala  |  4 +--
 17 files changed, 70 insertions(+), 55 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 6c4aee4..c11186e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -218,7 +218,7 @@ object FunctionRegistry {
 expression[PosExplode]("posexplode"),
 expressionGeneratorOuter[PosExplode]("posexplode_outer"),
 expression[Rand]("rand"),
-expression[Rand]("random"),
+expression[Rand]("random", true),
 expression[Randn]("randn"),
 expressio

[spark] branch master updated (3ce1dff -> 6704103)

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 3ce1dff  [SPARK-30930][ML] Remove ML/MLLIB DeveloperApi annotations
 add 6704103  [SPARK-31146][SQL] Leverage the helper method for aliasing in 
built-in SQL expressions

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/FunctionRegistry.scala   | 38 --
 .../expressions/CallMethodViaReflection.scala  |  4 +--
 .../sql/catalyst/expressions/Expression.scala  |  5 +--
 .../sql/catalyst/expressions/aggregate/First.scala |  2 --
 .../sql/catalyst/expressions/aggregate/Last.scala  |  2 --
 .../sql/catalyst/expressions/arithmetic.scala  | 14 +++-
 .../catalyst/expressions/datetimeExpressions.scala |  4 ++-
 .../sql/catalyst/expressions/mathExpressions.scala | 10 +++---
 .../catalyst/expressions/stringExpressions.scala   |  7 ++--
 .../spark/sql/catalyst/expressions/xml/xpath.scala |  5 +--
 .../resources/sql-tests/results/operators.sql.out  | 14 
 .../sql-tests/results/postgreSQL/insert.sql.out|  2 +-
 .../sql-tests/results/postgreSQL/numeric.sql.out   |  2 +-
 .../sql-tests/results/postgreSQL/strings.sql.out   |  4 +--
 .../sql-tests/results/string-functions.sql.out |  6 ++--
 .../typeCoercion/native/implicitTypeCasts.sql.out  |  2 +-
 .../scala/org/apache/spark/sql/ExplainSuite.scala  |  4 +--
 17 files changed, 70 insertions(+), 55 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle non-existed path

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new cb26f63  [SPARK-31163][SQL] TruncateTableCommand with acl/permission 
should handle non-existed path
cb26f63 is described below

commit cb26f636b08aea4c5c6bf5035a359cd3cbf335c0
Author: yi.wu 
AuthorDate: Mon Mar 16 11:45:25 2020 -0700

[SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle 
non-existed path

### What changes were proposed in this pull request?

This fixes #26956 by wrapping `fs.getFileStatus(path)` in a try-catch within the
acl/permission handling, in case the path doesn't exist.

### Why are the changes needed?

`truncate table` may fail to re-create the path in case of an interruption or
similar failure. As a result, the next time we `truncate table` on the same table
with acl/permission handling enabled, it fails with `FileNotFoundException`. It is
also a behavior change compared to previous Spark versions, which could still
`truncate table` successfully even if the path didn't exist.
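
A minimal sketch of the defensive pattern adopted here (the helper name is hypothetical; `TruncateTableCommand` writes the same pattern inline with vars):

```scala
import scala.util.control.NonFatal

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission

object TruncatePermissionSketch {
  // If the path is missing (e.g. a previous truncate was interrupted before
  // re-creating it), getFileStatus throws; the exception is swallowed and no
  // permission is recorded, so truncation can proceed.
  def permissionIfPresent(fs: FileSystem, path: Path): Option[FsPermission] =
    try {
      Some(fs.getFileStatus(path).getPermission)
    } catch {
      case NonFatal(_) => None // path may not exist yet; ignore and continue
    }
}
```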

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added UT.

Closes #27923 from Ngone51/fix_truncate.

Authored-by: yi.wu 
Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/execution/command/tables.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 21 +
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 6243261..d4de822 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -508,8 +508,8 @@ case class TruncateTableCommand(
   var optPermission: Option[FsPermission] = None
   var optAcls: Option[java.util.List[AclEntry]] = None
   if (!ignorePermissionAcl) {
-val fileStatus = fs.getFileStatus(path)
 try {
+  val fileStatus = fs.getFileStatus(path)
   optPermission = Some(fileStatus.getPermission())
 } catch {
   case NonFatal(_) => // do nothing
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 5a67dce..10ad8ac 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -2084,6 +2084,27 @@ abstract class DDLSuite extends QueryTest with 
SQLTestUtils {
 }
   }
 
+  test("SPARK-31163: acl/permission should handle non-existed path when 
truncating table") {
+withSQLConf(SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL.key -> "false") {
+  withTable("tab1") {
+sql("CREATE TABLE tab1 (col1 STRING, col2 INT) USING parquet 
PARTITIONED BY (col2)")
+sql("INSERT INTO tab1 SELECT 'one', 1")
+checkAnswer(spark.table("tab1"), Row("one", 1))
+val part = 
spark.sessionState.catalog.listPartitions(TableIdentifier("tab1")).head
+val path = new File(part.location.getPath)
+sql("TRUNCATE TABLE tab1")
+// simulate incomplete/unsuccessful truncate
+assert(path.exists())
+path.delete()
+assert(!path.exists())
+// execute without java.io.FileNotFoundException
+sql("TRUNCATE TABLE tab1")
+// partition path should be re-created
+assert(path.exists())
+  }
+}
+  }
+
   test("create temporary view with mismatched schema") {
 withTable("tab1") {
   spark.range(10).write.saveAsTable("tab1")


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle non-existed path

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new fed4a68  [SPARK-31163][SQL] TruncateTableCommand with acl/permission 
should handle non-existed path
fed4a68 is described below

commit fed4a680e3c45462d0d69aeac53780a677f988ae
Author: yi.wu 
AuthorDate: Mon Mar 16 11:45:25 2020 -0700

[SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle 
non-existed path

### What changes were proposed in this pull request?

This fixes #26956 by wrapping `fs.getFileStatus(path)` in a try-catch within the
acl/permission handling, in case the path doesn't exist.

### Why are the changes needed?

`truncate table` may fail to re-create the path in case of an interruption or
similar failure. As a result, the next time we `truncate table` on the same table
with acl/permission handling enabled, it fails with `FileNotFoundException`. It is
also a behavior change compared to previous Spark versions, which could still
`truncate table` successfully even if the path didn't exist.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added UT.

Closes #27923 from Ngone51/fix_truncate.

Authored-by: yi.wu 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit cb26f636b08aea4c5c6bf5035a359cd3cbf335c0)
Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/execution/command/tables.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 21 +
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 6243261..d4de822 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -508,8 +508,8 @@ case class TruncateTableCommand(
   var optPermission: Option[FsPermission] = None
   var optAcls: Option[java.util.List[AclEntry]] = None
   if (!ignorePermissionAcl) {
-val fileStatus = fs.getFileStatus(path)
 try {
+  val fileStatus = fs.getFileStatus(path)
   optPermission = Some(fileStatus.getPermission())
 } catch {
   case NonFatal(_) => // do nothing
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 5a67dce..10ad8ac 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -2084,6 +2084,27 @@ abstract class DDLSuite extends QueryTest with 
SQLTestUtils {
 }
   }
 
+  test("SPARK-31163: acl/permission should handle non-existed path when 
truncating table") {
+withSQLConf(SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL.key -> "false") {
+  withTable("tab1") {
+sql("CREATE TABLE tab1 (col1 STRING, col2 INT) USING parquet 
PARTITIONED BY (col2)")
+sql("INSERT INTO tab1 SELECT 'one', 1")
+checkAnswer(spark.table("tab1"), Row("one", 1))
+val part = 
spark.sessionState.catalog.listPartitions(TableIdentifier("tab1")).head
+val path = new File(part.location.getPath)
+sql("TRUNCATE TABLE tab1")
+// simulate incomplete/unsuccessful truncate
+assert(path.exists())
+path.delete()
+assert(!path.exists())
+// execute without java.io.FileNotFoundException
+sql("TRUNCATE TABLE tab1")
+// partition path should be re-created
+assert(path.exists())
+  }
+}
+  }
+
   test("create temporary view with mismatched schema") {
 withTable("tab1") {
   spark.range(10).write.saveAsTable("tab1")


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-2.4 updated: [SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle non-existed path

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new 26ad3fe  [SPARK-31163][SQL] TruncateTableCommand with acl/permission 
should handle non-existed path
26ad3fe is described below

commit 26ad3fe2604506a2cbc5de0ecd1b5771f3965cb4
Author: yi.wu 
AuthorDate: Mon Mar 16 11:45:25 2020 -0700

[SPARK-31163][SQL] TruncateTableCommand with acl/permission should handle 
non-existed path

### What changes were proposed in this pull request?

This fixes #26956 by wrapping `fs.getFileStatus(path)` in a try-catch within the
acl/permission handling, in case the path doesn't exist.

### Why are the changes needed?

`truncate table` may fail to re-create the path in case of an interruption or
similar failure. As a result, the next time we `truncate table` on the same table
with acl/permission handling enabled, it fails with `FileNotFoundException`. It is
also a behavior change compared to previous Spark versions, which could still
`truncate table` successfully even if the path didn't exist.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added UT.

Closes #27923 from Ngone51/fix_truncate.

Authored-by: yi.wu 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit cb26f636b08aea4c5c6bf5035a359cd3cbf335c0)
Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/execution/command/tables.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 21 +
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 28dc4a4..8f504d3 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -470,8 +470,8 @@ case class TruncateTableCommand(
   var optPermission: Option[FsPermission] = None
   var optAcls: Option[java.util.List[AclEntry]] = None
   if (!ignorePermissionAcl) {
-val fileStatus = fs.getFileStatus(path)
 try {
+  val fileStatus = fs.getFileStatus(path)
   optPermission = Some(fileStatus.getPermission())
 } catch {
   case NonFatal(_) => // do nothing
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index 0aabe86..73565f2 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -2009,6 +2009,27 @@ abstract class DDLSuite extends QueryTest with 
SQLTestUtils {
 }
   }
 
+  test("SPARK-31163: acl/permission should handle non-existed path when 
truncating table") {
+withSQLConf(SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL.key -> "false") {
+  withTable("tab1") {
+sql("CREATE TABLE tab1 (col1 STRING, col2 INT) USING parquet 
PARTITIONED BY (col2)")
+sql("INSERT INTO tab1 SELECT 'one', 1")
+checkAnswer(spark.table("tab1"), Row("one", 1))
+val part = 
spark.sessionState.catalog.listPartitions(TableIdentifier("tab1")).head
+val path = new File(part.location.getPath)
+sql("TRUNCATE TABLE tab1")
+// simulate incomplete/unsuccessful truncate
+assert(path.exists())
+path.delete()
+assert(!path.exists())
+// execute without java.io.FileNotFoundException
+sql("TRUNCATE TABLE tab1")
+// partition path should be re-created
+assert(path.exists())
+  }
+}
+  }
+
   test("create temporary view with mismatched schema") {
 withTable("tab1") {
   spark.range(10).write.saveAsTable("tab1")


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (d7b97a1 -> ed06d98)

2020-03-16 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from d7b97a1  [SPARK-31166][SQL] UNION map and other maps 
should not fail
 add ed06d98  [SPARK-25355][K8S] Add proxy user to driver if present on 
spark-submit

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/deploy/SparkSubmit.scala  |  4 ++
 .../org/apache/spark/deploy/SparkSubmitSuite.scala |  2 +
 .../apache/spark/deploy/k8s/KubernetesConf.scala   | 14 --
 .../k8s/features/DriverCommandFeatureStep.scala|  6 +++
 .../k8s/submit/KubernetesClientApplication.scala   | 12 +++--
 .../spark/deploy/k8s/KubernetesConfSuite.scala |  3 +-
 .../spark/deploy/k8s/KubernetesTestConf.scala  |  5 +-
 .../features/DriverCommandFeatureStepSuite.scala   | 54 +-
 8 files changed, 89 insertions(+), 11 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [MINOR][SQL] Update the DataFrameWriter.bucketBy comment

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 124b4ce  [MINOR][SQL] Update the DataFrameWriter.bucketBy comment
124b4ce is described below

commit 124b4ce2e6e8f84294f8fc13d3e731a82325dacb
Author: Takeshi Yamamuro 
AuthorDate: Tue Mar 17 00:52:45 2020 -0700

[MINOR][SQL] Update the DataFrameWriter.bucketBy comment

### What changes were proposed in this pull request?

This PR intends to update the `DataFrameWriter.bucketBy` comment to clearly
describe that the bucketBy scheme is Spark-specific.

I saw user questions about the current bucketing's compatibility with Hive in
[SPARK-31162](https://issues.apache.org/jira/browse/SPARK-31162?focusedCommentId=17060408&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17060408)
and
[SPARK-17495](https://issues.apache.org/jira/browse/SPARK-17495?focusedCommentId=17059847&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17059847),
and IMHO the comment is a bit confusing [...]
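
For context, a hedged usage sketch of the API the comment documents (table and column names are illustrative; assumes a SparkSession `spark` with a writable warehouse). The resulting layout uses Spark's own bucket hash function and is not readable as a Hive-bucketed table:

```scala
spark.range(100)
  .withColumnRenamed("id", "user_id")
  .write
  .bucketBy(4, "user_id")        // 4 buckets, hashed with Spark's scheme (not Hive's)
  .sortBy("user_id")
  .saveAsTable("bucketed_users") // bucketBy requires saveAsTable, not a path-based save
```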

### Why are the changes needed?

To help users understand this behavior.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

N/A

Closes #27930 from maropu/UpdateBucketByComment.

Authored-by: Takeshi Yamamuro 
    Signed-off-by: Dongjoon Hyun 
---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 22b26ca..6946c1f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -198,7 +198,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) 
{
 
   /**
* Buckets the output by the given columns. If specified, the output is laid 
out on the file
-   * system similar to Hive's bucketing scheme.
+   * system similar to Hive's bucketing scheme, but with a different bucket 
hash function
+   * and is not compatible with Hive's bucketing.
*
* This is applicable for all file-based data sources (e.g. Parquet, JSON) 
starting with Spark
* 2.1.0.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [MINOR][SQL] Update the DataFrameWriter.bucketBy comment

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 26ea213  [MINOR][SQL] Update the DataFrameWriter.bucketBy comment
26ea213 is described below

commit 26ea213f3c4f2acb07045bf0f6b476ddfb635436
Author: Takeshi Yamamuro 
AuthorDate: Tue Mar 17 00:52:45 2020 -0700

[MINOR][SQL] Update the DataFrameWriter.bucketBy comment

### What changes were proposed in this pull request?

This PR intends to update the `DataFrameWriter.bucketBy` comment to clearly
describe that the bucketBy scheme is Spark-specific.

I saw user questions about the current bucketing's compatibility with Hive in
[SPARK-31162](https://issues.apache.org/jira/browse/SPARK-31162?focusedCommentId=17060408&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17060408)
and
[SPARK-17495](https://issues.apache.org/jira/browse/SPARK-17495?focusedCommentId=17059847&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17059847),
and IMHO the comment is a bit confusing [...]

### Why are the changes needed?

To help users understand this behavior.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

N/A

Closes #27930 from maropu/UpdateBucketByComment.

Authored-by: Takeshi Yamamuro 
    Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 124b4ce2e6e8f84294f8fc13d3e731a82325dacb)
    Signed-off-by: Dongjoon Hyun 
---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 22b26ca..6946c1f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -198,7 +198,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) 
{
 
   /**
* Buckets the output by the given columns. If specified, the output is laid 
out on the file
-   * system similar to Hive's bucketing scheme.
+   * system similar to Hive's bucketing scheme, but with a different bucket 
hash function
+   * and is not compatible with Hive's bucketing.
*
* This is applicable for all file-based data sources (e.g. Parquet, JSON) 
starting with Spark
* 2.1.0.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-2.4 updated (26ad3fe -> 6a60c66)

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 26ad3fe  [SPARK-31163][SQL] TruncateTableCommand with acl/permission 
should handle non-existed path
 add 6a60c66  [MINOR][SQL] Update the DataFrameWriter.bucketBy comment

No new revisions were added by this update.

Summary of changes:
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at rolling event log folder and appStatusFile creation

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9f27a54  [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at 
rolling event log folder and appStatusFile creation
9f27a54 is described below

commit 9f27a5495d0114723701c932f20f32d308a571cc
Author: Adam Binford 
AuthorDate: Tue Mar 17 11:20:10 2020 -0700

[SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at rolling event 
log folder and appStatusFile creation

### What changes were proposed in this pull request?
This pull request fixes an issue with rolling event logs: the rolling event log
directory is now created ignoring the dfs umask setting, which allows the history
server to prune old rolling logs when it runs as the group owner of the event log
folder.

### Why are the changes needed?
For non-rolling event logs, log files are created ignoring the umask 
setting by calling setPermission after creating the file. The default umask of 
022 currently causes rolling log directories to be created without group write 
permissions, preventing the history server from pruning logs of applications 
not run as the same user as the history server. This adds the same behavior for 
rolling event logs so users don't need to worry about the umask setting causing 
different behavior.
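
A hedged sketch of the pattern this patch switches to (paths are illustrative; only the 770/660 modes come from the change itself). The static Hadoop helpers create the directory or file and then set the requested permission explicitly, so the dfs umask does not reduce the effective mode:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission

object EventLogDirSketch extends App {
  val fs = FileSystem.get(new Configuration())
  val dirPerm  = new FsPermission(Integer.parseInt("770", 8).toShort)
  val filePerm = new FsPermission(Integer.parseInt("660", 8).toShort)

  val logDir = new Path("/tmp/spark-events/app-0001") // illustrative path
  FileSystem.mkdirs(fs, logDir, dirPerm)              // umask-proof directory creation
  FileSystem.create(fs, new Path(logDir, "appstatus"), filePerm).close() // zero-byte marker file
}
```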

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Manually. The folder is created with the correct 770 permission. The status 
file is still affected by the umask setting, but that doesn't stop the folder 
from being deleted by the history server. I'm not sure if that causes any other 
issues. I'm not sure how to test something involving a Hadoop setting.

Closes #27764 from Kimahriman/bug/rolling-log-permissions.

Authored-by: Adam Binford 
Signed-off-by: Dongjoon Hyun 
---
 .../org/apache/spark/deploy/history/EventLogFileWriters.scala  | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala 
b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
index 1d58d05..7d44cbd 100644
--- 
a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
+++ 
b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
@@ -166,7 +166,8 @@ object EventLogFileWriter {
   val IN_PROGRESS = ".inprogress"
   val COMPACTED = ".compact"
 
-  val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 
8).toShort)
+  val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("660", 
8).toShort)
+  val LOG_FOLDER_PERMISSIONS = new FsPermission(Integer.parseInt("770", 
8).toShort)
 
   def apply(
   appId: String,
@@ -317,7 +318,8 @@ class RollingEventLogFilesWriter(
   throw new IOException(s"Target log directory already exists 
($logDirForAppPath)")
 }
 
-fileSystem.mkdirs(logDirForAppPath, 
EventLogFileWriter.LOG_FILE_PERMISSIONS)
+// SPARK-30860: use the class method to avoid the umask causing permission 
issues
+FileSystem.mkdirs(fileSystem, logDirForAppPath, 
EventLogFileWriter.LOG_FOLDER_PERMISSIONS)
 createAppStatusFile(inProgress = true)
 rollEventLogFile()
   }
@@ -361,7 +363,9 @@ class RollingEventLogFilesWriter(
 
   private def createAppStatusFile(inProgress: Boolean): Unit = {
 val appStatusPath = getAppStatusFilePath(logDirForAppPath, appId, 
appAttemptId, inProgress)
-val outputStream = fileSystem.create(appStatusPath)
+// SPARK-30860: use the class method to avoid the umask causing permission 
issues
+val outputStream = FileSystem.create(fileSystem, appStatusPath,
+  EventLogFileWriter.LOG_FILE_PERMISSIONS)
 // we intentionally create zero-byte file to minimize the cost
 outputStream.close()
   }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (5bc0d76 -> 9f27a54)

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 5bc0d76  [SPARK-31170][SQL] Spark SQL Cli should respect hive-site.xml 
and spark.sql.warehouse.dir
 add 9f27a54  [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at 
rolling event log folder and appStatusFile creation

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/deploy/history/EventLogFileWriters.scala  | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at rolling event log folder and appStatusFile creation

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 314627e  [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at 
rolling event log folder and appStatusFile creation
314627e is described below

commit 314627ee148264e170b0feaa623e1e1284932166
Author: Adam Binford 
AuthorDate: Tue Mar 17 11:20:10 2020 -0700

[SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at rolling event 
log folder and appStatusFile creation

### What changes were proposed in this pull request?
This pull request fixes an issue with rolling event logs: the rolling event log
directory is now created ignoring the dfs umask setting, which allows the history
server to prune old rolling logs when it runs as the group owner of the event log
folder.

### Why are the changes needed?
For non-rolling event logs, log files are created ignoring the umask 
setting by calling setPermission after creating the file. The default umask of 
022 currently causes rolling log directories to be created without group write 
permissions, preventing the history server from pruning logs of applications 
not run as the same user as the history server. This adds the same behavior for 
rolling event logs so users don't need to worry about the umask setting causing 
different behavior.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Manually. The folder is created with the correct 770 permission. The status 
file is still affected by the umask setting, but that doesn't stop the folder 
from being deleted by the history server. I'm not sure if that causes any other 
issues. I'm not sure how to test something involving a Hadoop setting.

Closes #27764 from Kimahriman/bug/rolling-log-permissions.

Authored-by: Adam Binford 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 9f27a5495d0114723701c932f20f32d308a571cc)
Signed-off-by: Dongjoon Hyun 
---
 .../org/apache/spark/deploy/history/EventLogFileWriters.scala  | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala 
b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
index 1d58d05..7d44cbd 100644
--- 
a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
+++ 
b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
@@ -166,7 +166,8 @@ object EventLogFileWriter {
   val IN_PROGRESS = ".inprogress"
   val COMPACTED = ".compact"
 
-  val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 
8).toShort)
+  val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("660", 
8).toShort)
+  val LOG_FOLDER_PERMISSIONS = new FsPermission(Integer.parseInt("770", 
8).toShort)
 
   def apply(
   appId: String,
@@ -317,7 +318,8 @@ class RollingEventLogFilesWriter(
   throw new IOException(s"Target log directory already exists 
($logDirForAppPath)")
 }
 
-fileSystem.mkdirs(logDirForAppPath, 
EventLogFileWriter.LOG_FILE_PERMISSIONS)
+// SPARK-30860: use the class method to avoid the umask causing permission 
issues
+FileSystem.mkdirs(fileSystem, logDirForAppPath, 
EventLogFileWriter.LOG_FOLDER_PERMISSIONS)
 createAppStatusFile(inProgress = true)
 rollEventLogFile()
   }
@@ -361,7 +363,9 @@ class RollingEventLogFilesWriter(
 
   private def createAppStatusFile(inProgress: Boolean): Unit = {
 val appStatusPath = getAppStatusFilePath(logDirForAppPath, appId, 
appAttemptId, inProgress)
-val outputStream = fileSystem.create(appStatusPath)
+// SPARK-30860: use the class method to avoid the umask causing permission 
issues
+val outputStream = FileSystem.create(fileSystem, appStatusPath,
+  EventLogFileWriter.LOG_FILE_PERMISSIONS)
 // we intentionally create zero-byte file to minimize the cost
 outputStream.close()
   }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (9f27a54 -> dc5ebc2)

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 9f27a54  [SPARK-30860][CORE] Use FileSystem.mkdirs to avoid umask at 
rolling event log folder and appStatusFile creation
 add dc5ebc2  [SPARK-31171][SQL] size(null) should return null under ansi 
mode

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala   |  9 ++---
 .../catalyst/expressions/CollectionExpressionsSuite.scala|  6 ++
 .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 12 
 3 files changed, 24 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31171][SQL] size(null) should return null under ansi mode

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new b9371e3  [SPARK-31171][SQL] size(null) should return null under ansi 
mode
b9371e3 is described below

commit b9371e38abbc351106453b14172d6919be0eca82
Author: Wenchen Fan 
AuthorDate: Tue Mar 17 11:48:54 2020 -0700

[SPARK-31171][SQL] size(null) should return null under ansi mode

Make `size(null)` return null under ANSI mode, regardless of the 
`spark.sql.legacy.sizeOfNull` config.

In https://github.com/apache/spark/pull/27834, we changed the result of
`size(null)` to -1 to match the 2.4 behavior and avoid breaking changes.

However, it's true that the "return -1" behavior is error-prone when used with
aggregate functions. The current ANSI mode controls a bunch of "better behaviors"
like failing on overflow. We don't enable these "better behaviors" by default
because they are too breaking. The "return null" behavior of `size(null)` is a
good fit for ANSI mode.
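
A hedged sketch of the resulting behavior (assumes a SparkSession `spark`; the commented results restate the description above rather than verified output):

```scala
spark.conf.set("spark.sql.legacy.sizeOfNull", "true")

spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT size(CAST(NULL AS ARRAY<INT>))").show() // legacy behavior: -1

spark.conf.set("spark.sql.ansi.enabled", "true")
spark.sql("SELECT size(CAST(NULL AS ARRAY<INT>))").show() // under ANSI mode: NULL
```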

No, as ANSI mode is off by default.

new tests

Closes #27936 from cloud-fan/null.

Authored-by: Wenchen Fan 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit dc5ebc2d5b8122121d89a9175737bea95ae10126)
Signed-off-by: Dongjoon Hyun 
---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala   |  9 ++---
 .../catalyst/expressions/CollectionExpressionsSuite.scala|  6 ++
 .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 12 
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index e49593e..1331350 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1940,8 +1940,8 @@ object SQLConf {
 
   val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull")
 .internal()
-.doc("If it is set to true, size of null returns -1. This behavior was 
inherited from Hive. " +
-  "The size function returns null for null input if the flag is disabled.")
+.doc(s"If it is set to false, or ${ANSI_ENABLED.key} is true, then size of 
null returns " +
+  "null. Otherwise, it returns -1, which was inherited from Hive.")
 .booleanConf
 .createWithDefault(true)
 
@@ -2759,7 +2759,10 @@ class SQLConf extends Serializable with Logging {
 
   def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)
 
-  def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL)
+  def legacySizeOfNull: Boolean = {
+// size(null) should return null under ansi mode.
+getConf(SQLConf.LEGACY_SIZE_OF_NULL) && !getConf(ANSI_ENABLED)
+  }
 
   def isReplEagerEvalEnabled: Boolean = 
getConf(SQLConf.REPL_EAGER_EVAL_ENABLED)
 
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 3cfc66f..173f248 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -74,6 +74,12 @@ class CollectionExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper
 withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
   testSize(sizeOfNull = null)
 }
+// size(null) should return null under ansi mode.
+withSQLConf(
+  SQLConf.LEGACY_SIZE_OF_NULL.key -> "true",
+  SQLConf.ANSI_ENABLED.key -> "true") {
+  testSize(sizeOfNull = null)
+}
   }
 
   test("MapKeys/MapValues") {
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index a613c33..c41eb98 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -490,6 +490,12 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
 withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
   testSizeOfArray(sizeOfNull = null)
 }
+// size(null) should return null under ansi mode.
+withSQLConf(
+  SQLConf.LEGACY_SIZE_OF_NULL.key -> "true",
+  SQLConf.ANSI_ENABLED.key -> "true") {
+  testSize

[spark] branch master updated: [SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are not yet dead

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 57d27e9  [SPARK-31125][K8S] Terminating pods have a deletion timestamp 
but they are not yet dead
57d27e9 is described below

commit 57d27e900f79e6c5699b9a23db236aae98e761ad
Author: Holden Karau 
AuthorDate: Tue Mar 17 12:04:06 2020 -0700

[SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are 
not yet dead

### What changes were proposed in this pull request?

Change what we consider a deleted pod to not include "Terminating"

### Why are the changes needed?

If we get a new snapshot while a pod is in the process of being cleaned up, we
shouldn't delete the executor until it is fully terminated.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

This should be covered by the decommissioning tests, which are currently flaky
because we sometimes delete the executor instead of allowing it to decommission
all the way.

I also ran this in a loop locally ~80 times with the only failures being 
the PV suite because of unrelated minikube mount issues.
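
For readers skimming past the diff, a consolidated sketch of the check this patch
introduces (it mirrors the hunk below; `Pod` is the fabric8 kubernetes-client
model class):

```scala
import java.util.Locale
import io.fabric8.kubernetes.api.model.Pod

// A pod with a deletion timestamp is treated as deleted only once its phase is
// no longer "terminating"; until then the snapshot reports it as PodTerminating.
def isDeleted(pod: Pod): Boolean = {
  pod.getMetadata.getDeletionTimestamp != null &&
    (pod.getStatus == null ||
      pod.getStatus.getPhase == null ||
      pod.getStatus.getPhase.toLowerCase(Locale.ROOT) != "terminating")
}
```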

Closes #27905 from holdenk/SPARK-31125-Processing-state-snapshots-incorrect.

Authored-by: Holden Karau 
Signed-off-by: Dongjoon Hyun 
---
 .../spark/scheduler/cluster/k8s/ExecutorPodStates.scala   |  2 ++
 .../spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala| 11 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
index 83daddf..34fca29 100644
--- 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
+++ 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
@@ -34,4 +34,6 @@ case class PodFailed(pod: Pod) extends FinalPodState
 
 case class PodDeleted(pod: Pod) extends FinalPodState
 
+case class PodTerminating(pod: Pod) extends FinalPodState
+
 case class PodUnknown(pod: Pod) extends ExecutorPodState
diff --git 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
index 435a5f1..30030ab 100644
--- 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
+++ 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
@@ -64,6 +64,8 @@ object ExecutorPodsSnapshot extends Logging {
   PodFailed(pod)
 case "succeeded" =>
   PodSucceeded(pod)
+case "terminating" =>
+  PodTerminating(pod)
 case _ =>
   logWarning(s"Received unknown phase $phase for executor pod with 
name" +
 s" ${pod.getMetadata.getName} in namespace 
${pod.getMetadata.getNamespace}")
@@ -72,5 +74,12 @@ object ExecutorPodsSnapshot extends Logging {
 }
   }
 
-  private def isDeleted(pod: Pod): Boolean = 
pod.getMetadata.getDeletionTimestamp != null
+  private def isDeleted(pod: Pod): Boolean = {
+(pod.getMetadata.getDeletionTimestamp != null &&
+  (
+pod.getStatus == null ||
+pod.getStatus.getPhase == null ||
+pod.getStatus.getPhase.toLowerCase(Locale.ROOT) != "terminating"
+  ))
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31047][SQL] Improve file listing for ViewFileSystem

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4e4e08f  [SPARK-31047][SQL] Improve file listing for ViewFileSystem
4e4e08f is described below

commit 4e4e08f372db888797fd23faca88ac02d9466d5a
Author: manuzhang 
AuthorDate: Tue Mar 17 14:23:28 2020 -0700

[SPARK-31047][SQL] Improve file listing for ViewFileSystem

### What changes were proposed in this pull request?
Use `listLocatedStatus` when `InMemoryFileIndex` is listing files from a
`ViewFileSystem`, which should delegate to that of `DistributedFileSystem`.

### Why are the changes needed?
When `ViewFileSystem` is used to manage several `DistributedFileSystem`
instances, this change improves the performance of file listing, especially when
there are many files.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Existing tests.
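
A hedged usage sketch: the mount table name and path below are assumptions, not
from the patch. When a table's files sit behind a viewfs:// mount that federates
several HDFS namespaces, `InMemoryFileIndex` now lists them via
`listLocatedStatus` (a single listing that already carries block locations)
instead of per-file block-location lookups after `listStatus`.

```scala
// Illustrative path only; "federated-cluster" is a hypothetical viewfs mount table.
val events = spark.read.parquet("viewfs://federated-cluster/warehouse/events")
println(events.count())
```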

Closes #27801 from manuzhang/spark-31047.

Authored-by: manuzhang 
Signed-off-by: Dongjoon Hyun 
---
 .../execution/datasources/InMemoryFileIndex.scala  |  3 ++-
 .../sql/execution/datasources/FileIndexSuite.scala | 25 +-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
index cac2d6e..84160f3 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
@@ -23,6 +23,7 @@ import scala.collection.mutable
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
+import org.apache.hadoop.fs.viewfs.ViewFileSystem
 import org.apache.hadoop.hdfs.DistributedFileSystem
 import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
 
@@ -313,7 +314,7 @@ object InMemoryFileIndex extends Logging {
 // to retrieve the file status with the file block location. The 
reason to still fallback
 // to listStatus is because the default implementation would 
potentially throw a
 // FileNotFoundException which is better handled by doing the lookups 
manually below.
-case _: DistributedFileSystem if !ignoreLocality =>
+case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality 
=>
   val remoteIter = fs.listLocatedStatus(path)
   new Iterator[LocatedFileStatus]() {
 def next(): LocatedFileStatus = remoteIter.next
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
index 553773e..ea15f18 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
@@ -22,7 +22,11 @@ import java.net.URI
 
 import scala.collection.mutable
 
-import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, 
Path, RawLocalFileSystem}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, 
Path, RawLocalFileSystem, RemoteIterator}
+import org.apache.hadoop.fs.viewfs.ViewFileSystem
+import org.mockito.ArgumentMatchers.any
+import org.mockito.Mockito.{mock, when}
 
 import org.apache.spark.SparkException
 import org.apache.spark.metrics.source.HiveCatalogMetrics
@@ -465,6 +469,25 @@ class FileIndexSuite extends SharedSparkSession {
   }
 }
   }
+
+  test("SPARK-31047 - Improve file listing for ViewFileSystem") {
+val path = mock(classOf[Path])
+val dfs = mock(classOf[ViewFileSystem])
+when(path.getFileSystem(any[Configuration])).thenReturn(dfs)
+val statuses =
+  Seq(
+new LocatedFileStatus(
+  new FileStatus(0, false, 0, 100, 0,
+new Path("file")), Array(new BlockLocation()))
+  )
+when(dfs.listLocatedStatus(path)).thenReturn(new 
RemoteIterator[LocatedFileStatus] {
+  val iter = statuses.toIterator
+  override def hasNext: Boolean = iter.hasNext
+  override def next(): LocatedFileStatus = iter.next
+})
+val fileIndex = new TestInMemoryFileIndex(spark, path)
+assert(fileIndex.leafFileStatuses.toSeq == statuses)
+  }
 }
 
 object DeletionRaceFileSystem {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-30954][ML][R] Make file name the same as class name

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 6f0b0f1  [SPARK-30954][ML][R] Make file name the same as class name
6f0b0f1 is described below

commit 6f0b0f1655e5d8fbfb20ed0a58b740e6c9a6ae50
Author: Qianyang Yu 
AuthorDate: Tue Mar 17 16:15:02 2020 -0700

[SPARK-30954][ML][R] Make file name the same as class name

This PR solves the same issue as
[PR 27919](https://github.com/apache/spark/pull/27919), but this one changes the
file names based on a comment from the previous PR.

### What changes were proposed in this pull request?

Make some of the file names the same as the class names in the R package.

### Why are the changes needed?

Make the file names consistent.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

run `./R/run-tests.sh`

Closes #27940 from kevinyu98/spark-30954-r-v2.

Authored-by: Qianyang Yu 
Signed-off-by: Dongjoon Hyun 
---
 ...reeClassificationWrapper.scala => DecisionTreeClassifierWrapper.scala} | 0
 ...sionTreeRegressionWrapper.scala => DecisionTreeRegressorWrapper.scala} | 0
 .../ml/r/{GBTClassificationWrapper.scala => GBTClassifierWrapper.scala}   | 0
 .../spark/ml/r/{GBTRegressionWrapper.scala => GBTRegressorWrapper.scala}  | 0
 ...estClassificationWrapper.scala => RandomForestClassifierWrapper.scala} | 0
 ...omForestRegressionWrapper.scala => RandomForestRegressorWrapper.scala} | 0
 6 files changed, 0 insertions(+), 0 deletions(-)

diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassificationWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassificationWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressionWrapper.scala
 b/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressionWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala
rename to mllib/src/main/scala/org/apache/spark/ml/r/GBTClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressionWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressionWrapper.scala
rename to mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressorWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala
 b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressorWrapper.scala


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-30954][ML][R] Make file name the same as class name

2020-03-17 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new a7c2525  [SPARK-30954][ML][R] Make file name the same as class name
a7c2525 is described below

commit a7c25259c97085f6518f88fbcc3f0f4bde2995af
Author: Qianyang Yu 
AuthorDate: Tue Mar 17 16:15:02 2020 -0700

[SPARK-30954][ML][R] Make file name the same as class name

This PR solves the same issue as
[PR 27919](https://github.com/apache/spark/pull/27919), but this one changes the
file names based on a comment from the previous PR.

### What changes were proposed in this pull request?

Make some of the file names the same as the class names in the R package.

### Why are the changes needed?

Make the file names consistent.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

run `./R/run-tests.sh`

Closes #27940 from kevinyu98/spark-30954-r-v2.

Authored-by: Qianyang Yu 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 6f0b0f1655e5d8fbfb20ed0a58b740e6c9a6ae50)
Signed-off-by: Dongjoon Hyun 
---
 ...reeClassificationWrapper.scala => DecisionTreeClassifierWrapper.scala} | 0
 ...sionTreeRegressionWrapper.scala => DecisionTreeRegressorWrapper.scala} | 0
 .../ml/r/{GBTClassificationWrapper.scala => GBTClassifierWrapper.scala}   | 0
 .../spark/ml/r/{GBTRegressionWrapper.scala => GBTRegressorWrapper.scala}  | 0
 ...estClassificationWrapper.scala => RandomForestClassifierWrapper.scala} | 0
 ...omForestRegressionWrapper.scala => RandomForestRegressorWrapper.scala} | 0
 6 files changed, 0 insertions(+), 0 deletions(-)

diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassificationWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassificationWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressionWrapper.scala
 b/mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressionWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala
rename to mllib/src/main/scala/org/apache/spark/ml/r/GBTClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressionWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressionWrapper.scala
rename to mllib/src/main/scala/org/apache/spark/ml/r/GBTRegressorWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassifierWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassifierWrapper.scala
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala
 b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressorWrapper.scala
similarity index 100%
rename from 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala
rename to 
mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressorWrapper.scala


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (ac262cb -> c6a6d5e)

2020-03-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from ac262cb  [SPARK-30292][SQL][FOLLOWUP] ansi cast from strings to 
integral numbers (byte/short/int/long) should fail with fraction
 add c6a6d5e  Revert "[SPARK-31170][SQL] Spark SQL Cli should respect 
hive-site.xml and spark.sql.warehouse.dir"

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/sql/internal/SharedState.scala| 80 +++---
 .../sql/hive/thriftserver/SparkSQLCLIDriver.scala  |  2 -
 .../spark/sql/hive/thriftserver/CliSuite.scala | 12 
 .../spark/sql/hive/HiveSharedStateSuite.scala  |  1 +
 .../spark/sql/hive/HiveSparkSubmitSuite.scala  |  2 +-
 5 files changed, 42 insertions(+), 55 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: Revert "[SPARK-31170][SQL] Spark SQL Cli should respect hive-site.xml and spark.sql.warehouse.dir"

2020-03-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 476aaee  Revert "[SPARK-31170][SQL] Spark SQL Cli should respect 
hive-site.xml and spark.sql.warehouse.dir"
476aaee is described below

commit 476aaee7089d051dae49d2a44de69df3b79248a0
Author: Dongjoon Hyun 
AuthorDate: Thu Mar 19 16:13:50 2020 -0700

Revert "[SPARK-31170][SQL] Spark SQL Cli should respect hive-site.xml and 
spark.sql.warehouse.dir"

This reverts commit 321341a4c3104380035350631c82a4b385f117e4.

Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/internal/SharedState.scala| 80 +++---
 .../sql/hive/thriftserver/SparkSQLCLIDriver.scala  |  2 -
 .../spark/sql/hive/thriftserver/CliSuite.scala | 12 
 .../spark/sql/hive/HiveSharedStateSuite.scala  |  1 +
 .../spark/sql/hive/HiveSparkSubmitSuite.scala  |  2 +-
 5 files changed, 42 insertions(+), 55 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
index eb74e96..5347264 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
@@ -41,6 +41,7 @@ import 
org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, Streamin
 import org.apache.spark.status.ElementTrackingStore
 import org.apache.spark.util.Utils
 
+
 /**
  * A class that holds all state shared across sessions in a given 
[[SQLContext]].
  *
@@ -54,10 +55,45 @@ private[sql] class SharedState(
 
   SharedState.setFsUrlStreamHandlerFactory(sparkContext.conf)
 
+  // Load hive-site.xml into hadoopConf and determine the warehouse path we 
want to use, based on
+  // the config from both hive and Spark SQL. Finally set the warehouse config 
value to sparkConf.
+  val warehousePath: String = {
+val configFile = 
Utils.getContextOrSparkClassLoader.getResource("hive-site.xml")
+if (configFile != null) {
+  logInfo(s"loading hive config file: $configFile")
+  sparkContext.hadoopConfiguration.addResource(configFile)
+}
+
+// hive.metastore.warehouse.dir only stay in hadoopConf
+sparkContext.conf.remove("hive.metastore.warehouse.dir")
+// Set the Hive metastore warehouse path to the one we use
+val hiveWarehouseDir = 
sparkContext.hadoopConfiguration.get("hive.metastore.warehouse.dir")
+if (hiveWarehouseDir != null && 
!sparkContext.conf.contains(WAREHOUSE_PATH.key)) {
+  // If hive.metastore.warehouse.dir is set and spark.sql.warehouse.dir is 
not set,
+  // we will respect the value of hive.metastore.warehouse.dir.
+  sparkContext.conf.set(WAREHOUSE_PATH.key, hiveWarehouseDir)
+  logInfo(s"${WAREHOUSE_PATH.key} is not set, but 
hive.metastore.warehouse.dir " +
+s"is set. Setting ${WAREHOUSE_PATH.key} to the value of " +
+s"hive.metastore.warehouse.dir ('$hiveWarehouseDir').")
+  hiveWarehouseDir
+} else {
+  // If spark.sql.warehouse.dir is set, we will override 
hive.metastore.warehouse.dir using
+  // the value of spark.sql.warehouse.dir.
+  // When neither spark.sql.warehouse.dir nor hive.metastore.warehouse.dir 
is set,
+  // we will set hive.metastore.warehouse.dir to the default value of 
spark.sql.warehouse.dir.
+  val sparkWarehouseDir = sparkContext.conf.get(WAREHOUSE_PATH)
+  logInfo(s"Setting hive.metastore.warehouse.dir ('$hiveWarehouseDir') to 
the value of " +
+s"${WAREHOUSE_PATH.key} ('$sparkWarehouseDir').")
+  sparkContext.hadoopConfiguration.set("hive.metastore.warehouse.dir", 
sparkWarehouseDir)
+  sparkWarehouseDir
+}
+  }
+  logInfo(s"Warehouse path is '$warehousePath'.")
+
+  // These 2 variables should be initiated after `warehousePath`, because in 
the first place we need
+  // to load hive-site.xml into hadoopConf and determine the warehouse path 
which will be set into
+  // both spark conf and hadoop conf avoiding be affected by any SparkSession 
level options
   private val (conf, hadoopConf) = {
-// Load hive-site.xml into hadoopConf and determine the warehouse path 
which will be set into
-// both spark conf and hadoop conf avoiding be affected by any 
SparkSession level options
-SharedState.loadHiveConfFile(sparkContext.conf, 
sparkContext.hadoopConfiguration)
 val confClone = sparkContext.conf.clone()
 val hadoopConfClone = new Configuration(sparkContext.hadoopConfiguration)
 // If `SparkSession` is instantiated using an existing `SparkContext` 
instance and no existing
@@ -130,7 +166,7 @@ 

[spark] branch master updated (c6a6d5e -> ca499e9)

2020-03-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from c6a6d5e  Revert "[SPARK-31170][SQL] Spark SQL Cli should respect 
hive-site.xml and spark.sql.warehouse.dir"
 add ca499e9  [SPARK-25121][SQL] Supports multi-part table names for 
broadcast hint resolution

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/HintErrorLogger.scala|  7 +-
 .../spark/sql/catalyst/analysis/ResolveHints.scala | 71 +++-
 .../spark/sql/catalyst/expressions/package.scala   |  2 +-
 .../spark/sql/catalyst/plans/logical/hints.scala   |  3 +-
 .../spark/sql/catalyst/analysis/AnalysisTest.scala |  2 +
 .../sql/catalyst/analysis/ResolveHintsSuite.scala  | 48 +++
 .../sql/catalyst/analysis/TestRelations.scala  |  2 +
 .../org/apache/spark/sql/DataFrameJoinSuite.scala  | 98 +-
 .../spark/sql/execution/GlobalTempViewSuite.scala  | 24 +-
 9 files changed, 230 insertions(+), 27 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-25121][SQL] Supports multi-part table names for broadcast hint resolution

2020-03-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 7d4c1b8  [SPARK-25121][SQL] Supports multi-part table names for 
broadcast hint resolution
7d4c1b8 is described below

commit 7d4c1b894ef32170b421f51c42cf30198c35c21b
Author: Takeshi Yamamuro 
AuthorDate: Thu Mar 19 20:11:04 2020 -0700

[SPARK-25121][SQL] Supports multi-part table names for broadcast hint 
resolution

### What changes were proposed in this pull request?

This PR fixes the code to respect a database name for broadcast table hint
resolution.
Currently, Spark ignores the database name in multi-part names:
```
scala> sql("CREATE DATABASE testDb")
scala> spark.range(10).write.saveAsTable("testDb.t")

// without this patch
scala> spark.range(10).join(spark.table("testDb.t"), 
"id").hint("broadcast", "testDb.t").explain
== Physical Plan ==
*(2) Project [id#24L]
+- *(2) BroadcastHashJoin [id#24L], [id#26L], Inner, BuildLeft
   :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, 
false]))
   :  +- *(1) Range (0, 10, step=1, splits=4)
   +- *(2) Project [id#26L]
  +- *(2) Filter isnotnull(id#26L)
 +- *(2) FileScan parquet testdb.t[id#26L] Batched: true, Format: 
Parquet, Location: 
InMemoryFileIndex[file:/Users/maropu/Repositories/spark/spark-2.3.1-bin-hadoop2.7/spark-warehouse...,
 PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: 
struct

// with this patch
scala> spark.range(10).join(spark.table("testDb.t"), 
"id").hint("broadcast", "testDb.t").explain
== Physical Plan ==
*(2) Project [id#3L]
+- *(2) BroadcastHashJoin [id#3L], [id#5L], Inner, BuildRight
   :- *(2) Range (0, 10, step=1, splits=4)
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, 
true]))
  +- *(1) Project [id#5L]
 +- *(1) Filter isnotnull(id#5L)
+- *(1) FileScan parquet testdb.t[id#5L] Batched: true, Format: 
Parquet, Location: 
InMemoryFileIndex[file:/Users/maropu/Repositories/spark/spark-master/spark-warehouse/testdb.db/t],
 PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: 
struct
```

This PR comes from https://github.com/apache/spark/pull/22198

### Why are the changes needed?

For better usability.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added unit tests.

Closes #27935 from maropu/SPARK-25121-2.

Authored-by: Takeshi Yamamuro 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit ca499e94091ae62a6ee76ea779d7b2b4cf2dbc5c)
Signed-off-by: Dongjoon Hyun 
---
 .../sql/catalyst/analysis/HintErrorLogger.scala|  7 +-
 .../spark/sql/catalyst/analysis/ResolveHints.scala | 71 +++-
 .../spark/sql/catalyst/expressions/package.scala   |  2 +-
 .../spark/sql/catalyst/plans/logical/hints.scala   |  3 +-
 .../spark/sql/catalyst/analysis/AnalysisTest.scala |  2 +
 .../sql/catalyst/analysis/ResolveHintsSuite.scala  | 48 +++
 .../sql/catalyst/analysis/TestRelations.scala  |  2 +
 .../org/apache/spark/sql/DataFrameJoinSuite.scala  | 98 +-
 .../spark/sql/execution/GlobalTempViewSuite.scala  | 24 +-
 9 files changed, 230 insertions(+), 27 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala
index c6e0c74..71c6d40 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala
@@ -24,15 +24,16 @@ import 
org.apache.spark.sql.catalyst.plans.logical.{HintErrorHandler, HintInfo}
  * The hint error handler that logs warnings for each hint error.
  */
 object HintErrorLogger extends HintErrorHandler with Logging {
+  import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
 
   override def hintNotRecognized(name: String, parameters: Seq[Any]): Unit = {
 logWarning(s"Unrecognized hint: ${hintToPrettyString(name, parameters)}")
   }
 
   override def hintRelationsNotFound(
-  name: String, parameters: Seq[Any], invalidRelations: Set[String]): Unit 
= {
-invalidRelations.foreach { n =>
-  logWarning(s"Count not find relation '$n' specified in hint " +
+  name: String, parameters: Seq[Any], invalidRelations: Set[Seq[String]]): 
Unit = {
+invalidRelations.foreach { ident =>
+  logWarning(s"Count not find relati

[spark] branch branch-3.0 updated: [SPARK-31181][SQL][TESTS] Remove the default value assumption on CREATE TABLE test cases

2020-03-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new c42f9f6  [SPARK-31181][SQL][TESTS] Remove the default value assumption 
on CREATE TABLE test cases
c42f9f6 is described below

commit c42f9f61f40d4d796413ab2bfb58fbeff5ceb68b
Author: Dongjoon Hyun 
AuthorDate: Fri Mar 20 12:28:57 2020 +0800

[SPARK-31181][SQL][TESTS] Remove the default value assumption on CREATE 
TABLE test cases

A few `CREATE TABLE` test cases have assumptions about the default value
of `LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED`. This PR (SPARK-31181) makes
the test cases more explicit on the test-case side.

The configuration change was tested via
https://github.com/apache/spark/pull/27894 while discussing SPARK-31136. This
PR contains only the test case part from that PR.

This makes our test cases more robust with respect to the default value of
`LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED`. Even if we switch the conf value
later, that will be a one-liner with no test case changes.

No.

Pass the Jenkins with the existing tests.
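
A minimal sketch of the pattern the updated tests follow (`withSQLConf` comes
from Spark's SQLHelper test trait, as seen in the diff below; the body is only a
placeholder fragment, and the expectation in the comment assumes the non-legacy
behavior of the flag):

```scala
import org.apache.spark.sql.internal.SQLConf

// Pin the conf explicitly so the expectation below does not depend on its default.
withSQLConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "false") {
  val sql = "CREATE TABLE t(a INT)"   // no USING clause
  // ... assert that `sql` is parsed/executed as a data source table here ...
}
```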

Closes #27946 from dongjoon-hyun/SPARK-EXPLICIT-TEST.

Authored-by: Dongjoon Hyun 
Signed-off-by: Wenchen Fan 
(cherry picked from commit f1cc86792f825f76c2689660764d62d7be0d1989)
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/catalyst/parser/DDLParserSuite.scala | 27 --
 .../sql-tests/inputs/describe-table-column.sql |  2 +-
 .../sql-tests/inputs/postgreSQL/create_view.sql|  2 +-
 .../results/describe-table-column.sql.out  |  2 +-
 .../results/postgreSQL/create_view.sql.out |  2 +-
 .../spark/sql/connector/DataSourceV2SQLSuite.scala |  2 ++
 .../sql/execution/command/DDLParserSuite.scala |  3 ++-
 .../apache/spark/sql/hive/StatisticsSuite.scala|  4 +++-
 8 files changed, 26 insertions(+), 18 deletions(-)

diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
index e3570899..35a54c8 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.{EqualTo, 
Literal}
 import org.apache.spark.sql.catalyst.plans.logical._
 import 
org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition.{after, first}
 import org.apache.spark.sql.connector.expressions.{ApplyTransform, 
BucketTransform, DaysTransform, FieldReference, HoursTransform, 
IdentityTransform, LiteralValue, MonthsTransform, Transform, YearsTransform}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{IntegerType, LongType, StringType, 
StructType, TimestampType}
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -2117,18 +2118,20 @@ class DDLParserSuite extends AnalysisTest {
   }
 
   test("create table - without using") {
-val sql = "CREATE TABLE 1m.2g(a INT)"
-val expectedTableSpec = TableSpec(
-  Seq("1m", "2g"),
-  Some(new StructType().add("a", IntegerType)),
-  Seq.empty[Transform],
-  None,
-  Map.empty[String, String],
-  None,
-  Map.empty[String, String],
-  None,
-  None)
+withSQLConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> 
"false") {
+  val sql = "CREATE TABLE 1m.2g(a INT)"
+  val expectedTableSpec = TableSpec(
+Seq("1m", "2g"),
+Some(new StructType().add("a", IntegerType)),
+Seq.empty[Transform],
+None,
+Map.empty[String, String],
+None,
+Map.empty[String, String],
+None,
+None)
 
-testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false)
+  testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = 
false)
+}
   }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql 
b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
index 821cb47..d55e398 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
@@ -52,7 +52,7 @@ DROP TABLE desc_complex_col_table;
 
 --Test case insensitive
 
-CREATE TABLE customer(CName STRING);
+CREATE TABLE customer(CName STRING) USING PARQUET;
 
 INSERT INTO customer VALUES('Maria');
 
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql 
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql
index 39e7084..21f

[spark] branch master updated: [SPARK-31184][SQL] Support getTablesByType API of Hive Client

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 3a48ea1  [SPARK-31184][SQL] Support getTablesByType API of Hive Client
3a48ea1 is described below

commit 3a48ea1fe0fb85253f12d86caea01ffcb7e678d0
Author: Eric Wu <492960...@qq.com>
AuthorDate: Sat Mar 21 17:41:23 2020 -0700

[SPARK-31184][SQL] Support getTablesByType API of Hive Client

### What changes were proposed in this pull request?
Hive 2.3+ supports the `getTablesByType` API, which provides an efficient way to
get Hive tables of a specific type. Now, we have the following mappings when
using `HiveExternalCatalog`.
```
CatalogTableType.EXTERNAL  =>  HiveTableType.EXTERNAL_TABLE
CatalogTableType.MANAGED => HiveTableType.MANAGED_TABLE
CatalogTableType.VIEW => HiveTableType.VIRTUAL_VIEW
```
Without this API, we need to achieve the goal by `getTables` + 
`getTablesByName` + `filter with type`.

This PR adds `getTablesByType` to `HiveShim`. For Hive versions that don't
support this API, an `UnsupportedOperationException` will be thrown, and the
upper logic should catch the exception and fall back to the filter solution
mentioned above.
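
A hedged usage sketch of the new method (the method itself is in the diff below;
note that `HiveClient` is a `private[hive]` internal API, so this only compiles
inside Spark's sql/hive module, and the database name and pattern are
illustrative assumptions):

```scala
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.hive.client.HiveClient

// Lists only the views in the given database. On Hive 2.3+ this goes through the
// new getTablesByType shim; on older clients it falls back to getTables +
// getTablesByName + a type filter, as described above.
def listViews(client: HiveClient, db: String): Seq[String] =
  client.listTablesByType(db, "*", CatalogTableType.VIEW)
```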

Since the JDK11-related fix in `Hive` is not released yet, manual tests
against Hive 2.3.7-SNAPSHOT were done by following the instructions in
SPARK-29245.

### Why are the changes needed?
This API provides better usability and performance when we want to get a list of
Hive tables with a specific type, for example `HiveTableType.VIRTUAL_VIEW`
corresponding to `CatalogTableType.VIEW`.

### Does this PR introduce any user-facing change?
No, this is a support function.

### How was this patch tested?
Added tests in VersionsSuite and manually ran the JDK11 test with the following
settings:

- Hive 2.3.6 Metastore on JDK8
- Hive 2.3.7-SNAPSHOT library build from source of Hive 2.3 branch
- Spark build with Hive 2.3.7-SNAPSHOT on jdk-11.0.6

Closes #27952 from Eric5553/GetTableByType.

Authored-by: Eric Wu <492960...@qq.com>
    Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/hive/client/HiveClient.scala  |  9 +
 .../spark/sql/hive/client/HiveClientImpl.scala | 42 --
 .../apache/spark/sql/hive/client/HiveShim.scala| 35 +-
 .../spark/sql/hive/client/VersionsSuite.scala  | 22 +---
 4 files changed, 92 insertions(+), 16 deletions(-)

diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
index e31dffa..3ea80ea 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
@@ -61,6 +61,15 @@ private[hive] trait HiveClient {
   /** Returns the names of tables in the given database that matches the given 
pattern. */
   def listTables(dbName: String, pattern: String): Seq[String]
 
+  /**
+   * Returns the names of tables with specific tableType in the given database 
that matches
+   * the given pattern.
+   */
+  def listTablesByType(
+  dbName: String,
+  pattern: String,
+  tableType: CatalogTableType): Seq[String]
+
   /** Sets the name of current database. */
   def setCurrentDatabase(databaseName: String): Unit
 
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 4a3e813..6ad5e9d 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -755,6 +755,22 @@ private[hive] class HiveClientImpl(
 client.getTablesByPattern(dbName, pattern).asScala
   }
 
+  override def listTablesByType(
+  dbName: String,
+  pattern: String,
+  tableType: CatalogTableType): Seq[String] = withHiveState {
+try {
+  // Try with Hive API getTablesByType first, it's supported from Hive 
2.3+.
+  shim.getTablesByType(client, dbName, pattern, toHiveTableType(tableType))
+} catch {
+  case _: UnsupportedOperationException =>
+// Fallback to filter logic if getTablesByType not supported.
+val tableNames = client.getTablesByPattern(dbName, pattern).asScala
+val tables = getTablesByName(dbName, tableNames).filter(_.tableType == 
tableType)
+tables.map(_.identifier.table)
+}
+  }
+
   /**
* Runs the specified SQL query using Hive.
*/
@@ -1011,25 +1027,29 @@ private[hive] object HiveClientImpl extends Logging {
   private def toOutputFormat(name: String) =
 Utils.classForName[org.apache.hadoop.hive.q

[spark] branch master updated (fae981e -> 3a48ea1)

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from fae981e  [SPARK-30773][ML] Support NativeBlas for level-1 routines
 add 3a48ea1  [SPARK-31184][SQL] Support getTablesByType API of Hive Client

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/sql/hive/client/HiveClient.scala  |  9 +
 .../spark/sql/hive/client/HiveClientImpl.scala | 42 --
 .../apache/spark/sql/hive/client/HiveShim.scala| 35 +-
 .../spark/sql/hive/client/VersionsSuite.scala  | 22 +---
 4 files changed, 92 insertions(+), 16 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-30715][K8S][TESTS][FOLLOWUP] Update k8s client version in IT as well

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 3799d2b  [SPARK-30715][K8S][TESTS][FOLLOWUP] Update k8s client version 
in IT as well
3799d2b is described below

commit 3799d2b9d842f4b9f4e78bf701f5e123f0061bad
Author: Prashant Sharma 
AuthorDate: Sat Mar 21 18:26:53 2020 -0700

[SPARK-30715][K8S][TESTS][FOLLOWUP] Update k8s client version in IT as well

### What changes were proposed in this pull request?
This is a follow-up for SPARK-30715. It brings the Kubernetes client version in
sync between integration-tests and kubernetes/core.

### Why are the changes needed?
More than once, the Kubernetes client version has gone out of sync between the
integration tests and kubernetes/core. So this brings them back in sync and adds
a comment to save us from needing this kind of follow-up in the future.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Manually.

Closes #27948 from ScrapCodes/follow-up-spark-30715.

Authored-by: Prashant Sharma 
Signed-off-by: Dongjoon Hyun 
---
 resource-managers/kubernetes/core/pom.xml | 1 +
 resource-managers/kubernetes/integration-tests/pom.xml| 2 +-
 .../apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala | 8 
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/resource-managers/kubernetes/core/pom.xml 
b/resource-managers/kubernetes/core/pom.xml
index 18793de..b527816 100644
--- a/resource-managers/kubernetes/core/pom.xml
+++ b/resource-managers/kubernetes/core/pom.xml
@@ -29,6 +29,7 @@
   Spark Project Kubernetes
   
 kubernetes
+
 4.7.1
   
 
diff --git a/resource-managers/kubernetes/integration-tests/pom.xml 
b/resource-managers/kubernetes/integration-tests/pom.xml
index a522e87..7a889c4 100644
--- a/resource-managers/kubernetes/integration-tests/pom.xml
+++ b/resource-managers/kubernetes/integration-tests/pom.xml
@@ -29,7 +29,7 @@
 1.3.0
 1.4.0
 
-4.6.4
+4.7.1
 3.2.2
 1.0
 kubernetes-integration-tests
diff --git 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala
 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala
index eaaf67d..4de7e70 100644
--- 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala
+++ 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala
@@ -61,15 +61,15 @@ class KubernetesSuite extends SparkFunSuite
   protected var appLocator: String = _
 
   // Default memory limit is 1024M + 384M (minimum overhead constant)
-  private val baseMemory = s"${1024 + 384}Mi"
+  private val baseMemory = s"${1024 + 384}"
   protected val memOverheadConstant = 0.8
-  private val standardNonJVMMemory = s"${(1024 + 0.4*1024).toInt}Mi"
+  private val standardNonJVMMemory = s"${(1024 + 0.4*1024).toInt}"
   protected val additionalMemory = 200
   // 209715200 is 200Mi
   protected val additionalMemoryInBytes = 209715200
-  private val extraDriverTotalMemory = s"${(1024 + 
memOverheadConstant*1024).toInt}Mi"
+  private val extraDriverTotalMemory = s"${(1024 + 
memOverheadConstant*1024).toInt}"
   private val extraExecTotalMemory =
-s"${(1024 + memOverheadConstant*1024 + additionalMemory).toInt}Mi"
+s"${(1024 + memOverheadConstant*1024 + additionalMemory).toInt}"
 
   /**
* Build the image ref for the given image name, taking the repo and tag 
from the


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (3799d2b -> bf342ba)

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 3799d2b  [SPARK-30715][K8S][TESTS][FOLLOWUP] Update k8s client version 
in IT as well
 add bf342ba  [SPARK-30541][TESTS] Implement KafkaDelegationTokenSuite with 
testRetry

No new revisions were added by this update.

Summary of changes:
 external/kafka-0-10-sql/src/test/resources/log4j.properties   | 4 +++-
 .../org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala | 2 +-
 .../src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala | 4 
 project/SparkBuild.scala  | 3 ++-
 4 files changed, 10 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-30541][TESTS] Implement KafkaDelegationTokenSuite with testRetry

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 4a5f695  [SPARK-30541][TESTS] Implement KafkaDelegationTokenSuite with 
testRetry
4a5f695 is described below

commit 4a5f6955ba9667298bf13e5cdaa7703d5389b083
Author: Gabor Somogyi 
AuthorDate: Sat Mar 21 18:59:29 2020 -0700

[SPARK-30541][TESTS] Implement KafkaDelegationTokenSuite with testRetry

### What changes were proposed in this pull request?
`KafkaDelegationTokenSuite` has been ignored because it showed flaky
behaviour. In this PR I've changed how the test is executed and turned it on
again. This PR contains the following:
* The test runs in a separate JVM in order to avoid a modified security context
* The body of the test runs in `testRetry`, which retries if it fails (see the sketch right after this list)
* Additional logs to analyse possible failures
* Enhanced clean-up code
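
A minimal sketch of the `testRetry` pattern mentioned above (it assumes Spark's
internal `SparkFunSuite` helper, which the diff below uses; the test body is a
placeholder):

```scala
import org.apache.spark.SparkFunSuite

class ExampleRetrySuite extends SparkFunSuite {
  // Re-runs the body up to 3 times before reporting a failure, which absorbs
  // transient flakiness such as the Kerberos/Kafka setup races described above.
  testRetry("flaky roundtrip", 3) {
    assert(someFlakyCondition())
  }

  private def someFlakyCondition(): Boolean = true // placeholder
}
```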

### Why are the changes needed?
`KafkaDelegationTokenSuite ` is ignored.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Executed the test in a loop 1k+ times on Jenkins (it is much harder to reproduce
locally).

Closes #27877 from gaborgsomogyi/SPARK-30541.

Authored-by: Gabor Somogyi 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit bf342bafa81738a47d511d3aa02812a1ccc0ecab)
Signed-off-by: Dongjoon Hyun 
---
 external/kafka-0-10-sql/src/test/resources/log4j.properties   | 4 +++-
 .../org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala | 2 +-
 .../src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala | 4 
 project/SparkBuild.scala  | 3 ++-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/external/kafka-0-10-sql/src/test/resources/log4j.properties 
b/external/kafka-0-10-sql/src/test/resources/log4j.properties
index 75e3b53..daf0572 100644
--- a/external/kafka-0-10-sql/src/test/resources/log4j.properties
+++ b/external/kafka-0-10-sql/src/test/resources/log4j.properties
@@ -25,4 +25,6 @@ log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd 
HH:mm:ss.SSS} %t %p %c{
 
 # Ignore messages below warning level from Jetty, because it's a bit verbose
 log4j.logger.org.spark-project.jetty=WARN
-
+log4j.logger.org.apache.spark.sql.kafka010.KafkaTestUtils=DEBUG
+log4j.logger.org.apache.directory.server.kerberos.kdc.authentication=DEBUG
+log4j.logger.org.apache.directory.server.core.DefaultDirectoryService=DEBUG
diff --git 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala
 
b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala
index 79239e5..702bd4f 100644
--- 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala
+++ 
b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala
@@ -62,7 +62,7 @@ class KafkaDelegationTokenSuite extends StreamTest with 
SharedSparkSession with
 }
   }
 
-  ignore("Roundtrip") {
+  testRetry("Roundtrip", 3) {
 val hadoopConf = new Configuration()
 val manager = new HadoopDelegationTokenManager(spark.sparkContext.conf, 
hadoopConf, null)
 val credentials = new Credentials()
diff --git 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
 
b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
index 7b972fe..c1ca557 100644
--- 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
+++ 
b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
@@ -170,6 +170,7 @@ class KafkaTestUtils(
 
 kdc.getKrb5conf.delete()
 Files.write(krb5confStr, kdc.getKrb5conf, StandardCharsets.UTF_8)
+logDebug(s"krb5.conf file content: $krb5confStr")
   }
 
   private def addedKrb5Config(key: String, value: String): String = {
@@ -309,6 +310,7 @@ class KafkaTestUtils(
 }
 brokerReady = false
 zkReady = false
+kdcReady = false
 
 if (producer != null) {
   producer.close()
@@ -317,6 +319,7 @@ class KafkaTestUtils(
 
 if (adminClient != null) {
   adminClient.close()
+  adminClient = null
 }
 
 if (server != null) {
@@ -351,6 +354,7 @@ class KafkaTestUtils(
 Configuration.getConfiguration.refresh()
 if (kdc != null) {
   kdc.stop()
+  kdc = null
 }
 UserGroupInformation.reset()
 teardownKrbDebug()
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 1a2a7c3..3889013 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -479,7 +4

[spark] branch master updated: [SPARK-31101][BUILD] Upgrade Janino to 3.0.16

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new f55f6b5  [SPARK-31101][BUILD] Upgrade Janino to 3.0.16
f55f6b5 is described below

commit f55f6b569beea3636549f8a71949cd2bca2a813b
Author: Jungtaek Lim (HeartSaVioR) 
AuthorDate: Sat Mar 21 19:10:23 2020 -0700

[SPARK-31101][BUILD] Upgrade Janino to 3.0.16

### What changes were proposed in this pull request?

This PR (SPARK-31101) proposes to upgrade Janino to 3.0.16, which was released
recently.

* Merged pull request janino-compiler/janino#114 "Grow the code for 
relocatables, and do fixup, and relocate".

Please see the commit log.
- https://github.com/janino-compiler/janino/commits/3.0.16

You can see the changelog at
http://janino-compiler.github.io/janino/changelog.html, though the release note
for Janino 3.0.16 is actually incorrect.

### Why are the changes needed?

We got reports of failures on user queries where Janino throws an error while
compiling the generated code. The issue is here: janino-compiler/janino#113. It
contains the generated code, the symptom (error), and an analysis of the bug, so
please refer to the link for more details.
Janino 3.0.16 contains the PR janino-compiler/janino#114, which enables Janino
to compile such user queries properly.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Existing UTs.

Closes #27932 from HeartSaVioR/SPARK-31101-janino-3.0.16.

Authored-by: Jungtaek Lim (HeartSaVioR) 
Signed-off-by: Dongjoon Hyun 
---
 dev/deps/spark-deps-hadoop-2.7-hive-1.2 | 4 ++--
 dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 4 ++--
 dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 4 ++--
 pom.xml | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 
b/dev/deps/spark-deps-hadoop-2.7-hive-1.2
index 6ab937f..8f4d48f 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-1.2
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-1.2
@@ -35,7 +35,7 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar
 commons-cli/1.2//commons-cli-1.2.jar
 commons-codec/1.10//commons-codec-1.10.jar
 commons-collections/3.2.2//commons-collections-3.2.2.jar
-commons-compiler/3.0.15//commons-compiler-3.0.15.jar
+commons-compiler/3.0.16//commons-compiler-3.0.16.jar
 commons-compress/1.8.1//commons-compress-1.8.1.jar
 commons-configuration/1.6//commons-configuration-1.6.jar
 commons-crypto/1.0.0//commons-crypto-1.0.0.jar
@@ -105,7 +105,7 @@ jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
 jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar
 jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar
 jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar
-janino/3.0.15//janino-3.0.15.jar
+janino/3.0.16//janino-3.0.16.jar
 javassist/3.25.0-GA//javassist-3.25.0-GA.jar
 javax.inject/1//javax.inject-1.jar
 javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 
b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
index 6f56381..3b1d3ad 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -33,7 +33,7 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar
 commons-cli/1.2//commons-cli-1.2.jar
 commons-codec/1.10//commons-codec-1.10.jar
 commons-collections/3.2.2//commons-collections-3.2.2.jar
-commons-compiler/3.0.15//commons-compiler-3.0.15.jar
+commons-compiler/3.0.16//commons-compiler-3.0.16.jar
 commons-compress/1.8.1//commons-compress-1.8.1.jar
 commons-configuration/1.6//commons-configuration-1.6.jar
 commons-crypto/1.0.0//commons-crypto-1.0.0.jar
@@ -118,7 +118,7 @@ jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
 jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar
 jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar
 jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar
-janino/3.0.15//janino-3.0.15.jar
+janino/3.0.16//janino-3.0.16.jar
 javassist/3.25.0-GA//javassist-3.25.0-GA.jar
 javax.inject/1//javax.inject-1.jar
 javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar
diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 
b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
index 90cbd02..3486ed1 100644
--- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -30,7 +30,7 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar
 commons-cli/1.2//commons-cli-1.2.jar
 commons-codec/1.10//commons-codec-1.10.jar
 commons-collections/3.2.2//commons-collections-3.2.2.jar
-commons-compiler/3.0.15//commons-compiler-3.0.15.jar
+commons-compiler/3.0.16//commons-compiler-3.0.16.jar
 commons-compress/1.8.1//commons-compress-1.8.1.jar
 commons-configuration2/2.1.1//commons-conf

[spark] branch branch-3.0 updated: Revert "[SPARK-31183][SQL][FOLLOWUP] Move rebase tests to `AvroSuite` and check the rebase flag out of function bodies"

2020-03-21 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new f0bfdc5  Revert "[SPARK-31183][SQL][FOLLOWUP] Move rebase tests to 
`AvroSuite` and check the rebase flag out of function bodies"
f0bfdc5 is described below

commit f0bfdc513a15884de8f3ffc79cc1845991082642
Author: Dongjoon Hyun 
AuthorDate: Sat Mar 21 20:52:04 2020 -0700

Revert "[SPARK-31183][SQL][FOLLOWUP] Move rebase tests to `AvroSuite` and 
check the rebase flag out of function bodies"

This reverts commit a6f3e3b096e2d7a39e0b2fdec6452e6d633baf7e.

Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/avro/AvroDeserializer.scala   |  21 ++--
 .../org/apache/spark/sql/avro/AvroSerializer.scala |  18 +--
 .../spark/sql/avro/AvroLogicalTypeSuite.scala  |  98 +++-
 .../org/apache/spark/sql/avro/AvroSuite.scala  | 124 +++--
 4 files changed, 130 insertions(+), 131 deletions(-)

diff --git 
a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala 
b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
index 3e8a7f9..b98f303 100644
--- 
a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
+++ 
b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
@@ -106,22 +106,21 @@ class AvroDeserializer(rootAvroType: Schema, 
rootCatalystType: DataType) {
   case (LONG, TimestampType) => avroType.getLogicalType match {
 // For backward compatibility, if the Avro type is Long and it is not 
logical type
 // (the `null` case), the value is processed as timestamp type with 
millisecond precision.
-case null | _: TimestampMillis if rebaseDateTime => (updater, ordinal, 
value) =>
-  val millis = value.asInstanceOf[Long]
-  val micros = DateTimeUtils.fromMillis(millis)
-  val rebasedMicros = 
DateTimeUtils.rebaseJulianToGregorianMicros(micros)
-  updater.setLong(ordinal, rebasedMicros)
 case null | _: TimestampMillis => (updater, ordinal, value) =>
   val millis = value.asInstanceOf[Long]
   val micros = DateTimeUtils.fromMillis(millis)
-  updater.setLong(ordinal, micros)
-case _: TimestampMicros if rebaseDateTime => (updater, ordinal, value) 
=>
-  val micros = value.asInstanceOf[Long]
-  val rebasedMicros = 
DateTimeUtils.rebaseJulianToGregorianMicros(micros)
-  updater.setLong(ordinal, rebasedMicros)
+  if (rebaseDateTime) {
+updater.setLong(ordinal, 
DateTimeUtils.rebaseJulianToGregorianMicros(micros))
+  } else {
+updater.setLong(ordinal, micros)
+  }
 case _: TimestampMicros => (updater, ordinal, value) =>
   val micros = value.asInstanceOf[Long]
-  updater.setLong(ordinal, micros)
+  if (rebaseDateTime) {
+updater.setLong(ordinal, 
DateTimeUtils.rebaseJulianToGregorianMicros(micros))
+  } else {
+updater.setLong(ordinal, micros)
+  }
 case other => throw new IncompatibleSchemaException(
   s"Cannot convert Avro logical type ${other} to Catalyst Timestamp 
type.")
   }
diff --git 
a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala 
b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
index 68df7c0..af9e3a5 100644
--- 
a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
+++ 
b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
@@ -149,15 +149,17 @@ class AvroSerializer(rootCatalystType: DataType, 
rootAvroType: Schema, nullable:
   case (TimestampType, LONG) => avroType.getLogicalType match {
   // For backward compatibility, if the Avro type is Long and it is 
not logical type
   // (the `null` case), output the timestamp value as with millisecond 
precision.
-  case null | _: TimestampMillis if rebaseDateTime => (getter, 
ordinal) =>
-val micros = getter.getLong(ordinal)
-val rebasedMicros = 
DateTimeUtils.rebaseGregorianToJulianMicros(micros)
-DateTimeUtils.fromMillis(rebasedMicros)
   case null | _: TimestampMillis => (getter, ordinal) =>
-DateTimeUtils.fromMillis(getter.getLong(ordinal))
-  case _: TimestampMicros if rebaseDateTime => (getter, ordinal) =>
-
DateTimeUtils.rebaseGregorianToJulianMicros(getter.getLong(ordinal))
-  case _: TimestampMicros => (getter, ordinal) => 
getter.getLong(ordinal)
+val micros = getter.getLong(ordinal)
+val rebasedMicros = if (rebaseDateTime) {
+  DateTimeUtils.rebaseGregori

[spark] branch master updated: [SPARK-30494][SQL] Fix cached data leakage during replacing an existing view

2020-03-22 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 929b794  [SPARK-30494][SQL] Fix cached data leakage during replacing 
an existing view
929b794 is described below

commit 929b794e25ff5454dadde7da304e6df25526d60e
Author: LantaoJin 
AuthorDate: Sun Mar 22 22:22:13 2020 -0700

[SPARK-30494][SQL] Fix cached data leakage during replacing an existing view

### What changes were proposed in this pull request?

The cached RDD for the plan "select 1" stays in memory forever until the
session closes. This cached data cannot be used since the view temp1 has been
replaced by another plan. It's a memory leak.

We can reproduce it with the commands below:
```
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.0-SNAPSHOT
      /_/

Using Scala version 2.12.10 (Java HotSpot(TM) 64-Bit Server VM, Java 
1.8.0_201)
Type in expressions to have them evaluated.
Type :help for more information.

scala> spark.sql("create or replace temporary view temp1 as select 1")
scala> spark.sql("cache table temp1")
scala> spark.sql("create or replace temporary view temp1 as select 1, 2")
scala> spark.sql("cache table temp1")
scala> assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 
1, 2")).isDefined)
scala> assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 
1")).isDefined)
```
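
For illustration only (not part of the patch; the view-side change lives in
`views.scala`, which only appears in the change summary below): a minimal,
self-contained sketch of the idea, namely releasing the stale cache entry before
the temporary view is replaced. It assumes a local SparkSession, and the
`uncacheQuietly` helper is hypothetical; it merely mirrors the
`CommandUtils.uncacheTableOrView` method shown in the diff.

```scala
import scala.util.control.NonFatal

import org.apache.spark.sql.SparkSession

object ReplaceViewSketch {
  // Hypothetical helper mirroring CommandUtils.uncacheTableOrView below:
  // drop the cache entry for the old plan and ignore non-fatal failures.
  def uncacheQuietly(spark: SparkSession, name: String): Unit = {
    try spark.catalog.uncacheTable(name) catch { case NonFatal(_) => }
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("SPARK-30494-sketch")
      .getOrCreate()

    spark.sql("create or replace temporary view temp1 as select 1")
    spark.sql("cache table temp1")
    // Release the stale "select 1" cache entry before the view definition changes.
    uncacheQuietly(spark, "temp1")
    spark.sql("create or replace temporary view temp1 as select 1, 2")
    spark.sql("cache table temp1")
    spark.stop()
  }
}
```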

### Why are the changes needed?
Fix the memory leak, especially for long-running mode.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Add a unit test.

Closes #27185 from LantaoJin/SPARK-30494.

Authored-by: LantaoJin 
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/execution/command/CommandUtils.scala |  8 +
 .../spark/sql/execution/command/tables.scala   | 13 ++-
 .../apache/spark/sql/execution/command/views.scala | 15 
 .../org/apache/spark/sql/CachedTableSuite.scala| 42 ++
 .../sql/hive/execution/InsertIntoHiveTable.scala   |  2 +-
 5 files changed, 68 insertions(+), 12 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index 81157ca..c047be7 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -386,4 +386,12 @@ object CommandUtils extends Logging {
   private def isDataPath(path: Path, stagingDir: String): Boolean = {
 !path.getName.startsWith(stagingDir) && DataSourceUtils.isDataPath(path)
   }
+
+  def uncacheTableOrView(sparkSession: SparkSession, name: String): Unit = {
+try {
+  sparkSession.catalog.uncacheTable(name)
+} catch {
+  case NonFatal(e) => logWarning("Exception when attempting to uncache 
$name", e)
+}
+  }
 }
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index d4de822..61955ba 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -196,11 +196,7 @@ case class AlterTableRenameCommand(
   // this can happen with Hive tables when the underlying catalog is 
in-memory.
   val wasCached = 
Try(sparkSession.catalog.isCached(oldName.unquotedString)).getOrElse(false)
   if (wasCached) {
-try {
-  sparkSession.catalog.uncacheTable(oldName.unquotedString)
-} catch {
-  case NonFatal(e) => log.warn(e.toString, e)
-}
+CommandUtils.uncacheTableOrView(sparkSession, oldName.unquotedString)
   }
   // Invalidate the table last, otherwise uncaching the table would load 
the logical plan
   // back into the hive metastore cache
@@ -230,12 +226,7 @@ case class AlterTableAddColumnsCommand(
 val catalog = sparkSession.sessionState.catalog
 val catalogTable = 
verifyAlterTableAddColumn(sparkSession.sessionState.conf, catalog, table)
 
-try {
-  sparkSession.catalog.uncacheTable(table.quotedString)
-} catch {
-  case NonFatal(e) =>
-log.warn(s"Exception when attempting to uncache table 
${table.quotedString}", e)
-}
+CommandUtils.uncacheTableOrView(sparkSession, table.quotedString)
 catalog.refreshTable(table)
 
 SchemaUtils.checkCol

[spark] branch branch-3.0 updated: [SPARK-30494][SQL] Fix cached data leakage during replacing an existing view

2020-03-22 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 8c09160  [SPARK-30494][SQL] Fix cached data leakage during replacing 
an existing view
8c09160 is described below

commit 8c09160cb8ec750826932988e1c87385a496ba22
Author: LantaoJin 
AuthorDate: Sun Mar 22 22:22:13 2020 -0700

[SPARK-30494][SQL] Fix cached data leakage during replacing an existing view

### What changes were proposed in this pull request?

The cached RDD for the plan "select 1" stays in memory forever until the
session closes. This cached data cannot be used since the view temp1 has been
replaced by another plan. It's a memory leak.

We can reproduce it with the commands below:
```
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.0-SNAPSHOT
      /_/

Using Scala version 2.12.10 (Java HotSpot(TM) 64-Bit Server VM, Java 
1.8.0_201)
Type in expressions to have them evaluated.
Type :help for more information.

scala> spark.sql("create or replace temporary view temp1 as select 1")
scala> spark.sql("cache table temp1")
scala> spark.sql("create or replace temporary view temp1 as select 1, 2")
scala> spark.sql("cache table temp1")
scala> assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 
1, 2")).isDefined)
scala> assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 
1")).isDefined)
```

### Why are the changes needed?
Fix the memory leak, especially for long-running mode.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Add a unit test.

Closes #27185 from LantaoJin/SPARK-30494.

Authored-by: LantaoJin 
Signed-off-by: Dongjoon Hyun 
    (cherry picked from commit 929b794e25ff5454dadde7da304e6df25526d60e)
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/execution/command/CommandUtils.scala |  8 +
 .../spark/sql/execution/command/tables.scala   | 13 ++-
 .../apache/spark/sql/execution/command/views.scala | 15 
 .../org/apache/spark/sql/CachedTableSuite.scala| 42 ++
 .../sql/hive/execution/InsertIntoHiveTable.scala   |  2 +-
 5 files changed, 68 insertions(+), 12 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index b229b23..7e456a6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -385,4 +385,12 @@ object CommandUtils extends Logging {
   private def isDataPath(path: Path, stagingDir: String): Boolean = {
 !path.getName.startsWith(stagingDir) && DataSourceUtils.isDataPath(path)
   }
+
+  def uncacheTableOrView(sparkSession: SparkSession, name: String): Unit = {
+try {
+  sparkSession.catalog.uncacheTable(name)
+} catch {
+  case NonFatal(e) => logWarning("Exception when attempting to uncache 
$name", e)
+}
+  }
 }
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index d4de822..61955ba 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -196,11 +196,7 @@ case class AlterTableRenameCommand(
   // this can happen with Hive tables when the underlying catalog is 
in-memory.
   val wasCached = 
Try(sparkSession.catalog.isCached(oldName.unquotedString)).getOrElse(false)
   if (wasCached) {
-try {
-  sparkSession.catalog.uncacheTable(oldName.unquotedString)
-} catch {
-  case NonFatal(e) => log.warn(e.toString, e)
-}
+CommandUtils.uncacheTableOrView(sparkSession, oldName.unquotedString)
   }
   // Invalidate the table last, otherwise uncaching the table would load 
the logical plan
   // back into the hive metastore cache
@@ -230,12 +226,7 @@ case class AlterTableAddColumnsCommand(
 val catalog = sparkSession.sessionState.catalog
 val catalogTable = 
verifyAlterTableAddColumn(sparkSession.sessionState.conf, catalog, table)
 
-try {
-  sparkSession.catalog.uncacheTable(table.quotedString)
-} catch {
-  case NonFatal(e) =>
-log.warn(s"Exception when attempting to uncache table 
${table.quotedString}", e)
-}
+CommandUt

[spark] branch branch-2.4 updated: Revert "[SPARK-31231][BUILD] Explicitly setuptools version as 46.0.0 in pip package test"

2020-03-24 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new e37f664  Revert "[SPARK-31231][BUILD] Explicitly setuptools version as 
46.0.0 in pip package test"
e37f664 is described below

commit e37f66420bbb73da1a1a4d3ad02b175eedbb5052
Author: Dongjoon Hyun 
AuthorDate: Tue Mar 24 11:55:57 2020 -0700

Revert "[SPARK-31231][BUILD] Explicitly setuptools version as 46.0.0 in pip 
package test"

This reverts commit 223b9fb1eadeba0e05b1a300512c31c4f99f41e8.

Signed-off-by: Dongjoon Hyun 
---
 dev/run-pip-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/run-pip-tests b/dev/run-pip-tests
index f9cd94d..60cf4d8 100755
--- a/dev/run-pip-tests
+++ b/dev/run-pip-tests
@@ -81,7 +81,7 @@ for python in "${PYTHON_EXECS[@]}"; do
 VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
 rm -rf "$VIRTUALENV_PATH"
 if [ -n "$USE_CONDA" ]; then
-  conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip 
setuptools=46.0.0
+  conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip 
setuptools
   source activate "$VIRTUALENV_PATH"
 else
   mkdir -p "$VIRTUALENV_PATH"


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-2.4 updated (e37f664 -> 4381ad5)

2020-03-24 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git.


from e37f664  Revert "[SPARK-31231][BUILD] Explicitly setuptools version as 
46.0.0 in pip package test"
 add 4381ad5  [SPARK-30494][SQL][2.4] Fix cached data leakage during 
replacing an existing view

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/execution/command/CommandUtils.scala |  8 +
 .../spark/sql/execution/command/tables.scala   | 13 ++-
 .../apache/spark/sql/execution/command/views.scala | 17 -
 .../org/apache/spark/sql/CachedTableSuite.scala| 42 ++
 .../sql/hive/execution/InsertIntoHiveTable.scala   |  2 +-
 5 files changed, 69 insertions(+), 13 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated (d278960 -> 4b97009)

2020-03-24 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.


from d278960  [SPARK-31081][UI][SQL] Make display of 
stageId/stageAttemptId/taskId of sql metrics toggleable
 add 4b97009  [SPARK-31101][BUILD][3.0] Upgrade Janino to 3.0.16

No new revisions were added by this update.

Summary of changes:
 dev/deps/spark-deps-hadoop-2.7-hive-1.2 | 4 ++--
 dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 4 ++--
 dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 4 ++--
 pom.xml | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31184][SQL] Support getTablesByType API of Hive Client

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new b3fd5c9  [SPARK-31184][SQL] Support getTablesByType API of Hive Client
b3fd5c9 is described below

commit b3fd5c90ae00807f3a49e2b9f1f0c8d026cc4219
Author: Eric Wu <492960...@qq.com>
AuthorDate: Sat Mar 21 17:41:23 2020 -0700

[SPARK-31184][SQL] Support getTablesByType API of Hive Client

### What changes were proposed in this pull request?
Hive 2.3+ supports the `getTablesByType` API, which provides an efficient way to
get Hive tables of a specific type. Now, we have the following mappings when
using `HiveExternalCatalog`.
```
CatalogTableType.EXTERNAL  =>  HiveTableType.EXTERNAL_TABLE
CatalogTableType.MANAGED => HiveTableType.MANAGED_TABLE
CatalogTableType.VIEW => HiveTableType.VIRTUAL_VIEW
```
Without this API, we need to achieve the goal by `getTables` + 
`getTablesByName` + `filter with type`.

This PR adds `getTablesByType` in `HiveShim`. For Hive versions that don't
support this API, `UnsupportedOperationException` will be thrown, and the upper
logic should catch the exception and fall back to the filter solution mentioned
above.
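
For illustration only (not part of the patch): a rough, standalone sketch of that
try-first-then-filter pattern. The trait and method names below are simplified
placeholders, not Spark's actual `HiveClient`/`HiveShim` interfaces; the real code
is in the `HiveClientImpl.listTablesByType` hunk further down.

```scala
// Simplified stand-ins for the Hive client calls used by the fallback (hypothetical).
trait SimpleHiveClient {
  def getTablesByType(db: String, pattern: String, tableType: String): Seq[String]
  def getTablesByPattern(db: String, pattern: String): Seq[String]
  def getTableType(db: String, table: String): String
}

object ListTablesByTypeSketch {
  // Try the Hive 2.3+ API first; on UnsupportedOperationException fall back to
  // listing by pattern and filtering the results by table type.
  def listTablesByType(
      client: SimpleHiveClient,
      db: String,
      pattern: String,
      tableType: String): Seq[String] = {
    try {
      client.getTablesByType(db, pattern, tableType)
    } catch {
      case _: UnsupportedOperationException =>
        client.getTablesByPattern(db, pattern)
          .filter(name => client.getTableType(db, name) == tableType)
    }
  }
}
```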

Since the JDK11-related fix in `Hive` is not released yet, manual tests against
Hive 2.3.7-SNAPSHOT were done by following the instructions of SPARK-29245.

### Why are the changes needed?
This API will provide better usability and performance when we want to get a
list of Hive tables of a specific type, for example `HiveTableType.VIRTUAL_VIEW`
corresponding to `CatalogTableType.VIEW`.

### Does this PR introduce any user-facing change?
No, this is a support function.

### How was this patch tested?
Add tests in VersionsSuite and manually run JDK11 test with following 
settings:

- Hive 2.3.6 Metastore on JDK8
- Hive 2.3.7-SNAPSHOT library build from source of Hive 2.3 branch
- Spark build with Hive 2.3.7-SNAPSHOT on jdk-11.0.6

Closes #27952 from Eric5553/GetTableByType.

Authored-by: Eric Wu <492960...@qq.com>
    Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 3a48ea1fe0fb85253f12d86caea01ffcb7e678d0)
    Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/sql/hive/client/HiveClient.scala  |  9 +
 .../spark/sql/hive/client/HiveClientImpl.scala | 42 --
 .../apache/spark/sql/hive/client/HiveShim.scala| 35 +-
 .../spark/sql/hive/client/VersionsSuite.scala  | 22 +---
 4 files changed, 92 insertions(+), 16 deletions(-)

diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
index e31dffa..3ea80ea 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala
@@ -61,6 +61,15 @@ private[hive] trait HiveClient {
   /** Returns the names of tables in the given database that matches the given 
pattern. */
   def listTables(dbName: String, pattern: String): Seq[String]
 
+  /**
+   * Returns the names of tables with specific tableType in the given database 
that matches
+   * the given pattern.
+   */
+  def listTablesByType(
+  dbName: String,
+  pattern: String,
+  tableType: CatalogTableType): Seq[String]
+
   /** Sets the name of current database. */
   def setCurrentDatabase(databaseName: String): Unit
 
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 4a3e813..6ad5e9d 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -755,6 +755,22 @@ private[hive] class HiveClientImpl(
 client.getTablesByPattern(dbName, pattern).asScala
   }
 
+  override def listTablesByType(
+  dbName: String,
+  pattern: String,
+  tableType: CatalogTableType): Seq[String] = withHiveState {
+try {
+  // Try with Hive API getTablesByType first, it's supported from Hive 
2.3+.
+  shim.getTablesByType(client, dbName, pattern, toHiveTableType(tableType))
+} catch {
+  case _: UnsupportedOperationException =>
+// Fallback to filter logic if getTablesByType not supported.
+val tableNames = client.getTablesByPattern(dbName, pattern).asScala
+val tables = getTablesByName(dbName, tableNames).filter(_.tableType == 
tableType)
+tables.map(_.identifier.table)
+}
+  }
+
   /**
* Runs the specified SQL query using Hive.
*/
@@ -1011,25 +1027,29 @@ private[hive] object HiveClientImp

[spark] branch master updated: [SPARK-25121][SQL][FOLLOWUP] Add more unit tests for multi-part identifiers in join strategy hints

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new da49f50  [SPARK-25121][SQL][FOLLOWUP] Add more unit tests for 
multi-part identifiers in join strategy hints
da49f50 is described below

commit da49f50621b201822a64358624c19be3fae0a855
Author: Takeshi Yamamuro 
AuthorDate: Wed Mar 25 08:37:28 2020 -0700

[SPARK-25121][SQL][FOLLOWUP] Add more unit tests for multi-part identifiers 
in join strategy hints

### What changes were proposed in this pull request?

This PR intends to add unit tests for the other join hints (`MERGEJOIN`,
`SHUFFLE_HASH`, and `SHUFFLE_REPLICATE_NL`). This is a follow-up PR of #27935.
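
For context only (not part of this patch): a small usage sketch of the join strategy
hints that the new tests cover, through both the SQL hint comment syntax and
`Dataset.hint`, assuming a local SparkSession and two hypothetical temp views.

```scala
import org.apache.spark.sql.SparkSession

object JoinHintSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("join-hint-sketch")
      .getOrCreate()
    import spark.implicits._

    Seq((1, "a"), (2, "b")).toDF("id", "v1").createOrReplaceTempView("t1")
    Seq((1, "x"), (2, "y")).toDF("id", "v2").createOrReplaceTempView("t2")

    // SQL hint syntax: request a shuffle-merge join, hinting relation t1.
    spark.sql("SELECT /*+ MERGEJOIN(t1) */ * FROM t1 JOIN t2 ON t1.id = t2.id").explain()

    // Dataset API: request a shuffle hash join, hinting the right-hand relation.
    spark.table("t1").join(spark.table("t2").hint("SHUFFLE_HASH"), "id").explain()

    spark.stop()
  }
}
```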

### Why are the changes needed?

For better test coverage.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added unit tests.

Closes #28013 from maropu/SPARK-25121-FOLLOWUP.

Authored-by: Takeshi Yamamuro 
Signed-off-by: Dongjoon Hyun 
---
 .../sql/catalyst/analysis/ResolveHintsSuite.scala  | 93 --
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveHintsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveHintsSuite.scala
index ca7d284..d3bd5d0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveHintsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveHintsSuite.scala
@@ -242,51 +242,56 @@ class ResolveHintsSuite extends AnalysisTest {
   caseSensitive = false)
   }
 
-  test("Supports multi-part table names for broadcast hint resolution") {
-// local temp table (single-part identifier case)
-checkAnalysis(
-  UnresolvedHint("MAPJOIN", Seq("table", "table2"),
-table("TaBlE").join(table("TaBlE2"))),
-  Join(
-ResolvedHint(testRelation, HintInfo(strategy = Some(BROADCAST))),
-ResolvedHint(testRelation2, HintInfo(strategy = Some(BROADCAST))),
-Inner,
-None,
-JoinHint.NONE),
-  caseSensitive = false)
-
-checkAnalysis(
-  UnresolvedHint("MAPJOIN", Seq("TaBlE", "table2"),
-table("TaBlE").join(table("TaBlE2"))),
-  Join(
-ResolvedHint(testRelation, HintInfo(strategy = Some(BROADCAST))),
-testRelation2,
-Inner,
-None,
-JoinHint.NONE),
-  caseSensitive = true)
+  test("Supports multi-part table names for join strategy hint resolution") {
+Seq(("MAPJOIN", BROADCAST),
+("MERGEJOIN", SHUFFLE_MERGE),
+("SHUFFLE_HASH", SHUFFLE_HASH),
+("SHUFFLE_REPLICATE_NL", SHUFFLE_REPLICATE_NL)).foreach { case 
(hintName, st) =>
+  // local temp table (single-part identifier case)
+  checkAnalysis(
+UnresolvedHint(hintName, Seq("table", "table2"),
+  table("TaBlE").join(table("TaBlE2"))),
+Join(
+  ResolvedHint(testRelation, HintInfo(strategy = Some(st))),
+  ResolvedHint(testRelation2, HintInfo(strategy = Some(st))),
+  Inner,
+  None,
+  JoinHint.NONE),
+caseSensitive = false)
 
-// global temp table (multi-part identifier case)
-checkAnalysis(
-  UnresolvedHint("MAPJOIN", Seq("GlOBal_TeMP.table4", "table5"),
-table("global_temp", "table4").join(table("global_temp", "table5"))),
-  Join(
-ResolvedHint(testRelation4, HintInfo(strategy = Some(BROADCAST))),
-ResolvedHint(testRelation5, HintInfo(strategy = Some(BROADCAST))),
-Inner,
-None,
-JoinHint.NONE),
-  caseSensitive = false)
+  checkAnalysis(
+UnresolvedHint(hintName, Seq("TaBlE", "table2"),
+  table("TaBlE").join(table("TaBlE2"))),
+Join(
+  ResolvedHint(testRelation, HintInfo(strategy = Some(st))),
+  testRelation2,
+  Inner,
+  None,
+  JoinHint.NONE),
+caseSensitive = true)
+
+  // global temp table (multi-part identifier case)
+  checkAnalysis(
+UnresolvedHint(hintName, Seq("GlOBal_TeMP.table4", "table5"),
+  table("global_temp", "table4").join(table("global_temp", "table5"))),
+Join(
+  ResolvedHint(testRelation4, HintInfo(strategy = Some(st))),
+  ResolvedHint(testRelation5, HintInfo(strategy = Some(st))),
+  Inner,
+

[spark] branch master updated (27d53de -> da49f50)

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 27d53de  [SPARK-31232][SQL][DOCS] Specify formats of 
`spark.sql.session.timeZone`
 add da49f50  [SPARK-25121][SQL][FOLLOWUP] Add more unit tests for 
multi-part identifiers in join strategy hints

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/ResolveHintsSuite.scala  | 93 --
 1 file changed, 49 insertions(+), 44 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated (bd99994 -> e6c85b3)

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.


from bd99994  [SPARK-31232][SQL][DOCS] Specify formats of 
`spark.sql.session.timeZone`
 add e6c85b3  [SPARK-25121][SQL][FOLLOWUP] Add more unit tests for 
multi-part identifiers in join strategy hints

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/ResolveHintsSuite.scala  | 93 --
 1 file changed, 49 insertions(+), 44 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (da49f50 -> 4f274a4)

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from da49f50  [SPARK-25121][SQL][FOLLOWUP] Add more unit tests for 
multi-part identifiers in join strategy hints
 add 4f274a4  [SPARK-31147][SQL] Forbid CHAR type in non-Hive-Serde tables

No new revisions were added by this update.

Summary of changes:
 docs/sql-migration-guide.md| 18 +++---
 .../sql/catalyst/analysis/ResolveCatalogs.scala|  5 ++
 .../spark/sql/catalyst/parser/AstBuilder.scala |  4 ++
 .../spark/sql/catalyst/parser/ParseDriver.scala|  5 ++
 .../sql/catalyst/parser/ParserInterface.scala  |  6 ++
 .../sql/connector/catalog/CatalogV2Util.scala  | 19 +-
 .../apache/spark/sql/types/HiveStringType.scala|  7 +++
 .../catalyst/analysis/ResolveSessionCatalog.scala  | 15 +
 .../spark/sql/SparkSessionExtensionSuite.scala |  3 +
 .../execution/command/PlanResolutionSuite.scala| 70 ++
 .../spark/sql/hive/HiveMetastoreCatalogSuite.scala |  2 +-
 11 files changed, 134 insertions(+), 20 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S DepsTestsSuite

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 53221cd  [SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S 
DepsTestsSuite
53221cd is described below

commit 53221cda408e9be5d0d2ff5946c200cb43647dd9
Author: Dongjoon Hyun 
AuthorDate: Wed Mar 25 12:38:15 2020 -0700

[SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S DepsTestsSuite

### What changes were proposed in this pull request?

This PR (SPARK-31244) replaces `Ceph` with `Minio` in K8S `DepsTestSuite`.

### Why are the changes needed?

Currently, `DepsTestsSuite` is using `ceph` for S3 storage. However, the used
version and all new releases are broken on new `minikube` releases. We had better
use a more robust and smaller one.

```
$ minikube version
minikube version: v1.8.2

$ minikube -p minikube docker-env | source

$ docker run -it --rm -e NETWORK_AUTO_DETECT=4 -e RGW_FRONTEND_PORT=8000 -e 
SREE_PORT=5001 -e CEPH_DEMO_UID=nano -e CEPH_DAEMON=demo 
ceph/daemon:v4.0.3-stable-4.0-nautilus-centos-7-x86_64 /bin/sh
2020-03-25 04:26:21  /opt/ceph-container/bin/entrypoint.sh: ERROR- it looks 
like we have not been able to discover the network settings

$ docker run -it --rm -e NETWORK_AUTO_DETECT=4 -e RGW_FRONTEND_PORT=8000 -e 
SREE_PORT=5001 -e CEPH_DEMO_UID=nano -e CEPH_DAEMON=demo 
ceph/daemon:v4.0.11-stable-4.0-nautilus-centos-7 /bin/sh
2020-03-25 04:20:30  /opt/ceph-container/bin/entrypoint.sh: ERROR- it looks 
like we have not been able to discover the network settings
```

Also, the image size is unnecessarily big (almost `1GB`) and growing while 
`minio` is `55.8MB` with the same features.
```
$ docker images | grep ceph
ceph/daemon v4.0.3-stable-4.0-nautilus-centos-7-x86_64 a6a05ccdf924 6 
months ago 852MB
ceph/daemon v4.0.11-stable-4.0-nautilus-centos-7   87f695550d8e 12 
hours ago 901MB

$ docker images | grep minio
minio/minio latest 95c226551ea6 5 days 
ago   55.8MB
```

### Does this PR introduce any user-facing change?

No. (This is a test case change)

### How was this patch tested?

Pass the existing Jenkins K8s integration test job and test with the latest 
minikube.
```
$ minikube version
minikube version: v1.8.2

$ kubectl version --short
Client Version: v1.17.4
Server Version: v1.17.4

$ NO_MANUAL=1 ./dev/make-distribution.sh --r --pip --tgz -Pkubernetes
$ 
resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh 
--spark-tgz $PWD/spark-*.tgz
...
KubernetesSuite:
- Run SparkPi with no resources
- Run SparkPi with a very long application name.
- Use SparkLauncher.NO_RESOURCE
- Run SparkPi with a master URL without a scheme.
- Run SparkPi with an argument.
- Run SparkPi with custom labels, annotations, and environment variables.
- All pods have the same service account by default
- Run extraJVMOptions check on driver
- Run SparkRemoteFileTest using a remote data file
- Run SparkPi with env and mount secrets.
- Run PySpark on simple pi.py example
- Run PySpark with Python2 to test a pyfiles example
- Run PySpark with Python3 to test a pyfiles example
- Run PySpark with memory customization
- Run in client mode.
- Start pod creation from template
- PVs with local storage *** FAILED *** // This is irrelevant to this PR.
- Launcher client dependencies  // This is the fixed test case by 
this PR.
- Test basic decommissioning
- Run SparkR on simple dataframe.R example
Run completed in 12 minutes, 4 seconds.
...
```

The following is the working snapshot of `DepsTestSuite` test.
```
$ kubectl get all -ncf9438dd8a65436686b1196a6b73000f
NAME  READY   STATUS
RESTARTS   AGE
pod/minio-0   1/1 Running   0   
   70s
pod/spark-test-app-8494bddca3754390b9e59a2ef47584eb   1/1 Running   0   
   55s

NAME TYPECLUSTER-IP 
 EXTERNAL-IP   PORT(S)  AGE
service/minio-s3 NodePort
10.109.54.180   9000:30678/TCP   70s
service/spark-test-app-fd916b711061c7b8-driver-svc   ClusterIP   None   
 7078/TCP,7079/TCP,4040/TCP   55s

NAME READY   AGE
statefulset.apps/minio   1/1 70s
```

Closes #28015 from dongjoon-hyun/SPARK-31244.

Authored-by: Dongjoon Hyun 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit

[spark] branch master updated: [SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S DepsTestsSuite

2020-03-25 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new f206bbd  [SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S 
DepsTestsSuite
f206bbd is described below

commit f206bbde3a8f64650236013d61680faba492d7a4
Author: Dongjoon Hyun 
AuthorDate: Wed Mar 25 12:38:15 2020 -0700

[SPARK-31244][K8S][TEST] Use Minio instead of Ceph in K8S DepsTestsSuite

### What changes were proposed in this pull request?

This PR (SPARK-31244) replaces `Ceph` with `Minio` in K8S `DepsTestSuite`.

### Why are the changes needed?

Currently, `DepsTestsSuite` is using `ceph` for S3 storage. However, the used
version and all new releases are broken on new `minikube` releases. We had better
use a more robust and smaller one.

```
$ minikube version
minikube version: v1.8.2

$ minikube -p minikube docker-env | source

$ docker run -it --rm -e NETWORK_AUTO_DETECT=4 -e RGW_FRONTEND_PORT=8000 -e 
SREE_PORT=5001 -e CEPH_DEMO_UID=nano -e CEPH_DAEMON=demo 
ceph/daemon:v4.0.3-stable-4.0-nautilus-centos-7-x86_64 /bin/sh
2020-03-25 04:26:21  /opt/ceph-container/bin/entrypoint.sh: ERROR- it looks 
like we have not been able to discover the network settings

$ docker run -it --rm -e NETWORK_AUTO_DETECT=4 -e RGW_FRONTEND_PORT=8000 -e 
SREE_PORT=5001 -e CEPH_DEMO_UID=nano -e CEPH_DAEMON=demo 
ceph/daemon:v4.0.11-stable-4.0-nautilus-centos-7 /bin/sh
2020-03-25 04:20:30  /opt/ceph-container/bin/entrypoint.sh: ERROR- it looks 
like we have not been able to discover the network settings
```

Also, the image size is unnecessarily big (almost `1GB`) and growing while 
`minio` is `55.8MB` with the same features.
```
$ docker images | grep ceph
ceph/daemon v4.0.3-stable-4.0-nautilus-centos-7-x86_64 a6a05ccdf924 6 
months ago 852MB
ceph/daemon v4.0.11-stable-4.0-nautilus-centos-7   87f695550d8e 12 
hours ago 901MB

$ docker images | grep minio
minio/minio latest 95c226551ea6 5 days 
ago   55.8MB
```

### Does this PR introduce any user-facing change?

No. (This is a test case change)

### How was this patch tested?

Pass the existing Jenkins K8s integration test job and test with the latest 
minikube.
```
$ minikube version
minikube version: v1.8.2

$ kubectl version --short
Client Version: v1.17.4
Server Version: v1.17.4

$ NO_MANUAL=1 ./dev/make-distribution.sh --r --pip --tgz -Pkubernetes
$ 
resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh 
--spark-tgz $PWD/spark-*.tgz
...
KubernetesSuite:
- Run SparkPi with no resources
- Run SparkPi with a very long application name.
- Use SparkLauncher.NO_RESOURCE
- Run SparkPi with a master URL without a scheme.
- Run SparkPi with an argument.
- Run SparkPi with custom labels, annotations, and environment variables.
- All pods have the same service account by default
- Run extraJVMOptions check on driver
- Run SparkRemoteFileTest using a remote data file
- Run SparkPi with env and mount secrets.
- Run PySpark on simple pi.py example
- Run PySpark with Python2 to test a pyfiles example
- Run PySpark with Python3 to test a pyfiles example
- Run PySpark with memory customization
- Run in client mode.
- Start pod creation from template
- PVs with local storage *** FAILED *** // This is irrelevant to this PR.
- Launcher client dependencies  // This is the fixed test case by 
this PR.
- Test basic decommissioning
- Run SparkR on simple dataframe.R example
Run completed in 12 minutes, 4 seconds.
...
```

The following is the working snapshot of `DepsTestSuite` test.
```
$ kubectl get all -ncf9438dd8a65436686b1196a6b73000f
NAME  READY   STATUS
RESTARTS   AGE
pod/minio-0   1/1 Running   0   
   70s
pod/spark-test-app-8494bddca3754390b9e59a2ef47584eb   1/1 Running   0   
   55s

NAME TYPECLUSTER-IP 
 EXTERNAL-IP   PORT(S)  AGE
service/minio-s3 NodePort
10.109.54.180   9000:30678/TCP   70s
service/spark-test-app-fd916b711061c7b8-driver-svc   ClusterIP   None   
 7078/TCP,7079/TCP,4040/TCP   55s

NAME READY   AGE
statefulset.apps/minio   1/1 70s
```

Closes #28015 from dongjoon-hyun/SPARK-31244.

Authored-by: Dongjoon Hyun 
Signed-off-by: Dongjoon Hyun 
---
 .../k8s/integrationtest

[spark] branch branch-3.0 updated: [SPARK-31259][CORE] Fix log message about fetch request size in ShuffleBlockFetcherIterator

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 8f93dc2  [SPARK-31259][CORE] Fix log message about fetch request size 
in ShuffleBlockFetcherIterator
8f93dc2 is described below

commit 8f93dc2f1dd8bd09d52fd3dc07a4c10e70bd237c
Author: yi.wu 
AuthorDate: Thu Mar 26 09:11:13 2020 -0700

[SPARK-31259][CORE] Fix log message about fetch request size in 
ShuffleBlockFetcherIterator

### What changes were proposed in this pull request?

Fix the incorrect log message about `curRequestSize`.

### Why are the changes needed?

In batch mode, `curRequestSize` can be the total size of several block 
groups. And each group should have its own request size instead of using the 
total size.
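
To make the point concrete, a toy sketch (not Spark code, just the arithmetic of the
fix): when one address's blocks are split into several fetch requests, each request
should log the size of its own blocks rather than the running total for the address.

```scala
object FetchRequestSizeSketch {
  final case class Block(id: String, size: Long)

  // Per-request sizes: group the blocks as the iterator would, then sum each group.
  def perRequestSizes(blocks: Seq[Block], maxBlocksPerRequest: Int): Seq[Long] =
    blocks.grouped(maxBlocksPerRequest).map(_.map(_.size).sum).toList

  def main(args: Array[String]): Unit = {
    val blocks = Seq(Block("b0", 10), Block("b1", 20), Block("b2", 30), Block("b3", 40))
    // Two requests of two blocks each: log 30 and 70, not the address total of 100.
    println(perRequestSizes(blocks, maxBlocksPerRequest = 2))  // List(30, 70)
  }
}
```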

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

It only affects logging.

Closes #28028 from Ngone51/fix_curRequestSize.

Authored-by: yi.wu 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 33f532a9f201fb9c7895d685b3dce82cf042dc61)
Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/storage/ShuffleBlockFetcherIterator.scala | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
 
b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index f1a7d88..404e055 100644
--- 
a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ 
b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -329,9 +329,8 @@ final class ShuffleBlockFetcherIterator(
 
   private def createFetchRequest(
   blocks: Seq[FetchBlockInfo],
-  address: BlockManagerId,
-  curRequestSize: Long): FetchRequest = {
-logDebug(s"Creating fetch request of $curRequestSize at $address "
+  address: BlockManagerId): FetchRequest = {
+logDebug(s"Creating fetch request of ${blocks.map(_.size).sum} at $address 
"
   + s"with ${blocks.size} blocks")
 FetchRequest(address, blocks)
   }
@@ -339,17 +338,16 @@ final class ShuffleBlockFetcherIterator(
   private def createFetchRequests(
   curBlocks: Seq[FetchBlockInfo],
   address: BlockManagerId,
-  curRequestSize: Long,
   isLast: Boolean,
   collectedRemoteRequests: ArrayBuffer[FetchRequest]): Seq[FetchBlockInfo] 
= {
 val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks)
 var retBlocks = Seq.empty[FetchBlockInfo]
 if (mergedBlocks.length <= maxBlocksInFlightPerAddress) {
-  collectedRemoteRequests += createFetchRequest(mergedBlocks, address, 
curRequestSize)
+  collectedRemoteRequests += createFetchRequest(mergedBlocks, address)
 } else {
   mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks =>
 if (blocks.length == maxBlocksInFlightPerAddress || isLast) {
-  collectedRemoteRequests += createFetchRequest(blocks, address, 
curRequestSize)
+  collectedRemoteRequests += createFetchRequest(blocks, address)
 } else {
   // The last group does not exceed `maxBlocksInFlightPerAddress`. Put 
it back
   // to `curBlocks`.
@@ -377,14 +375,14 @@ final class ShuffleBlockFetcherIterator(
   // For batch fetch, the actual block in flight should count for merged 
block.
   val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= 
maxBlocksInFlightPerAddress
   if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) {
-curBlocks = createFetchRequests(curBlocks, address, curRequestSize, 
isLast = false,
+curBlocks = createFetchRequests(curBlocks, address, isLast = false,
   collectedRemoteRequests).to[ArrayBuffer]
 curRequestSize = curBlocks.map(_.size).sum
   }
 }
 // Add in the final request
 if (curBlocks.nonEmpty) {
-  curBlocks = createFetchRequests(curBlocks, address, curRequestSize, 
isLast = true,
+  curBlocks = createFetchRequests(curBlocks, address, isLast = true,
 collectedRemoteRequests).to[ArrayBuffer]
   curRequestSize = curBlocks.map(_.size).sum
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31259][CORE] Fix log message about fetch request size in ShuffleBlockFetcherIterator

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 33f532a  [SPARK-31259][CORE] Fix log message about fetch request size 
in ShuffleBlockFetcherIterator
33f532a is described below

commit 33f532a9f201fb9c7895d685b3dce82cf042dc61
Author: yi.wu 
AuthorDate: Thu Mar 26 09:11:13 2020 -0700

[SPARK-31259][CORE] Fix log message about fetch request size in 
ShuffleBlockFetcherIterator

### What changes were proposed in this pull request?

Fix the incorrect log message about `curRequestSize`.

### Why are the changes needed?

In batch mode, `curRequestSize` can be the total size of several block 
groups. And each group should have its own request size instead of using the 
total size.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

It only affects logging.

Closes #28028 from Ngone51/fix_curRequestSize.

Authored-by: yi.wu 
Signed-off-by: Dongjoon Hyun 
---
 .../apache/spark/storage/ShuffleBlockFetcherIterator.scala | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
 
b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index f1a7d88..404e055 100644
--- 
a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ 
b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -329,9 +329,8 @@ final class ShuffleBlockFetcherIterator(
 
   private def createFetchRequest(
   blocks: Seq[FetchBlockInfo],
-  address: BlockManagerId,
-  curRequestSize: Long): FetchRequest = {
-logDebug(s"Creating fetch request of $curRequestSize at $address "
+  address: BlockManagerId): FetchRequest = {
+logDebug(s"Creating fetch request of ${blocks.map(_.size).sum} at $address 
"
   + s"with ${blocks.size} blocks")
 FetchRequest(address, blocks)
   }
@@ -339,17 +338,16 @@ final class ShuffleBlockFetcherIterator(
   private def createFetchRequests(
   curBlocks: Seq[FetchBlockInfo],
   address: BlockManagerId,
-  curRequestSize: Long,
   isLast: Boolean,
   collectedRemoteRequests: ArrayBuffer[FetchRequest]): Seq[FetchBlockInfo] 
= {
 val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks)
 var retBlocks = Seq.empty[FetchBlockInfo]
 if (mergedBlocks.length <= maxBlocksInFlightPerAddress) {
-  collectedRemoteRequests += createFetchRequest(mergedBlocks, address, 
curRequestSize)
+  collectedRemoteRequests += createFetchRequest(mergedBlocks, address)
 } else {
   mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks =>
 if (blocks.length == maxBlocksInFlightPerAddress || isLast) {
-  collectedRemoteRequests += createFetchRequest(blocks, address, 
curRequestSize)
+  collectedRemoteRequests += createFetchRequest(blocks, address)
 } else {
   // The last group does not exceed `maxBlocksInFlightPerAddress`. Put 
it back
   // to `curBlocks`.
@@ -377,14 +375,14 @@ final class ShuffleBlockFetcherIterator(
   // For batch fetch, the actual block in flight should count for merged 
block.
   val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= 
maxBlocksInFlightPerAddress
   if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) {
-curBlocks = createFetchRequests(curBlocks, address, curRequestSize, 
isLast = false,
+curBlocks = createFetchRequests(curBlocks, address, isLast = false,
   collectedRemoteRequests).to[ArrayBuffer]
 curRequestSize = curBlocks.map(_.size).sum
   }
 }
 // Add in the final request
 if (curBlocks.nonEmpty) {
-  curBlocks = createFetchRequests(curBlocks, address, curRequestSize, 
isLast = true,
+  curBlocks = createFetchRequests(curBlocks, address, isLast = true,
 collectedRemoteRequests).to[ArrayBuffer]
   curRequestSize = curBlocks.map(_.size).sum
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31238][SQL] Rebase dates to/from Julian calendar in write/read for ORC datasource

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d72ec85  [SPARK-31238][SQL] Rebase dates to/from Julian calendar in 
write/read for ORC datasource
d72ec85 is described below

commit d72ec8574113f9a7e87f3d7ec56c8447267b0506
Author: Maxim Gekk 
AuthorDate: Thu Mar 26 13:14:28 2020 -0700

[SPARK-31238][SQL] Rebase dates to/from Julian calendar in write/read for 
ORC datasource

### What changes were proposed in this pull request?

This PR (SPARK-31238) aims at the following (a standalone sketch of the
day-rebasing idea from item 1 appears right after this list).
1. Modified ORC Vectorized Reader, in particular, OrcColumnVector v1.2 and v2.3.
After the changes, it uses `DateTimeUtils.rebaseJulianToGregorianDays()` 
added by https://github.com/apache/spark/pull/27915 . The method performs 
rebasing days from the hybrid calendar (Julian + Gregorian) to Proleptic 
Gregorian calendar. It builds a local date in the original calendar, extracts 
date fields `year`, `month` and `day` from the local date, and builds another 
local date in the target calend [...]
2. Introduced rebasing dates while saving ORC files, in particular, I modified
`OrcShimUtils.getDateWritable` v1.2 and v2.3, and returned 
`DaysWritable` instead of Hive's `DateWritable`. The `DaysWritable` class was 
added by the PR https://github.com/apache/spark/pull/27890 (and fixed by 
https://github.com/apache/spark/pull/27962). I moved `DaysWritable` from 
`sql/hive` to `sql/core` to re-use it in ORC datasource.
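
A simplified, standalone sketch of the field-reinterpretation idea from item 1. This
is not Spark's `DateTimeUtils` implementation; it assumes UTC and AD dates only, and
uses `java.util.GregorianCalendar` (whose default cut-over at 1582-10-15 models the
hybrid Julian + Gregorian calendar) together with `java.time.LocalDate` (proleptic
Gregorian).

```scala
import java.time.LocalDate
import java.util.{Calendar, GregorianCalendar, TimeZone}

object RebaseDaysSketch {
  private val MillisPerDay = 24L * 60 * 60 * 1000

  // Rebase a day count (days since 1970-01-01) from the hybrid calendar to the
  // Proleptic Gregorian calendar: read the year/month/day that the hybrid calendar
  // assigns to that day, then rebuild the date from the same fields in the
  // proleptic Gregorian calendar.
  def rebaseJulianToGregorianDays(days: Int): Int = {
    val hybrid = new GregorianCalendar(TimeZone.getTimeZone("UTC"))
    hybrid.clear()
    hybrid.setTimeInMillis(days * MillisPerDay)
    val year  = hybrid.get(Calendar.YEAR)
    val month = hybrid.get(Calendar.MONTH) + 1  // java.util.Calendar months are 0-based
    val day   = hybrid.get(Calendar.DAY_OF_MONTH)
    LocalDate.of(year, month, day).toEpochDay.toInt
  }

  def main(args: Array[String]): Unit = {
    // Day count that the hybrid calendar labels 1200-01-01, built with the same
    // hybrid calendar so the example is self-consistent.
    val cal = new GregorianCalendar(TimeZone.getTimeZone("UTC"))
    cal.clear()
    cal.set(1200, Calendar.JANUARY, 1)
    val hybridDays = (cal.getTimeInMillis / MillisPerDay).toInt
    val rebased = rebaseJulianToGregorianDays(hybridDays)
    println(LocalDate.ofEpochDay(rebased))  // 1200-01-01, cf. the user-facing example below
  }
}
```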

### Why are the changes needed?
For backward compatibility with Spark 2.4 and earlier versions: the changes allow
users to read dates/timestamps saved by previous versions and get the same result.

### Does this PR introduce any user-facing change?
Yes. Before the changes, loading the date `1200-01-01` saved by Spark 2.4.5 
returns the following:
```scala
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-08|
+--+
```
After the changes
```scala
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-01|
+--+
```

### How was this patch tested?
- By running `OrcSourceSuite` and `HiveOrcSourceSuite`.
- Add new test `SPARK-31238: compatibility with Spark 2.4 in reading dates` 
to `OrcSuite` which reads an ORC file saved by Spark 2.4.5 via the commands:
```shell
$ export TZ="America/Los_Angeles"
```
```scala
scala> sql("select cast('1200-01-01' as date) 
dt").write.mode("overwrite").orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc")
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-01|
+--+
```
- Add a round-trip test `SPARK-31238: rebasing dates in write`. The test
`SPARK-31238: compatibility with Spark 2.4 in reading dates` confirms rebasing
in read, so we can check rebasing in write.

Closes #28016 from MaxGekk/rebase-date-orc.

Authored-by: Maxim Gekk 
Signed-off-by: Dongjoon Hyun 
---
 .../sql/execution/datasources}/DaysWritable.scala  |  10 ++--
 .../test-data/before_1582_date_v2_4.snappy.orc | Bin 0 -> 201 bytes
 .../execution/datasources/orc/OrcSourceSuite.scala |  28 -
 .../sql/execution/datasources/orc/OrcTest.scala|   5 
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc}/DaysWritable.scala  |  17 ++---
 .../execution/datasources/orc/OrcShimUtils.scala   |   4 +--
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc/OrcShimUtils.scala   |   5 ++--
 .../org/apache/spark/sql/hive/HiveInspectors.scala |   1 +
 10 files changed, 88 insertions(+), 12 deletions(-)

diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
similarity index 92%
copy from sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
copy to 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
index 1eec8d7..00b710f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.hive
+package org.apache.spark.sql.execution.datasources
 
 import java.io.{DataInput, DataOutput, IOException

[spark] branch branch-3.0 updated: [SPARK-31238][SQL] Rebase dates to/from Julian calendar in write/read for ORC datasource

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 78cc2ef  [SPARK-31238][SQL] Rebase dates to/from Julian calendar in 
write/read for ORC datasource
78cc2ef is described below

commit 78cc2ef5b663d6d605e3d4febc6fb99e20b7f165
Author: Maxim Gekk 
AuthorDate: Thu Mar 26 13:14:28 2020 -0700

[SPARK-31238][SQL] Rebase dates to/from Julian calendar in write/read for 
ORC datasource

### What changes were proposed in this pull request?

This PR (SPARK-31238) aims at the following changes.
1. Modified the ORC vectorized reader, in particular, OrcColumnVector v1.2 and
v2.3. After the changes, it uses `DateTimeUtils.rebaseJulianToGregorianDays()`
added by https://github.com/apache/spark/pull/27915 . The method rebases days
from the hybrid calendar (Julian + Gregorian) to the Proleptic Gregorian
calendar. It builds a local date in the original calendar, extracts the date
fields `year`, `month` and `day` from that local date, and builds another
local date in the target calend [...]
2. Introduced rebasing of dates while saving ORC files; in particular, I
modified `OrcShimUtils.getDateWritable` v1.2 and v2.3 to return `DaysWritable`
instead of Hive's `DateWritable`. The `DaysWritable` class was added by the PR
https://github.com/apache/spark/pull/27890 (and fixed by
https://github.com/apache/spark/pull/27962). I moved `DaysWritable` from
`sql/hive` to `sql/core` to re-use it in the ORC datasource.

### Why are the changes needed?
For the backward compatibility with Spark 2.4 and earlier versions. The 
changes allow users to read dates/timestamps saved by previous version, and get 
the same result.

### Does this PR introduce any user-facing change?
Yes. Before the changes, loading the date `1200-01-01` saved by Spark 2.4.5 
returns the following:
```scala
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-08|
+--+
```
After the changes
```scala
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-01|
+--+
```

### How was this patch tested?
- By running `OrcSourceSuite` and `HiveOrcSourceSuite`.
- Add new test `SPARK-31238: compatibility with Spark 2.4 in reading dates` 
to `OrcSuite` which reads an ORC file saved by Spark 2.4.5 via the commands:
```shell
$ export TZ="America/Los_Angeles"
```
```scala
scala> sql("select cast('1200-01-01' as date) 
dt").write.mode("overwrite").orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc")
scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_date_orc").show(false)
+--+
|dt|
+--+
|1200-01-01|
+--+
```
- Add round trip test `SPARK-31238: rebasing dates in write`. The test 
`SPARK-31238: compatibility with Spark 2.4 in reading dates` confirms rebasing 
in read. So, we can check rebasing in write.

Closes #28016 from MaxGekk/rebase-date-orc.

Authored-by: Maxim Gekk 
Signed-off-by: Dongjoon Hyun 
    (cherry picked from commit d72ec8574113f9a7e87f3d7ec56c8447267b0506)
Signed-off-by: Dongjoon Hyun 
---
 .../sql/execution/datasources}/DaysWritable.scala  |  10 ++--
 .../test-data/before_1582_date_v2_4.snappy.orc | Bin 0 -> 201 bytes
 .../execution/datasources/orc/OrcSourceSuite.scala |  28 -
 .../sql/execution/datasources/orc/OrcTest.scala|   5 
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc}/DaysWritable.scala  |  17 ++---
 .../execution/datasources/orc/OrcShimUtils.scala   |   4 +--
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc/OrcShimUtils.scala   |   5 ++--
 .../org/apache/spark/sql/hive/HiveInspectors.scala |   1 +
 10 files changed, 88 insertions(+), 12 deletions(-)

diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
similarity index 92%
copy from sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
copy to 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
index 1eec8d7..00b710f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.s

[spark] branch master updated (33f532a -> d72ec85)

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 33f532a  [SPARK-31259][CORE] Fix log message about fetch request size 
in ShuffleBlockFetcherIterator
 add d72ec85  [SPARK-31238][SQL] Rebase dates to/from Julian calendar in 
write/read for ORC datasource

No new revisions were added by this update.

Summary of changes:
 .../sql/execution/datasources}/DaysWritable.scala  |  10 ++--
 .../test-data/before_1582_date_v2_4.snappy.orc | Bin 0 -> 201 bytes
 .../execution/datasources/orc/OrcSourceSuite.scala |  28 -
 .../sql/execution/datasources/orc/OrcTest.scala|   5 
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc}/DaysWritable.scala  |  17 ++---
 .../execution/datasources/orc/OrcShimUtils.scala   |   4 +--
 .../execution/datasources/orc/OrcColumnVector.java |  15 ++-
 .../execution/datasources/orc/OrcShimUtils.scala   |   5 ++--
 .../org/apache/spark/sql/hive/HiveInspectors.scala |   1 +
 10 files changed, 88 insertions(+), 12 deletions(-)
 copy sql/{hive/src/main/scala/org/apache/spark/sql/hive => 
core/src/main/scala/org/apache/spark/sql/execution/datasources}/DaysWritable.scala
 (92%)
 create mode 100644 
sql/core/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc
 rename sql/{hive/src/main/scala/org/apache/spark/sql/hive => 
core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc}/DaysWritable.scala
 (81%)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-2.4 updated: Revert "[SPARK-31258][BUILD] Pin the avro version in SBT"

2020-03-26 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new 4217f75  Revert "[SPARK-31258][BUILD] Pin the avro version in SBT"
4217f75 is described below

commit 4217f75b3f05f323018a3a9986ecb9ae587688a8
Author: Dongjoon Hyun 
AuthorDate: Thu Mar 26 13:49:39 2020 -0700

Revert "[SPARK-31258][BUILD] Pin the avro version in SBT"

This reverts commit 916a25a46bca7196416372bacc3fc260a6ef658f.
---
 project/SparkBuild.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 4578857..7ee079c 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -554,8 +554,7 @@ object DependencyOverrides {
 dependencyOverrides += "com.google.guava" % "guava" % "14.0.1",
 dependencyOverrides += "commons-io" % "commons-io" % "2.4",
 dependencyOverrides += "com.fasterxml.jackson.core"  % "jackson-databind" 
% "2.6.7.3",
-dependencyOverrides += "jline" % "jline" % "2.14.6",
-dependencyOverrides += "org.apache.avro" % "avro" % "1.8.2")
+dependencyOverrides += "jline" % "jline" % "2.14.6")
 }
 
 /**


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated (4868b4d -> f94d13f)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 4868b4d  [SPARK-31225][SQL] Override sql method of OuterReference
 add f94d13f  
[SPARK-25556][SPARK-17636][SPARK-31026][SPARK-31060][FOLLOWUP][3.0] Fix build 
error due to conf version

No new revisions were added by this update.

Summary of changes:
 sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 -
 1 file changed, 1 deletion(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (9f0c010 -> fc2a974)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 9f0c010  [SPARK-31277][SQL][TESTS] Migrate `DateTimeTestUtils` from 
`TimeZone` to `ZoneId`
 add fc2a974  [SPARK-31284][SQL][TESTS] Check rebasing of timestamps in ORC 
datasource

No new revisions were added by this update.

Summary of changes:
 .../test-data/before_1582_ts_v2_4.snappy.orc   | Bin 0 -> 251 bytes
 .../execution/datasources/orc/OrcSourceSuite.scala |  28 +
 2 files changed, 28 insertions(+)
 create mode 100644 
sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31284][SQL][TESTS] Check rebasing of timestamps in ORC datasource

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new b6e8f64  [SPARK-31284][SQL][TESTS] Check rebasing of timestamps in ORC 
datasource
b6e8f64 is described below

commit b6e8f64d49caf1f0a1f1b910d603e8e000270d01
Author: Maxim Gekk 
AuthorDate: Fri Mar 27 09:06:59 2020 -0700

[SPARK-31284][SQL][TESTS] Check rebasing of timestamps in ORC datasource

### What changes were proposed in this pull request?
In the PR, I propose 2 tests to check that rebasing of timestamps from/to 
the hybrid calendar (Julian + Gregorian) to/from Proleptic Gregorian calendar 
works correctly.
1. The test `compatibility with Spark 2.4 in reading timestamps` load ORC 
file saved by Spark 2.4.5 via:
```shell
$ export TZ="America/Los_Angeles"
```
```scala
scala> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")

scala> val df = Seq("1001-01-01 
01:02:03.123456").toDF("tsS").select($"tsS".cast("timestamp").as("ts"))
df: org.apache.spark.sql.DataFrame = [ts: timestamp]

scala> df.write.orc("/Users/maxim/tmp/before_1582/2_4_5_ts_orc")

scala> 
spark.read.orc("/Users/maxim/tmp/before_1582/2_4_5_ts_orc").show(false)
+--+
|ts|
+--+
|1001-01-01 01:02:03.123456|
+--+
```
2. The test `rebasing timestamps in write` is a round-trip test. Since the
previous test confirms correct rebasing of timestamps in read, this test should
pass only if rebasing works correctly in write.

### Why are the changes needed?
To guarantee that rebasing works correctly for timestamps in ORC datasource.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
By running `OrcSourceSuite` for Hive 1.2 and 2.3 via the commands:
```
$ build/sbt -Phive-2.3 "test:testOnly *OrcSourceSuite"
```
and
```
$ build/sbt -Phive-1.2 "test:testOnly *OrcSourceSuite"
```

Closes #28047 from MaxGekk/rebase-ts-orc-test.

Authored-by: Maxim Gekk 
    Signed-off-by: Dongjoon Hyun 
(cherry picked from commit fc2a974e030c82bf500a81c3908f853c3eeb761d)
Signed-off-by: Dongjoon Hyun 
---
 .../test-data/before_1582_ts_v2_4.snappy.orc   | Bin 0 -> 251 bytes
 .../execution/datasources/orc/OrcSourceSuite.scala |  28 +
 2 files changed, 28 insertions(+)

diff --git 
a/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc 
b/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc
new file mode 100644
index 000..af9ef04
Binary files /dev/null and 
b/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc differ
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index b5e002f..0b7500c 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -508,6 +508,34 @@ abstract class OrcSuite extends OrcTest with 
BeforeAndAfterAll {
   }
 }
   }
+
+  test("SPARK-31284: compatibility with Spark 2.4 in reading timestamps") {
+Seq(false, true).foreach { vectorized =>
+  withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> 
vectorized.toString) {
+checkAnswer(
+  readResourceOrcFile("test-data/before_1582_ts_v2_4.snappy.orc"),
+  Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456")))
+  }
+}
+  }
+
+  test("SPARK-31284: rebasing timestamps in write") {
+withTempPath { dir =>
+  val path = dir.getAbsolutePath
+  Seq("1001-01-01 01:02:03.123456").toDF("tsS")
+.select($"tsS".cast("timestamp").as("ts"))
+.write
+.orc(path)
+
+  Seq(false, true).foreach { vectorized =>
+withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> 
vectorized.toString) {
+  checkAnswer(
+spark.read.orc(path),
+Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456")))
+}
+  }
+}
+  }
 }
 
 class OrcSourceSuite extends OrcSuite with SharedSparkSession {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (fc2a974 -> f879573)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from fc2a974  [SPARK-31284][SQL][TESTS] Check rebasing of timestamps in ORC 
datasource
 add f879573  [SPARK-31200][K8S] Enforce to use `https` in 
/etc/apt/sources.list

No new revisions were added by this update.

Summary of changes:
 .../kubernetes/docker/src/main/dockerfiles/spark/Dockerfile  | 1 +
 1 file changed, 1 insertion(+)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31200][K8S] Enforce to use `https` in /etc/apt/sources.list

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new c1a03b2  [SPARK-31200][K8S] Enforce to use `https` in 
/etc/apt/sources.list
c1a03b2 is described below

commit c1a03b2233aee77f957dbc94180d2334c61ac088
Author: Prashant Sharma 
AuthorDate: Fri Mar 27 09:13:55 2020 -0700

[SPARK-31200][K8S] Enforce to use `https` in /etc/apt/sources.list

…n progress errors.

### What changes were proposed in this pull request?
Switching to `https` instead of `http` in the debian mirror urls.

### Why are the changes needed?
My ISP was trying to intercept (or serve from cache) the `http` traffic, and
this was causing very confusing errors while building the Spark image. I am
posting this so that anyone who hits the same issue can save some time and
energy.
```
bash-3.2$ bin/docker-image-tool.sh -r scrapcodes -t v3.1.0-f1cc86 build
Sending build context to Docker daemon  203.4MB
Step 1/18 : ARG java_image_tag=8-jre-slim
Step 2/18 : FROM openjdk:${java_image_tag}
 ---> 381b20190cf7
Step 3/18 : ARG spark_uid=185
 ---> Using cache
 ---> 65c06f86753c
Step 4/18 : RUN set -ex && apt-get update && ln -s /lib /lib64 &&   
  apt install -y bash tini libc6 libpam-modules krb5-user libnss3 procps && 
mkdir -p /opt/spark && mkdir -p /opt/spark/examples && mkdir -p 
/opt/spark/work-dir && touch /opt/spark/RELEASE && rm /bin/sh && ln 
-sv /bin/bash /bin/sh && echo "auth required pam_wheel.so use_uid" >> 
/etc/pam.d/su && chgrp root /etc/passwd && chmod ug+rw /etc/passwd && 
rm -rf /var/cache/apt/*
 ---> Running in 96bcbe927d35
+ apt-get update
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian buster-updates InRelease [49.3 kB]
Get:3 http://deb.debian.org/debian buster/main amd64 Packages [7907 kB]
Err:3 http://deb.debian.org/debian buster/main amd64 Packages
  File has unexpected size (13217 != 7906744). Mirror sync in progress? 
[IP: 151.101.10.133 80]
  Hashes of expected file:
   - Filesize:7906744 [weak]
   - SHA256:80ed5d1cc1f31a568b77e4fadfd9e01fa4d65e951243fd2ce29eee14d4b532cc
   - MD5Sum:80b6d9c1b6630b2234161e42f4040ab3 [weak]
  Release file created at: Sat, 08 Feb 2020 10:57:10 +
Get:5 http://deb.debian.org/debian buster-updates/main amd64 Packages [7380 
B]
Err:5 http://deb.debian.org/debian buster-updates/main amd64 Packages
  File has unexpected size (13233 != 7380). Mirror sync in progress? [IP: 
151.101.10.133 80]
  Hashes of expected file:
   - Filesize:7380 [weak]
   - SHA256:6af9ea081b6a3da33cfaf76a81978517f65d38e45230089a5612e56f2b6b789d
  Release file created at: Fri, 20 Mar 2020 02:28:11 +
Get:4 http://security-cdn.debian.org/debian-security buster/updates 
InRelease [65.4 kB]
Get:6 http://security-cdn.debian.org/debian-security buster/updates/main 
amd64 Packages [183 kB]
Fetched 419 kB in 1s (327 kB/s)
Reading package lists...
E: Failed to fetch 
http://deb.debian.org/debian/dists/buster/main/binary-amd64/by-hash/SHA256/80ed5d1cc1f31a568b77e4fadfd9e01fa4d65e951243fd2ce29eee14d4b532cc
  File has unexpected size (13217 != 7906744). Mirror sync in progress? [IP: 
151.101.10.133 80]
   Hashes of expected file:
- Filesize:7906744 [weak]
- 
SHA256:80ed5d1cc1f31a568b77e4fadfd9e01fa4d65e951243fd2ce29eee14d4b532cc
- MD5Sum:80b6d9c1b6630b2234161e42f4040ab3 [weak]
   Release file created at: Sat, 08 Feb 2020 10:57:10 +
E: Failed to fetch 
http://deb.debian.org/debian/dists/buster-updates/main/binary-amd64/by-hash/SHA256/6af9ea081b6a3da33cfaf76a81978517f65d38e45230089a5612e56f2b6b789d
  File has unexpected size (13233 != 7380). Mirror sync in progress? [IP: 
151.101.10.133 80]
   Hashes of expected file:
- Filesize:7380 [weak]
- 
SHA256:6af9ea081b6a3da33cfaf76a81978517f65d38e45230089a5612e56f2b6b789d
   Release file created at: Fri, 20 Mar 2020 02:28:11 +
E: Some index files failed to download. They have been ignored, or old ones 
used instead.
The command '/bin/sh -c set -ex && apt-get update && ln -s /lib 
/lib64 && apt install -y bash tini libc6 libpam-modules krb5-user libnss3 
procps && mkdir -p /opt/spark && mkdir -p /opt/spark/examples && 
mkdir -p /opt/spark/work-dir && touch /opt/spark/RELEASE && rm /bin/sh 
&& ln -sv /bin/bash /bin/sh && echo "auth required pam_wheel.so 
use_uid&quo

[spark] branch master updated (f879573 -> 8a5d496)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from f879573  [SPARK-31200][K8S] Enforce to use `https` in 
/etc/apt/sources.list
 add 8a5d496  [MINOR][DOC] Refine comments of QueryPlan regarding subquery

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/plans/QueryPlan.scala   | 32 ++
 1 file changed, 21 insertions(+), 11 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [MINOR][DOC] Refine comments of QueryPlan regarding subquery

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 7435f45  [MINOR][DOC] Refine comments of QueryPlan regarding subquery
7435f45 is described below

commit 7435f4543ea6f2b927da6055c1cfb75f4a62f19d
Author: Wenchen Fan 
AuthorDate: Fri Mar 27 09:35:35 2020 -0700

[MINOR][DOC] Refine comments of QueryPlan regarding subquery

### What changes were proposed in this pull request?

The query plan of Spark SQL is a mutually recursive structure: QueryPlan -> 
Expression (PlanExpression) -> QueryPlan, but the transformations do not take 
this into account.

This PR refines the comments of `QueryPlan` to highlight this fact.
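
As a quick illustration of that recursion (a sketch only, assuming a local SparkSession; it touches Catalyst internals, which are not a stable API), the plan of an IN-subquery is reachable only through the expressions of the outer plan, not through its children:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.SubqueryExpression

val spark = SparkSession.builder().master("local[1]").appName("queryplan-subquery-demo").getOrCreate()
spark.range(3).createOrReplaceTempView("t")

val plan = spark.sql("SELECT id FROM t WHERE id IN (SELECT 1)").queryExecution.analyzed

// `collect`, inherited from TreeNode, walks only the outer plan tree ...
val outerNodes = plan.collect { case p => p.nodeName }
// ... while the nested plan must be pulled out of the subquery expressions explicitly.
val nestedPlans = plan.flatMap(_.expressions)
  .flatMap(_.collect { case s: SubqueryExpression => s.plan })

println(outerNodes)                   // the outer operators only
println(nestedPlans.map(_.nodeName))  // the plan of (SELECT 1)
```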

### Why are the changes needed?

Better documentation.

### Does this PR introduce any user-facing change?

no

### How was this patch tested?

N/A

Closes #28050 from cloud-fan/comment.

Authored-by: Wenchen Fan 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 8a5d49610d875c473114781e92300c79e24a53cc)
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/catalyst/plans/QueryPlan.scala   | 32 ++
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 1248266..9f86fb2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -23,6 +23,16 @@ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, 
TreeNode, TreeNodeTag
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{DataType, StructType}
 
+/**
+ * An abstraction of the Spark SQL query plan tree, which can be logical or 
physical. This class
+ * defines some basic properties of a query plan node, as well as some new 
transform APIs to
+ * transform the expressions of the plan node.
+ *
+ * Note that, the query plan is a mutually recursive structure:
+ *   QueryPlan -> Expression (subquery) -> QueryPlan
+ * The tree traverse APIs like `transform`, `foreach`, `collect`, etc. that are
+ * inherited from `TreeNode`, do not traverse into query plans inside 
subqueries.
+ */
 abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends 
TreeNode[PlanType] {
   self: PlanType =>
 
@@ -133,7 +143,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] 
extends TreeNode[PlanT
 
   /**
* Returns the result of running [[transformExpressions]] on this node
-   * and all its children.
+   * and all its children. Note that this method skips expressions inside 
subqueries.
*/
   def transformAllExpressions(rule: PartialFunction[Expression, Expression]): 
this.type = {
 transform {
@@ -204,7 +214,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] 
extends TreeNode[PlanT
   }
 
   /**
-   * All the subqueries of current plan.
+   * All the top-level subqueries of the current plan node. Nested subqueries 
are not included.
*/
   def subqueries: Seq[PlanType] = {
 expressions.flatMap(_.collect {
@@ -213,21 +223,21 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] 
extends TreeNode[PlanT
   }
 
   /**
-   * Returns a sequence containing the result of applying a partial function 
to all elements in this
-   * plan, also considering all the plans in its (nested) subqueries
-   */
-  def collectInPlanAndSubqueries[B](f: PartialFunction[PlanType, B]): Seq[B] =
-(this +: subqueriesAll).flatMap(_.collect(f))
-
-  /**
-   * Returns a sequence containing the subqueries in this plan, also including 
the (nested)
-   * subquries in its children
+   * All the subqueries of the current plan node and all its children. Nested 
subqueries are also
+   * included.
*/
   def subqueriesAll: Seq[PlanType] = {
 val subqueries = this.flatMap(_.subqueries)
 subqueries ++ subqueries.flatMap(_.subqueriesAll)
   }
 
+  /**
+   * Returns a sequence containing the result of applying a partial function 
to all elements in this
+   * plan, also considering all the plans in its (nested) subqueries
+   */
+  def collectInPlanAndSubqueries[B](f: PartialFunction[PlanType, B]): Seq[B] =
+(this +: subqueriesAll).flatMap(_.collect(f))
+
   override def innerChildren: Seq[QueryPlan[_]] = subqueries
 
   /**


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (8a5d496 -> aa8776b)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 8a5d496  [MINOR][DOC] Refine comments of QueryPlan regarding subquery
 add aa8776b  [SPARK-29721][SQL] Prune unnecessary nested fields from 
Generate without Project

No new revisions were added by this update.

Summary of changes:
 .../catalyst/optimizer/NestedColumnAliasing.scala  | 53 ++
 .../spark/sql/catalyst/optimizer/Optimizer.scala   | 20 +-
 .../optimizer/NestedColumnAliasingSuite.scala  | 80 ++
 .../execution/datasources/SchemaPruningSuite.scala | 37 ++
 4 files changed, 172 insertions(+), 18 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31271][UI] fix web ui for driver side SQL metrics

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new c4e98c0  [SPARK-31271][UI] fix web ui for driver side SQL metrics
c4e98c0 is described below

commit c4e98c065c99d2cf840e6006ee5414fbaaba9937
Author: Wenchen Fan 
AuthorDate: Fri Mar 27 15:45:35 2020 -0700

[SPARK-31271][UI] fix web ui for driver side SQL metrics

### What changes were proposed in this pull request?

In https://github.com/apache/spark/pull/23551, we changed the metrics type 
of driver-side SQL metrics to size/time etc. which comes with max/min/median 
info.

This doesn't make sense for driver side SQL metrics as they have only one 
value. It makes the web UI hard to read:

![image](https://user-images.githubusercontent.com/3182036/77653892-42db9900-6fab-11ea-8e7f-92f763fa32ff.png)

This PR updates the SQL metrics UI to only display max/min/median if there is
more than one metric value:

![image](https://user-images.githubusercontent.com/3182036/77653975-5f77d100-6fab-11ea-849e-64c935377c8e.png)
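
The selection logic can be pictured with a small hypothetical helper (not the actual `SQLMetrics.stringValue` code; names and formatting are made up): the min/med/max breakdown is only meaningful when a metric was collected from more than one task.

```scala
// Hypothetical sketch: a driver-side metric has a single value, so print it as-is;
// task-side metrics get the total plus the (min, med, max) distribution.
def formatMetric(values: Seq[Long]): String = values match {
  case Seq()     => "0"
  case Seq(only) => only.toString
  case vs =>
    val sorted = vs.sorted
    val (min, med, max) = (sorted.head, sorted(sorted.size / 2), sorted.last)
    s"${vs.sum} (min $min, med $med, max $max)"
}

println(formatMetric(Seq(42L)))         // driver-side metric -> 42
println(formatMetric(Seq(1L, 5L, 3L)))  // per-task metrics   -> 9 (min 1, med 3, max 5)
```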

### Why are the changes needed?

Makes the UI easier to read

### Does this PR introduce any user-facing change?

no

### How was this patch tested?
manual test

Closes #28037 from cloud-fan/ui.

Authored-by: Wenchen Fan 
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/execution/metric/SQLMetrics.scala| 60 +++---
 .../spark/sql/execution/ui/SparkPlanGraph.scala|  7 ++-
 .../sql/execution/metric/SQLMetricsSuite.scala | 33 +++-
 .../sql/execution/metric/SQLMetricsTestUtils.scala | 12 ++---
 .../execution/ui/SQLAppStatusListenerSuite.scala   |  9 ++--
 5 files changed, 68 insertions(+), 53 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
index 1394e0f..92d2179 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
@@ -116,26 +116,23 @@ object SQLMetrics {
 // data size total (min, med, max):
 // 100GB (100MB, 1GB, 10GB)
 val acc = new SQLMetric(SIZE_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
   def createTimingMetric(sc: SparkContext, name: String): SQLMetric = {
 // The final result of this metric in physical operator UI may looks like:
-// duration(min, med, max):
+// duration total (min, med, max):
 // 5s (800ms, 1s, 2s)
 val acc = new SQLMetric(TIMING_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
   def createNanoTimingMetric(sc: SparkContext, name: String): SQLMetric = {
 // Same with createTimingMetric, just normalize the unit of time to 
millisecond.
 val acc = new SQLMetric(NS_TIMING_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
@@ -150,8 +147,7 @@ object SQLMetrics {
 // probe avg (min, med, max):
 // (1.2, 2.2, 6.3)
 val acc = new SQLMetric(AVERAGE_METRIC)
-acc.register(sc, name = Some(s"$name (min, med, max (stageId: taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
@@ -164,13 +160,15 @@ object SQLMetrics {
 metricsType != SUM_METRIC
   }
 
+  private val METRICS_NAME_SUFFIX = "(min, med, max (stageId: taskId))"
+
   /**
* A function that defines how we aggregate the final accumulator results 
among all tasks,
* and represent it in string for a SQL physical operator.
 */
   def stringValue(metricsType: String, values: Array[Long], maxMetrics: 
Array[Long]): String = {
-// stringMetric = "(driver)" OR (stage ${stageId}.${attemptId}: task 
$taskId)
-val stringMetric = if (maxMetrics.isEmpty) {
+// taskInfo = "(driver)" OR (stage ${stageId}.${attemptId}: task $taskId)
+val taskInfo = if (maxMetrics.isEmpty) {
   "(driver)"
 } else {
   s"(stage ${maxMetrics(1)}.${maxMetrics(2)}: task ${maxMetrics(3)})"
@@ -180,18 +178,20 @@ object SQLMetrics {
   numberFormat.format(values.sum)
 } else if (metricsType == AVERAGE_METRIC) {
   val validValues = value

[spark] branch branch-3.0 updated: [SPARK-31271][UI] fix web ui for driver side SQL metrics

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 7c90ec0  [SPARK-31271][UI] fix web ui for driver side SQL metrics
7c90ec0 is described below

commit 7c90ec065f81c3933eef1f0dd172f1a518b1232b
Author: Wenchen Fan 
AuthorDate: Fri Mar 27 15:45:35 2020 -0700

[SPARK-31271][UI] fix web ui for driver side SQL metrics

### What changes were proposed in this pull request?

In https://github.com/apache/spark/pull/23551, we changed the metrics type 
of driver-side SQL metrics to size/time etc. which comes with max/min/median 
info.

This doesn't make sense for driver side SQL metrics as they have only one 
value. It makes the web UI hard to read:

![image](https://user-images.githubusercontent.com/3182036/77653892-42db9900-6fab-11ea-8e7f-92f763fa32ff.png)

This PR updates the SQL metrics UI to only display max/min/median if there is
more than one metric value:

![image](https://user-images.githubusercontent.com/3182036/77653975-5f77d100-6fab-11ea-849e-64c935377c8e.png)

### Why are the changes needed?

Makes the UI easier to read

### Does this PR introduce any user-facing change?

no

### How was this patch tested?
manual test

Closes #28037 from cloud-fan/ui.

Authored-by: Wenchen Fan 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit c4e98c065c99d2cf840e6006ee5414fbaaba9937)
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/execution/metric/SQLMetrics.scala| 60 +++---
 .../spark/sql/execution/ui/SparkPlanGraph.scala|  7 ++-
 .../sql/execution/metric/SQLMetricsSuite.scala | 33 +++-
 .../sql/execution/metric/SQLMetricsTestUtils.scala | 12 ++---
 .../execution/ui/SQLAppStatusListenerSuite.scala   |  9 ++--
 5 files changed, 68 insertions(+), 53 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
index 1394e0f..92d2179 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala
@@ -116,26 +116,23 @@ object SQLMetrics {
 // data size total (min, med, max):
 // 100GB (100MB, 1GB, 10GB)
 val acc = new SQLMetric(SIZE_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
   def createTimingMetric(sc: SparkContext, name: String): SQLMetric = {
 // The final result of this metric in physical operator UI may looks like:
-// duration(min, med, max):
+// duration total (min, med, max):
 // 5s (800ms, 1s, 2s)
 val acc = new SQLMetric(TIMING_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
   def createNanoTimingMetric(sc: SparkContext, name: String): SQLMetric = {
 // Same with createTimingMetric, just normalize the unit of time to 
millisecond.
 val acc = new SQLMetric(NS_TIMING_METRIC, -1)
-acc.register(sc, name = Some(s"$name total (min, med, max (stageId: 
taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
@@ -150,8 +147,7 @@ object SQLMetrics {
 // probe avg (min, med, max):
 // (1.2, 2.2, 6.3)
 val acc = new SQLMetric(AVERAGE_METRIC)
-acc.register(sc, name = Some(s"$name (min, med, max (stageId: taskId))"),
-  countFailedValues = false)
+acc.register(sc, name = Some(name), countFailedValues = false)
 acc
   }
 
@@ -164,13 +160,15 @@ object SQLMetrics {
 metricsType != SUM_METRIC
   }
 
+  private val METRICS_NAME_SUFFIX = "(min, med, max (stageId: taskId))"
+
   /**
* A function that defines how we aggregate the final accumulator results 
among all tasks,
* and represent it in string for a SQL physical operator.
 */
   def stringValue(metricsType: String, values: Array[Long], maxMetrics: 
Array[Long]): String = {
-// stringMetric = "(driver)" OR (stage ${stageId}.${attemptId}: task 
$taskId)
-val stringMetric = if (maxMetrics.isEmpty) {
+// taskInfo = "(driver)" OR (stage ${stageId}.${attemptId}: task $taskId)
+val taskInfo = if (maxMetrics.isEmpty) {
   "(driver)"
 } else {
   s"(stage ${maxMetrics(1)}.${maxMetrics(2)}: task ${maxMetrics(3)})"
@@ -180,18 +178,20 @@ object SQLMetrics {
   

[spark] branch branch-3.0 updated: [SPARK-31238][SPARK-31284][TEST][FOLLOWUP] Fix readResourceOrcFile to create a local file from resource

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 4e13ba9  [SPARK-31238][SPARK-31284][TEST][FOLLOWUP] Fix 
readResourceOrcFile to create a local file from resource
4e13ba9 is described below

commit 4e13ba90446745fc5a9f46ed1f80c6eefb738795
Author: Dongjoon Hyun 
AuthorDate: Fri Mar 27 18:44:53 2020 -0700

[SPARK-31238][SPARK-31284][TEST][FOLLOWUP] Fix readResourceOrcFile to 
create a local file from resource

### What changes were proposed in this pull request?

This PR aims to copy a test resource file to a local file in `OrcTest` 
suite before reading it.

### Why are the changes needed?

SPARK-31238 and SPARK-31284 added test cases to access the resouce file in 
`sql/core` module from `sql/hive` module. In **Maven** test environment, this 
causes a failure.
```
- SPARK-31238: compatibility with Spark 2.4 in reading dates *** FAILED ***
java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative 
path in absolute URI:

jar:file:/home/jenkins/workspace/spark-master-test-maven-hadoop-3.2-hive-2.3-jdk-11/sql/core/target/spark-sql_2.12-3.1.0-SNAPSHOT-tests.jar!/test-data/before_1582_date_v2_4.snappy.orc
```

```
- SPARK-31284: compatibility with Spark 2.4 in reading timestamps *** 
FAILED ***
java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative 
path in absolute URI:

jar:file:/home/jenkins/workspace/spark-master-test-maven-hadoop-3.2-hive-2.3/sql/core/target/spark-sql_2.12-3.1.0-SNAPSHOT-tests.jar!/test-data/before_1582_ts_v2_4.snappy.orc
```

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Pass the Jenkins with Maven.

Closes #28059 from dongjoon-hyun/SPARK-31238.

Authored-by: Dongjoon Hyun 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit d025ddbaa7e7b9746d8e47aeed61ed39d2f09f0e)
Signed-off-by: Dongjoon Hyun 
---
 .../org/apache/spark/sql/execution/datasources/orc/OrcTest.scala   | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala
index 16772fe..e929f90 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
+import org.apache.commons.io.FileUtils
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql._
@@ -136,6 +137,10 @@ abstract class OrcTest extends QueryTest with 
FileBasedDataSourceTest with Befor
 
   protected def readResourceOrcFile(name: String): DataFrame = {
 val url = Thread.currentThread().getContextClassLoader.getResource(name)
-spark.read.orc(url.toString)
+// Copy to avoid URISyntaxException when `sql/hive` accesses the resources 
in `sql/core`
+val file = File.createTempFile("orc-test", ".orc")
+file.deleteOnExit();
+FileUtils.copyURLToFile(url, file)
+spark.read.orc(file.getAbsolutePath)
   }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (c4e98c0 -> d025ddba)

2020-03-27 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from c4e98c0  [SPARK-31271][UI] fix web ui for driver side SQL metrics
 add d025ddba [SPARK-31238][SPARK-31284][TEST][FOLLOWUP] Fix 
readResourceOrcFile to create a local file from resource

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/execution/datasources/orc/OrcTest.scala   | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (0b237bd -> 34c7476)

2020-03-28 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 0b237bd  [SPARK-31292][CORE][SQL] Replace toSet.toSeq with distinct 
for readability
 add 34c7476  [SPARK-30722][DOCS][FOLLOW-UP] Add Pandas Function API into 
the menu

No new revisions were added by this update.

Summary of changes:
 docs/_data/menu-sql.yaml | 2 ++
 1 file changed, 2 insertions(+)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-30722][DOCS][FOLLOW-UP] Add Pandas Function API into the menu

2020-03-28 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 1c4fe31  [SPARK-30722][DOCS][FOLLOW-UP] Add Pandas Function API into 
the menu
1c4fe31 is described below

commit 1c4fe31a5a697c80ca59ed6286ad1423d8541e6a
Author: HyukjinKwon 
AuthorDate: Sat Mar 28 18:36:34 2020 -0700

[SPARK-30722][DOCS][FOLLOW-UP] Add Pandas Function API into the menu

### What changes were proposed in this pull request?

This PR adds "Pandas Function API" into the menu.

### Why are the changes needed?

To be consistent and to make easier to navigate.

### Does this PR introduce any user-facing change?

No, master only.

![Screen Shot 2020-03-27 at 11 40 29 
PM](https://user-images.githubusercontent.com/6477701/77767405-60306600-7084-11ea-944a-93726259cd00.png)

### How was this patch tested?

Manually verified by `SKIP_API=1 jekyll build`.

Closes #28054 from HyukjinKwon/followup-spark-30722.

Authored-by: HyukjinKwon 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 34c7476cb5da98d5f3be354669dcd762df2b75e1)
Signed-off-by: Dongjoon Hyun 
---
 docs/_data/menu-sql.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml
index c17bfd3..8a1d937 100644
--- a/docs/_data/menu-sql.yaml
+++ b/docs/_data/menu-sql.yaml
@@ -67,6 +67,8 @@
   url: 
sql-pyspark-pandas-with-arrow.html#enabling-for-conversion-tofrom-pandas
 - text: "Pandas UDFs (a.k.a. Vectorized UDFs)"
   url: sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs
+- text: "Pandas Function APIs"
+  url: sql-pyspark-pandas-with-arrow.html#pandas-function-apis
 - text: Usage Notes
   url: sql-pyspark-pandas-with-arrow.html#usage-notes
 - text: Migration Guide


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31280][SQL] Perform propagating empty relation after RewritePredicateSubquery

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new f376d24  [SPARK-31280][SQL] Perform propagating empty relation after 
RewritePredicateSubquery
f376d24 is described below

commit f376d24ea1f40740864d38ceb424713372e7e6ce
Author: Kent Yao 
AuthorDate: Sun Mar 29 11:32:22 2020 -0700

[SPARK-31280][SQL] Perform propagating empty relation after 
RewritePredicateSubquery

### What changes were proposed in this pull request?
```sql
scala> spark.sql(" select * from values(1), (2) t(key) where key in (select 
1 as key where 1=0)").queryExecution
res15: org.apache.spark.sql.execution.QueryExecution =
== Parsed Logical Plan ==
'Project [*]
+- 'Filter 'key IN (list#39 [])
   :  +- Project [1 AS key#38]
   : +- Filter (1 = 0)
   :+- OneRowRelation
   +- 'SubqueryAlias t
  +- 'UnresolvedInlineTable [key], [List(1), List(2)]

== Analyzed Logical Plan ==
key: int
Project [key#40]
+- Filter key#40 IN (list#39 [])
   :  +- Project [1 AS key#38]
   : +- Filter (1 = 0)
   :+- OneRowRelation
   +- SubqueryAlias t
  +- LocalRelation [key#40]

== Optimized Logical Plan ==
Join LeftSemi, (key#40 = key#38)
:- LocalRelation [key#40]
+- LocalRelation , [key#38]

== Physical Plan ==
*(1) BroadcastHashJoin [key#40], [key#38], LeftSemi, BuildRight
:- *(1) LocalTableScan [key#40]
+- Br...
```

`LocalRelation  ` should be able to propagate after subqueries are 
lift up to joins

### Why are the changes needed?

optimize query

### Does this PR introduce any user-facing change?

no
### How was this patch tested?

add new tests

Closes #28043 from yaooqinn/SPARK-31280.

Authored-by: Kent Yao 
Signed-off-by: Dongjoon Hyun 
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |  2 ++
 .../catalyst/optimizer/RewriteSubquerySuite.scala  | 17 +++---
 .../apache/spark/sql/execution/PlannerSuite.scala  | 36 ++
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index da147dd..827f528 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -198,6 +198,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
   CheckCartesianProducts) :+
 Batch("RewriteSubquery", Once,
   RewritePredicateSubquery,
+  ConvertToLocalRelation,
+  PropagateEmptyRelation,
   ColumnPruning,
   CollapseProject,
   RemoveNoopOperators) :+
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
index f00d22e..2238afd 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
@@ -22,17 +22,17 @@ import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions.ListQuery
 import org.apache.spark.sql.catalyst.plans.{LeftSemi, PlanTest}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
-import org.apache.spark.sql.catalyst.rules.RuleExecutor
+import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
 
 
 class RewriteSubquerySuite extends PlanTest {
 
-  object Optimize extends RuleExecutor[LogicalPlan] {
+  case class Optimize(addOn: Rule[LogicalPlan]) extends 
RuleExecutor[LogicalPlan] {
 val batches =
   Batch("Column Pruning", FixedPoint(100), ColumnPruning) ::
   Batch("Rewrite Subquery", FixedPoint(1),
 RewritePredicateSubquery,
-ColumnPruning,
+addOn,
 CollapseProject,
 RemoveNoopOperators) :: Nil
   }
@@ -43,7 +43,7 @@ class RewriteSubquerySuite extends PlanTest {
 
 val query = 
relation.where('a.in(ListQuery(relInSubquery.select('x.select('a)
 
-val optimized = Optimize.execute(query.analyze)
+val optimized = Optimize(ColumnPruning).execute(query.analyze)
 val correctAnswer = relation
   .select('a)
   .join(relInSubquery.select('x), LeftSemi, Some('a === 'x))
@@ -52,4 +52,13 @@ class RewriteSubquerySuite extends PlanTest {
 comparePlans(optimized, correctAnswer)
   }
 
+  test("SPARK-31280: Per

[spark] branch master updated (e656e99 -> f376d24)

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from e656e99  [SPARK-30363][SQL][DOCS][FOLLOWUP] Fix a broken link in SQL 
Reference
 add f376d24  [SPARK-31280][SQL] Perform propagating empty relation after 
RewritePredicateSubquery

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |  2 ++
 .../catalyst/optimizer/RewriteSubquerySuite.scala  | 17 +++---
 .../apache/spark/sql/execution/PlannerSuite.scala  | 36 ++
 3 files changed, 51 insertions(+), 4 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages for Kinesis integration

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 83f943c  [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and 
help messages for Kinesis integration
83f943c is described below

commit 83f943c4b922413159cdcc09dfee7ef67d586215
Author: Kengo Seki 
AuthorDate: Sun Mar 29 14:27:19 2020 -0700

[SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages 
for Kinesis integration

### What changes were proposed in this pull request?

This PR (SPARK-31293) fixes wrong command examples, parameter descriptions 
and help message format for Amazon Kinesis integration with Spark Streaming.

### Why are the changes needed?

To improve usability of those commands.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

I ran the fixed commands manually and confirmed they worked as expected.

Closes #28063 from sekikn/SPARK-31293.

Authored-by: Kengo Seki 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 60dd1a690fed62b1d6442cdc8cf3f89ef4304d5a)
Signed-off-by: Dongjoon Hyun 
---
 docs/streaming-kinesis-integration.md  |  3 +--
 .../main/python/examples/streaming/kinesis_wordcount_asl.py|  7 ---
 .../apache/spark/examples/streaming/KinesisWordCountASL.scala  | 10 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/streaming-kinesis-integration.md 
b/docs/streaming-kinesis-integration.md
index e68d513..db813c4 100644
--- a/docs/streaming-kinesis-integration.md
+++ b/docs/streaming-kinesis-integration.md
@@ -246,8 +246,7 @@ To run the example,


 
-./bin/spark-submit --jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+./bin/spark-submit --jars 
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
 [Kinesis app name] [Kinesis stream name] [endpoint URL] [region 
name]
 
diff --git 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
index 49794fa..777a332 100644
--- 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
+++ 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
@@ -26,6 +26,7 @@
  name of the Kinesis stream (ie. mySparkStream)
  endpoint of the Kinesis service
   (e.g. https://kinesis.us-east-1.amazonaws.com)
+ region name of the Kinesis endpoint (e.g. us-east-1)
 
 
   Example:
@@ -34,10 +35,10 @@
   $ export AWS_SECRET_KEY=
 
   # run the example
-  $ bin/spark-submit -jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+  $ bin/spark-submit --jars \
+
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
-myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com
+myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com 
us-east-1
 
   There is a companion helper class called KinesisWordProducerASL which puts 
dummy data
   onto the Kinesis stream.
diff --git 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index a5d5ac7..32f4a67 100644
--- 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -78,7 +78,7 @@ object KinesisWordCountASL extends Logging {
 if (args.length != 3) {
   System.err.println(
 """
-  |Usage: KinesisWordCountASL

+  |Usage: KinesisWordCountASL   
   |
   | is the name of the consumer app, used to track the 
read data in DynamoDB
   | is the name of the Kinesis stream
@@ -171,11 +171,11 @@ object KinesisWordCountASL extends Logging {
  *is the endpoint of the Kinesis service
  * (ie. https://kinesis.us-east-1.amazonaws.com)
  *is the rate of records per second to put onto the stream
- *is the rate of records per second to put onto the 
stream
+ *is the number of words per record
  *
  * Example:
  *$ SPARK_HOME/bin/run-example streaming.KinesisWordProducerASL 
mySparkStream \
- * https://kinesis.us-

[spark] branch master updated: [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages for Kinesis integration

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 60dd1a6  [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and 
help messages for Kinesis integration
60dd1a6 is described below

commit 60dd1a690fed62b1d6442cdc8cf3f89ef4304d5a
Author: Kengo Seki 
AuthorDate: Sun Mar 29 14:27:19 2020 -0700

[SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages 
for Kinesis integration

### What changes were proposed in this pull request?

This PR (SPARK-31293) fixes wrong command examples, parameter descriptions 
and help message format for Amazon Kinesis integration with Spark Streaming.

### Why are the changes needed?

To improve usability of those commands.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

I ran the fixed commands manually and confirmed they worked as expected.

Closes #28063 from sekikn/SPARK-31293.

Authored-by: Kengo Seki 
Signed-off-by: Dongjoon Hyun 
---
 docs/streaming-kinesis-integration.md  |  3 +--
 .../main/python/examples/streaming/kinesis_wordcount_asl.py|  7 ---
 .../apache/spark/examples/streaming/KinesisWordCountASL.scala  | 10 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/streaming-kinesis-integration.md 
b/docs/streaming-kinesis-integration.md
index e68d513..db813c4 100644
--- a/docs/streaming-kinesis-integration.md
+++ b/docs/streaming-kinesis-integration.md
@@ -246,8 +246,7 @@ To run the example,


 
-./bin/spark-submit --jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+./bin/spark-submit --jars 
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
 [Kinesis app name] [Kinesis stream name] [endpoint URL] [region 
name]
 
diff --git 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
index 49794fa..777a332 100644
--- 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
+++ 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
@@ -26,6 +26,7 @@
  name of the Kinesis stream (ie. mySparkStream)
  endpoint of the Kinesis service
   (e.g. https://kinesis.us-east-1.amazonaws.com)
+ region name of the Kinesis endpoint (e.g. us-east-1)
 
 
   Example:
@@ -34,10 +35,10 @@
   $ export AWS_SECRET_KEY=
 
   # run the example
-  $ bin/spark-submit -jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+  $ bin/spark-submit --jars \
+
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
-myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com
+myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com 
us-east-1
 
   There is a companion helper class called KinesisWordProducerASL which puts 
dummy data
   onto the Kinesis stream.
diff --git 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index a5d5ac7..32f4a67 100644
--- 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -78,7 +78,7 @@ object KinesisWordCountASL extends Logging {
 if (args.length != 3) {
   System.err.println(
 """
-  |Usage: KinesisWordCountASL

+  |Usage: KinesisWordCountASL   
   |
   | is the name of the consumer app, used to track the 
read data in DynamoDB
   | is the name of the Kinesis stream
@@ -171,11 +171,11 @@ object KinesisWordCountASL extends Logging {
  *is the endpoint of the Kinesis service
  * (ie. https://kinesis.us-east-1.amazonaws.com)
  *is the rate of records per second to put onto the stream
- *is the rate of records per second to put onto the 
stream
+ *is the number of words per record
  *
  * Example:
  *$ SPARK_HOME/bin/run-example streaming.KinesisWordProducerASL 
mySparkStream \
- * https://kinesis.us-east-1.amazonaws.com us-east-1 10 5
+ * https://kinesis.us-east-1.amazonaws.com 10 5
  */
 object Kinesi

[spark] branch branch-2.4 updated: [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages for Kinesis integration

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new f05ac28a [SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and 
help messages for Kinesis integration
f05ac28a is described below

commit f05ac28a949b9ad5b154ac959b9da6a24a4e09e6
Author: Kengo Seki 
AuthorDate: Sun Mar 29 14:27:19 2020 -0700

[SPARK-31293][DSTREAMS][KINESIS][DOC] Fix wrong examples and help messages 
for Kinesis integration

This PR (SPARK-31293) fixes wrong command examples, parameter descriptions 
and help message format for Amazon Kinesis integration with Spark Streaming.

To improve usability of those commands.

No

I ran the fixed commands manually and confirmed they worked as expected.

Closes #28063 from sekikn/SPARK-31293.

Authored-by: Kengo Seki 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 60dd1a690fed62b1d6442cdc8cf3f89ef4304d5a)
Signed-off-by: Dongjoon Hyun 
---
 docs/streaming-kinesis-integration.md  |  3 +--
 .../main/python/examples/streaming/kinesis_wordcount_asl.py|  7 ---
 .../apache/spark/examples/streaming/KinesisWordCountASL.scala  | 10 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/streaming-kinesis-integration.md 
b/docs/streaming-kinesis-integration.md
index 6a52e8a..0685cd8 100644
--- a/docs/streaming-kinesis-integration.md
+++ b/docs/streaming-kinesis-integration.md
@@ -217,8 +217,7 @@ To run the example,


 
-bin/spark-submit --jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+./bin/spark-submit --jars 
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
 [Kinesis app name] [Kinesis stream name] [endpoint URL] [region 
name]
 
diff --git 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
index 49794fa..777a332 100644
--- 
a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
+++ 
b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
@@ -26,6 +26,7 @@
  name of the Kinesis stream (ie. mySparkStream)
  endpoint of the Kinesis service
   (e.g. https://kinesis.us-east-1.amazonaws.com)
+ region name of the Kinesis endpoint (e.g. us-east-1)
 
 
   Example:
@@ -34,10 +35,10 @@
   $ export AWS_SECRET_KEY=
 
   # run the example
-  $ bin/spark-submit -jars external/kinesis-asl/target/scala-*/\
-spark-streaming-kinesis-asl-assembly_*.jar \
+  $ bin/spark-submit --jars \
+
'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar'
 \
 
external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 \
-myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com
+myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com 
us-east-1
 
   There is a companion helper class called KinesisWordProducerASL which puts 
dummy data
   onto the Kinesis stream.
diff --git 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index fcb790e..d97ab74 100644
--- 
a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ 
b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -78,7 +78,7 @@ object KinesisWordCountASL extends Logging {
 if (args.length != 3) {
   System.err.println(
 """
-  |Usage: KinesisWordCountASL

+  |Usage: KinesisWordCountASL   
   |
   | is the name of the consumer app, used to track the 
read data in DynamoDB
   | is the name of the Kinesis stream
@@ -171,11 +171,11 @@ object KinesisWordCountASL extends Logging {
  *is the endpoint of the Kinesis service
  * (ie. https://kinesis.us-east-1.amazonaws.com)
  *is the rate of records per second to put onto the stream
- *is the rate of records per second to put onto the 
stream
+ *is the number of words per record
  *
  * Example:
  *$ SPARK_HOME/bin/run-example streaming.KinesisWordProducerASL 
mySparkStream \
- * https://kinesis.us-east-1.amazonaws.com us-east-1 10 5
+ * https://kinesis.us-east-1.amazonaws.com 10 5
  */
 object KinesisWordProducerASL {
   def main(args: Array[String]) {
@@ -183,13 +183,

[spark] branch branch-2.4 updated: [SPARK-31101][BUILD][2.4] Upgrade Janino to 3.0.16

2020-03-29 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new 4add8ad  [SPARK-31101][BUILD][2.4] Upgrade Janino to 3.0.16
4add8ad is described below

commit 4add8ad70c19518722d643948de3b78c1fcfefe5
Author: Jungtaek Lim (HeartSaVioR) 
AuthorDate: Sun Mar 29 19:25:57 2020 -0700

[SPARK-31101][BUILD][2.4] Upgrade Janino to 3.0.16

### What changes were proposed in this pull request?

This PR (SPARK-31101) proposes to upgrade Janino to 3.0.16, which was released recently.

* Merged pull request janino-compiler/janino#114 "Grow the code for 
relocatables, and do fixup, and relocate".

Please see the commit log.
- https://github.com/janino-compiler/janino/commits/3.0.16

You can see the changelog at http://janino-compiler.github.io/janino/changelog.html, though the release note for Janino 3.0.16 is actually incorrect.

### Why are the changes needed?

We got reports of a user query failing because Janino throws an error while compiling the generated code. The issue is here: janino-compiler/janino#113. It contains the generated code, the symptom (error), and an analysis of the bug, so please refer to the link for more details.
Janino 3.0.16 contains the PR janino-compiler/janino#114, which enables Janino to compile such queries properly.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Existing UTs.

Below test code fails on branch-2.4 and passes with this patch.

(Note that another UT seems to affect whether this UT fails: adding it to SQLQuerySuite does not make it fail, adding it to DateFunctionsSuite does, and running it on its own in SQLQuerySuite via `build/sbt "sql/testOnly *.SQLQuerySuite -- -z SPARK-31115"` also makes it fail.)

```
  /**
   * NOTE: The test code tries to control the size of for/switch statement 
in expand_doConsume,
   * as well as the overall size of expand_doConsume, so that the query 
triggers known Janino
   * bug - https://github.com/janino-compiler/janino/issues/113.
   *
   * The expected exception message from Janino when we use switch 
statement for "ExpandExec":
   * - "Operand stack inconsistent at offset xxx: Previous size 1, now 0"
   * which will not happen when we use if-else-if statement for 
"ExpandExec".
   *
   * "The number of fields" and "The number of distinct aggregation 
functions" are the major
   * factors to increase the size of generated code: while these values 
should be large enough
   * to trigger the Janino bug, these values should not also too big; 
otherwise one of below
   * exceptions might be thrown:
   * - "expand_doConsume would be beyond 64KB"
   * - "java.lang.ClassFormatError: Too many arguments in method signature 
in class file"
   */
  test("SPARK-31115 Lots of columns and distinct aggregations shouldn't 
break code generation") {
withSQLConf(
  (SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true"),
  (SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key, "1"),
  (SQLConf.CODEGEN_FALLBACK.key, "false"),
  (SQLConf.CODEGEN_LOGGING_MAX_LINES.key, "-1")
) {
  var df = Seq(("1", "2", 1), ("1", "2", 2), ("2", "3", 3), ("2", "3", 
4)).toDF("a", "b", "c")

  // The value is tested under commit 
"244405fe57d7737d81c34ba9e8917df6285889eb":
  // the query fails with switch statement, whereas it passes with 
if-else statement.
  // Note that the value depends on the Spark logic as well - different 
Spark versions may
  // require different value to ensure the test failing with switch 
statement.
  val numNewFields = 100

  df = df.withColumns(
(1 to numNewFields).map { idx => s"a$idx" },
(1 to numNewFields).map { idx =>
  when(col("c").mod(lit(2)).===(lit(0)), 
lit(idx)).otherwise(col("c"))
}
  )

  val aggExprs: Array[Column] = Range(1, numNewFields).map { idx =>
if (idx % 2 == 0) {
  coalesce(countDistinct(s"a$idx"), lit(0))
} else {
  coalesce(count(s"a$idx"), lit(0))
}
  }.toArray

  val aggDf = df
.groupBy("a", "b")
.agg(aggExprs.he

[spark] branch master updated (0d997e5 -> 1d0fc9a)

2020-03-30 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 0d997e5  [SPARK-31219][YARN] Enable closeIdleConnections in 
YarnShuffleService
 add 1d0fc9a  [SPARK-29574][K8S][FOLLOWUP] Fix bash comparison error in 
Docker entrypoint.sh

No new revisions were added by this update.

Summary of changes:
 .../kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh| 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (1d0fc9a -> aa98ac5)

2020-03-30 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 1d0fc9a  [SPARK-29574][K8S][FOLLOWUP] Fix bash comparison error in 
Docker entrypoint.sh
 add aa98ac5  [SPARK-30775][DOC] Improve the description of executor 
metrics in the monitoring documentation

No new revisions were added by this update.

Summary of changes:
 docs/monitoring.md | 58 +++---
 1 file changed, 51 insertions(+), 7 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (aa98ac5 -> cda2e30)

2020-03-30 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from aa98ac5  [SPARK-30775][DOC] Improve the description of executor 
metrics in the monitoring documentation
 add cda2e30  Revert "[SPARK-31280][SQL] Perform propagating empty relation 
after RewritePredicateSubquery"

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |  2 --
 .../catalyst/optimizer/RewriteSubquerySuite.scala  | 17 +++---
 .../apache/spark/sql/execution/PlannerSuite.scala  | 36 --
 3 files changed, 4 insertions(+), 51 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-29574][K8S][FOLLOWUP] Fix bash comparison error in Docker entrypoint.sh

2020-03-31 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 5a96ee7  [SPARK-29574][K8S][FOLLOWUP] Fix bash comparison error in 
Docker entrypoint.sh
5a96ee7 is described below

commit 5a96ee7619ea07edefd030c66641e6e473a890e0
Author: Đặng Minh Dũng 
AuthorDate: Mon Mar 30 15:41:57 2020 -0700

[SPARK-29574][K8S][FOLLOWUP] Fix bash comparison error in Docker 
entrypoint.sh

A small change to fix an error in Docker `entrypoint.sh`

When Spark was running on Kubernetes, I got the following logs:
```log
+ '[' -n ']'
+ '[' -z ']'
++ /bin/hadoop classpath
/opt/entrypoint.sh: line 62: /bin/hadoop: No such file or directory
+ export SPARK_DIST_CLASSPATH=
+ SPARK_DIST_CLASSPATH=
```
This is because some quotes are missing in the bash comparisons.

No

CI

Closes #28075 from dungdm93/patch-1.

Authored-by: Đặng Minh Dũng 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit 1d0fc9aa85b3ad3326b878de49b748413dee1dd9)
Signed-off-by: Dongjoon Hyun 
---
 .../kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh| 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git 
a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh 
b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
index 6ee3523..8218c29 100755
--- 
a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
+++ 
b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
@@ -58,8 +58,8 @@ fi
 
 # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so 
Hadoop jars are available to the executor.
 # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding 
customizations of this value from elsewhere e.g. Docker/K8s.
-if [ -n ${HADOOP_HOME}  ] && [ -z ${SPARK_DIST_CLASSPATH}  ]; then
-  export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath)  
+if [ -n "${HADOOP_HOME}"  ] && [ -z "${SPARK_DIST_CLASSPATH}"  ]; then
+  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
 fi
 
 if ! [ -z ${HADOOP_CONF_DIR+x} ]; then


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-30775][DOC] Improve the description of executor metrics in the monitoring documentation

2020-03-31 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new ca3887a  [SPARK-30775][DOC] Improve the description of executor 
metrics in the monitoring documentation
ca3887a is described below

commit ca3887a0de31fa78097ca7ee92ead914a3ce050c
Author: Luca Canali 
AuthorDate: Mon Mar 30 18:00:54 2020 -0700

[SPARK-30775][DOC] Improve the description of executor metrics in the 
monitoring documentation

### What changes were proposed in this pull request?
This PR (SPARK-30775) aims to improve the description of the executor 
metrics in the monitoring documentation.

### Why are the changes needed?
Improve and clarify the monitoring documentation by:
- adding a reference to the Prometheus endpoint, as implemented in [SPARK-29064] (a brief usage sketch follows this list)
- extending the list and description of executor metrics, following up from [SPARK-27157]
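
As a usage sketch (assumptions: a local master, the default UI port 4040, and a placeholder app name; not taken from the patch), the endpoints described in this change can be enabled as follows:

```scala
// Sketch only: enable the Prometheus executor-metrics endpoint and the
// per-stage executor memory metrics in the event log (the latter also
// requires event logging itself to be enabled).
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("executor-metrics-demo")
  .config("spark.ui.prometheus.enabled", "true")            // serves /metrics/executors/prometheus
  .config("spark.eventLog.logStageExecutorMetrics", "true") // per-stage peaks in the event log
  .getOrCreate()

// While the application runs, executor metrics are available at, for example:
//   http://localhost:4040/metrics/executors/prometheus
//   http://localhost:4040/api/v1/applications/<app-id>/executors
```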

### Does this PR introduce any user-facing change?
Documentation update.

### How was this patch tested?
n.a.

Closes #27526 from LucaCanali/docPrometheusMetricsFollowupSpark29064.

Authored-by: Luca Canali 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit aa98ac52dbbe3fc2d3b152af9324a71f48439a38)
Signed-off-by: Dongjoon Hyun 
---
 docs/monitoring.md | 58 +++---
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/docs/monitoring.md b/docs/monitoring.md
index ba3f1dc..131cd2a 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -689,9 +689,12 @@ A list of the available metrics, with a short description:
 ### Executor Metrics
 
 Executor-level metrics are sent from each executor to the driver as part of 
the Heartbeat to describe the performance metrics of Executor itself like JVM 
heap memory, GC information.
-Executor metric values and their measured peak values per executor are exposed 
via the REST API at the end point `/applications/[app-id]/executors`.
-In addition, aggregated per-stage peak values of the executor metrics are 
written to the event log if `spark.eventLog.logStageExecutorMetrics` is true.
-Executor metrics are also exposed via the Spark metrics system based on the 
Dropwizard metrics library.
+Executor metric values and their measured memory peak values per executor are 
exposed via the REST API in JSON format and in Prometheus format.
+The JSON end point is exposed at: `/applications/[app-id]/executors`, and the 
Prometheus endpoint at: `/metrics/executors/prometheus`.
+The Prometheus endpoint is conditional to a configuration parameter: 
`spark.ui.prometheus.enabled=true` (the default is `false`).
+In addition, aggregated per-stage peak values of the executor memory metrics 
are written to the event log if
+`spark.eventLog.logStageExecutorMetrics` is true.  
+Executor memory metrics are also exposed via the Spark metrics system based on 
the Dropwizard metrics library.
 A list of the available metrics, with a short description:
 
 
@@ -699,21 +702,62 @@ A list of the available metrics, with a short description:
   Short description
   
   
+rddBlocks
+RDD blocks in the block manager of this executor.
+  
+  
+memoryUsed
+Storage memory used by this executor.
+  
+  
+diskUsed
+Disk space used for RDD storage by this executor.
+  
+  
+totalCores
+Number of cores available in this executor.
+  
+  
+maxTasks
+Maximum number of tasks that can run concurrently in this 
executor.
+  
+  
+activeTasks
+Number of tasks currently executing.
+  
+  
+failedTasks
+Number of tasks that have failed in this executor.
+  
+  
+completedTasks
+Number of tasks that have completed in this executor.
+  
+  
+totalTasks
+Total number of tasks (running, failed and completed) in this 
executor.
+  
+  
+totalDuration
+Elapsed time the JVM spent executing tasks in this executor.
+The value is expressed in milliseconds.
+  
+  
 totalGCTime
-Elapsed time the JVM spent in garbage collection summed in this 
Executor.
+Elapsed time the JVM spent in garbage collection summed in this 
executor.
 The value is expressed in milliseconds.
   
   
 totalInputBytes
-Total input bytes summed in this Executor.
+Total input bytes summed in this executor.
   
   
 totalShuffleRead
-Total shuffer read bytes summed in this Executor.
+Total shuffle read bytes summed in this executor.
   
   
 totalShuffleWrite
-Total shuffer write bytes summed in this Executor.
+Total shuffle write bytes summed in this executor.
   
   
 maxMemory


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h

[spark] branch master updated (1a7f964 -> 5ec1814)

2020-03-31 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 1a7f964  [SPARK-31305][SQL][DOCS] Add a page to list all commands in 
SQL Reference
 add 5ec1814  [SPARK-31248][CORE][TEST] Fix flaky 
ExecutorAllocationManagerSuite.interleaving add and remove

No new revisions were added by this update.

Summary of changes:
 core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala | 2 +-
 .../test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala | 5 -
 2 files changed, 5 insertions(+), 2 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31308][PYSPARK] Merging pyFiles to files argument for Non-PySpark applications

2020-03-31 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 20fc6fa  [SPARK-31308][PYSPARK] Merging pyFiles to files argument for 
Non-PySpark applications
20fc6fa is described below

commit 20fc6fa8398b9dc47b9ae7df52133a306f89b25f
Author: Liang-Chi Hsieh 
AuthorDate: Tue Mar 31 18:08:55 2020 -0700

[SPARK-31308][PYSPARK] Merging pyFiles to files argument for Non-PySpark 
applications

### What changes were proposed in this pull request?

This PR (SPARK-31308) proposes to add Python dependencies even when the application is not a Python application.

### Why are the changes needed?

For now, in SparkSubmit we add the `pyFiles` argument to the `files` argument only for Python applications. For the same reason as in #21420 ("for some Spark applications, though they're a java program, they require not only jar dependencies, but also python dependencies."), we need to add `pyFiles` to `files` even when it is not a Python application.

### Does this PR introduce any user-facing change?

Yes. After this change, for non-PySpark applications, the Python files specified by `pyFiles` are also added to `files`, just as for PySpark applications.
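
A hypothetical way to confirm this from a JVM application (the class name, file names, and submit command below are placeholders, not from the change itself):

```scala
// Sketch only: submit a Scala/Java application together with Python dependencies, e.g.
//   spark-submit --class MyApp --py-files deps.py my-app.jar
// and check that the files passed via --py-files were merged into spark.files.
import org.apache.spark.SparkContext

object MyApp {
  def main(args: Array[String]): Unit = {
    val sc = SparkContext.getOrCreate()
    println(sc.getConf.get("spark.files", "")) // expected to include deps.py after this change
    sc.stop()
  }
}
```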

### How was this patch tested?

Manually test on jupyter notebook or do `spark-submit` with `--verbose`.

```
Spark config:
...
(spark.files,file:/Users/dongjoon/PRS/SPARK-PR-28077/a.py)
(spark.submit.deployMode,client)
(spark.master,local[*])
```

Closes #28077 from viirya/pyfile.

Lead-authored-by: Liang-Chi Hsieh 
Co-authored-by: Liang-Chi Hsieh 
Signed-off-by: Dongjoon Hyun 
---
 core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala 
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 4d67dfa..1271a3d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -474,10 +474,12 @@ private[spark] class SparkSubmit extends Logging {
 args.mainClass = "org.apache.spark.deploy.PythonRunner"
 args.childArgs = ArrayBuffer(localPrimaryResource, localPyFiles) ++ 
args.childArgs
   }
-  if (clusterManager != YARN) {
-// The YARN backend handles python files differently, so don't merge 
the lists.
-args.files = mergeFileLists(args.files, args.pyFiles)
-  }
+}
+
+// Non-PySpark applications can need Python dependencies.
+if (deployMode == CLIENT && clusterManager != YARN) {
+  // The YARN backend handles python files differently, so don't merge the 
lists.
+  args.files = mergeFileLists(args.files, args.pyFiles)
 }
 
 if (localPyFiles != null) {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31313][K8S][TEST] Add `m01` node name to support Minikube 1.8.x

2020-04-01 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 08a88e0  [SPARK-31313][K8S][TEST] Add `m01` node name to support 
Minikube 1.8.x
08a88e0 is described below

commit 08a88e04f4417c5a0168592d1f01c300b4ae9de5
Author: Dongjoon Hyun 
AuthorDate: Wed Apr 1 03:42:26 2020 +

[SPARK-31313][K8S][TEST] Add `m01` node name to support Minikube 1.8.x

### What changes were proposed in this pull request?

This PR aims to add `m01` as a node name additionally to `PVTestsSuite`.

### Why are the changes needed?

Minikube 1.8.0 ~ 1.8.2 generates a cluster with the node name `m01`, while all other versions use `minikube`. This causes a `PVTestsSuite` failure.
```
$ minikube --vm-driver=hyperkit start --memory 6000 --cpus 8
* minikube v1.8.2 on Darwin 10.15.3
  - MINIKUBE_ACTIVE_DOCKERD=minikube
* Using the hyperkit driver based on user configuration
* Creating hyperkit VM (CPUs=8, Memory=6000MB, Disk=2MB) ...
* Preparing Kubernetes v1.18.0 on Docker 19.03.6 ...
* Launching Kubernetes ...
* Enabling addons: default-storageclass, storage-provisioner
* Waiting for cluster to come online ...
* Done! kubectl is now configured to use "minikube"

$ kubectl get nodes
NAME   STATUS   ROLES    AGE   VERSION
m01    Ready    master   22s   v1.17.3
```

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

This only adds a new node name, so the K8s Jenkins job should pass.
In addition, the `K8s` integration test suite should be tested manually on `minikube 1.8.2`.

```
KubernetesSuite:
- Run SparkPi with no resources
- Run SparkPi with a very long application name.
- Use SparkLauncher.NO_RESOURCE
- Run SparkPi with a master URL without a scheme.
- Run SparkPi with an argument.
- Run SparkPi with custom labels, annotations, and environment variables.
- All pods have the same service account by default
- Run extraJVMOptions check on driver
- Run SparkRemoteFileTest using a remote data file
- Run SparkPi with env and mount secrets.
- Run PySpark on simple pi.py example
- Run PySpark with Python2 to test a pyfiles example
- Run PySpark with Python3 to test a pyfiles example
- Run PySpark with memory customization
- Run in client mode.
- Start pod creation from template
- PVs with local storage
- Launcher client dependencies
- Test basic decommissioning
- Run SparkR on simple dataframe.R example
Run completed in 10 minutes, 23 seconds.
Total number of tests run: 20
Suites: completed 2, aborted 0
Tests: succeeded 20, failed 0, canceled 0, ignored 0, pending 0
All tests passed.
```

For the above test, Minikube 1.8.2 and K8s v1.18.0 is used.
```
$ minikube version
minikube version: v1.8.2
commit: eb13446e786c9ef70cb0a9f85a633194e62396a1

$ kubectl version --short
Client Version: v1.18.0
Server Version: v1.18.0
```

Closes #28080 from dongjoon-hyun/SPARK-31313.

Authored-by: Dongjoon Hyun 
Signed-off-by: DB Tsai 
(cherry picked from commit dba525c997b0033ac1b6fd24236cd72938f94bbf)
Signed-off-by: Dongjoon Hyun 
---
 .../org/apache/spark/deploy/k8s/integrationtest/PVTestsSuite.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PVTestsSuite.scala
 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PVTestsSuite.scala
index f021821..a7cb84e 100644
--- 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PVTestsSuite.scala
+++ 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PVTestsSuite.scala
@@ -56,7 +56,8 @@ private[spark] trait PVTestsSuite { k8sSuite: KubernetesSuite 
=>
 .withMatchExpressions(new NodeSelectorRequirementBuilder()
   .withKey("kubernetes.io/hostname")
   .withOperator("In")
-  .withValues("minikube", "docker-for-desktop", 
"docker-desktop").build()).build())
+  .withValues("minikube", "m01", "docker-for-desktop", 
"docker-desktop")
+  .build()).build())
 .endRequired()
   .endNodeAffinity()
   .endSpec()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31285][CORE] uppercase schedule mode string at config

2020-04-01 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 2c0e15e  [SPARK-31285][CORE] uppercase schedule mode string at config
2c0e15e is described below

commit 2c0e15e1d0c774d57a801038de4fc826702a7c5c
Author: ulysses 
AuthorDate: Wed Apr 1 11:46:41 2020 -0700

[SPARK-31285][CORE] uppercase schedule mode string at config

### What changes were proposed in this pull request?

In `TaskSchedulerImpl`, Spark uppercases the scheduling mode via `SchedulingMode.withName(schedulingModeConf.toUpperCase(Locale.ROOT))`, but other places, such as [AllJobsPage](https://github.com/apache/spark/blob/5945d46c11a86fd85f9e65f24c2e88f368eee01f/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala#L304), do not. We should have the same behavior and uppercase the scheduling mode string at the config level.

### Why are the changes needed?

Before this PR, it is logically fine to set `spark.scheduler.mode=fair`, but Spark will log the following warning:
```
java.util.NoSuchElementException: No value found for 'fair'
at scala.Enumeration.withName(Enumeration.scala:124)
at 
org.apache.spark.ui.jobs.AllJobsPage$$anonfun$22.apply(AllJobsPage.scala:314)
at 
org.apache.spark.ui.jobs.AllJobsPage$$anonfun$22.apply(AllJobsPage.scala:314)
at scala.Option.map(Option.scala:146)
at org.apache.spark.ui.jobs.AllJobsPage.render(AllJobsPage.scala:314)
at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:90)
at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:90)
at org.apache.spark.ui.JettyUtils$$anon$3.doGet(JettyUtils.scala:90)
```
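
A minimal sketch of the now-accepted lowercase setting (assuming a local master; the app name is a placeholder):

```scala
// Sketch only: with the uppercasing transform on the config entry, a lowercase
// value such as "fair" is normalized wherever Spark reads the typed config, so
// the scheduler and pages like AllJobsPage resolve it as FAIR without hitting
// the NoSuchElementException above.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("scheduler-mode-demo")
  .config("spark.scheduler.mode", "fair") // lowercase is now acceptable
  .getOrCreate()

// Jobs run under FAIR scheduling; the Jobs page of the web UI
// (http://localhost:4040 by default) should no longer log the warning above.
spark.range(0, 1000).count()
```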

### Does this PR introduce any user-facing change?

Almost no.

### How was this patch tested?

Existing tests.

Closes #28049 from ulysses-you/SPARK-31285.

Authored-by: ulysses 
Signed-off-by: Dongjoon Hyun 
---
 core/src/main/scala/org/apache/spark/internal/config/package.scala | 2 ++
 core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala 
b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index f70ee2e..8f8b6ad 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.internal
 
+import java.util.Locale
 import java.util.concurrent.TimeUnit
 
 import org.apache.spark.launcher.SparkLauncher
@@ -1756,6 +1757,7 @@ package object config {
 ConfigBuilder("spark.scheduler.mode")
   .version("0.8.0")
   .stringConf
+  .transform(_.toUpperCase(Locale.ROOT))
   .createWithDefault(SchedulingMode.FIFO.toString)
 
   private[spark] val SCHEDULER_REVIVE_INTERVAL =
diff --git 
a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala 
b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index f0f84fe..718c571 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -141,7 +141,7 @@ private[spark] class TaskSchedulerImpl(
   private val schedulingModeConf = conf.get(SCHEDULER_MODE)
   val schedulingMode: SchedulingMode =
 try {
-  SchedulingMode.withName(schedulingModeConf.toUpperCase(Locale.ROOT))
+  SchedulingMode.withName(schedulingModeConf)
 } catch {
   case e: java.util.NoSuchElementException =>
 throw new SparkException(s"Unrecognized $SCHEDULER_MODE_PROPERTY: 
$schedulingModeConf")


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle Write Time metric in StagePage to web-ui.md

2020-04-01 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b9b1b54  [SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle 
Write Time metric in StagePage to web-ui.md
b9b1b54 is described below

commit b9b1b549af4dd57767fc0c23f5fe7695ca9fa9d5
Author: Kousuke Saruta 
AuthorDate: Wed Apr 1 12:03:41 2020 -0700

[SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle Write Time metric 
in StagePage to web-ui.md

### What changes were proposed in this pull request?

This PR adds a description for `Shuffle Write Time` to `web-ui.md`.

### Why are the changes needed?

#27837 added the `Shuffle Write Time` metric to the task metrics summary, but it 
was not documented yet.

### Does this PR introduce any user-facing change?

Yes.
We can see the description for `Shuffle Write Time` in the new 
`web-ui.html`.
https://user-images.githubusercontent.com/4736016/78175342-a9722280-7495-11ea-9cc6-62c6f3619aa3.png

### How was this patch tested?

Built the docs with `SKIP_API=1 jekyll build` in the `doc` directory and then 
confirmed `web-ui.html`.

Closes #28093 from sarutak/SPARK-31073-doc.

Authored-by: Kousuke Saruta 
Signed-off-by: Dongjoon Hyun 
---
 docs/web-ui.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/web-ui.md b/docs/web-ui.md
index e28a689..c53af80 100644
--- a/docs/web-ui.md
+++ b/docs/web-ui.md
@@ -143,6 +143,7 @@ Summary metrics for all task are represented in a table and 
in a timeline.
 * **Shuffle Read Size / Records**. Total shuffle bytes read, includes both 
data read locally and data read from remote executors.
 * **Shuffle Read Blocked Time** is the time that tasks spent blocked waiting 
for shuffle data to be read from remote machines.
 * **Shuffle Remote Reads** is the total shuffle bytes read from remote 
executors.
+* **Shuffle Write Time** is the time that tasks spent writing shuffle data.
 * **Shuffle spill (memory)** is the size of the deserialized form of the 
shuffled data in memory.
 * **Shuffle spill (disk)** is the size of the serialized form of the data on 
disk.
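
As a supplementary illustration (not part of this doc change; the listener and demo object names here are assumptions), the per-task shuffle write time that the Stage page now documents can also be observed programmatically through the listener API:

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.SparkSession

// Logs each finished task's shuffle write time, the metric described above.
class ShuffleWriteTimeListener extends SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val metrics = taskEnd.taskMetrics
    if (metrics != null) {
      // writeTime is recorded in nanoseconds; convert to milliseconds for readability.
      val writeTimeMs = metrics.shuffleWriteMetrics.writeTime / 1e6
      println(f"stage ${taskEnd.stageId}%d shuffle write time: $writeTimeMs%.2f ms")
    }
  }
}

object ShuffleWriteTimeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("shuffle-write-time-demo")
      .getOrCreate()
    spark.sparkContext.addSparkListener(new ShuffleWriteTimeListener)

    // Force a shuffle so that shuffle write metrics are produced.
    spark.range(0, 1000000L).repartition(8).count()

    spark.stop()
  }
}
```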
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle Write Time metric in StagePage to web-ui.md

2020-04-01 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 5a7395d  [SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle 
Write Time metric in StagePage to web-ui.md
5a7395d is described below

commit 5a7395d296b57c8e06bbf100b4ebdaa1100c3a91
Author: Kousuke Saruta 
AuthorDate: Wed Apr 1 12:03:41 2020 -0700

[SPARK-31073][DOC][FOLLOWUP] Add description for Shuffle Write Time metric 
in StagePage to web-ui.md

### What changes were proposed in this pull request?

This PR adds a description for `Shuffle Write Time` to `web-ui.md`.

### Why are the changes needed?

#27837 added the `Shuffle Write Time` metric to the task metrics summary, but it 
was not documented yet.

### Does this PR introduce any user-facing change?

Yes.
We can see the description for `Shuffle Write Time` in the new 
`web-ui.html`.
https://user-images.githubusercontent.com/4736016/78175342-a9722280-7495-11ea-9cc6-62c6f3619aa3.png

### How was this patch tested?

Built the docs with `SKIP_API=1 jekyll build` in the `doc` directory and then 
confirmed `web-ui.html`.

Closes #28093 from sarutak/SPARK-31073-doc.

Authored-by: Kousuke Saruta 
Signed-off-by: Dongjoon Hyun 
(cherry picked from commit b9b1b549af4dd57767fc0c23f5fe7695ca9fa9d5)
Signed-off-by: Dongjoon Hyun 
---
 docs/web-ui.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/web-ui.md b/docs/web-ui.md
index e28a689..c53af80 100644
--- a/docs/web-ui.md
+++ b/docs/web-ui.md
@@ -143,6 +143,7 @@ Summary metrics for all task are represented in a table and 
in a timeline.
 * **Shuffle Read Size / Records**. Total shuffle bytes read, includes both 
data read locally and data read from remote executors.
 * **Shuffle Read Blocked Time** is the time that tasks spent blocked waiting 
for shuffle data to be read from remote machines.
 * **Shuffle Remote Reads** is the total shuffle bytes read from remote 
executors.
+* **Shuffle Write Time** is the time that tasks spent writing shuffle data.
 * **Shuffle spill (memory)** is the size of the deserialized form of the 
shuffled data in memory.
 * **Shuffle spill (disk)** is the size of the serialized form of the data on 
disk.
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-31322][SQL] rename QueryPlan.collectInPlanAndSubqueries to collectWithSubqueries

2020-04-01 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 09f036a  [SPARK-31322][SQL] rename 
QueryPlan.collectInPlanAndSubqueries to collectWithSubqueries
09f036a is described below

commit 09f036a14cee4825edc73b463e1eebe85ff1c915
Author: Wenchen Fan 
AuthorDate: Wed Apr 1 12:04:40 2020 -0700

[SPARK-31322][SQL] rename QueryPlan.collectInPlanAndSubqueries to 
collectWithSubqueries

### What changes were proposed in this pull request?

rename `QueryPlan.collectInPlanAndSubqueries` to `collectWithSubqueries`

### Why are the changes needed?

The old name is too verbose. `QueryPlan` is internal, but it is the core of 
Catalyst, and we'd better make the API name clearer before we release it.
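
For illustration, a hedged spark-shell style usage sketch of the renamed method (the DataFrame `df` and the choice of scan node are assumptions, not from this patch): `collect` walks only the plan's own tree, while `collectWithSubqueries` also descends into the plans of its (nested) subqueries.

```scala
import org.apache.spark.sql.execution.FileSourceScanExec

// `df` is assumed to be a DataFrame whose query contains subqueries,
// e.g. df = spark.sql("SELECT * FROM t WHERE id IN (SELECT id FROM s)").
val plan = df.queryExecution.executedPlan

// Only scans in this plan's own tree.
val scansInPlan = plan.collect { case s: FileSourceScanExec => s }.size

// Scans in this plan plus the plans of all (nested) subqueries.
val scansWithSubqueries = plan.collectWithSubqueries { case s: FileSourceScanExec => s }.size
```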

### Does this PR introduce any user-facing change?

no

### How was this patch tested?

N/A

Closes #28092 from cloud-fan/rename.

Authored-by: Wenchen Fan 
Signed-off-by: Dongjoon Hyun 
---
 .../main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala| 4 ++--
 .../scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala| 2 +-
 .../scala/org/apache/spark/sql/execution/CollectMetricsExec.scala | 2 +-
 .../scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 9f86fb2..13e5b12 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -232,10 +232,10 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] 
extends TreeNode[PlanT
   }
 
   /**
-   * Returns a sequence containing the result of applying a partial function 
to all elements in this
+   * A variant of `collect`. This method not only apply the given function to 
all elements in this
* plan, also considering all the plans in its (nested) subqueries
*/
-  def collectInPlanAndSubqueries[B](f: PartialFunction[PlanType, B]): Seq[B] =
+  def collectWithSubqueries[B](f: PartialFunction[PlanType, B]): Seq[B] =
 (this +: subqueriesAll).flatMap(_.collect(f))
 
   override def innerChildren: Seq[QueryPlan[_]] = subqueries
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala
index d96f808..91ce187 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala
@@ -78,7 +78,7 @@ class QueryPlanSuite extends SparkFunSuite {
 
 val countRelationsInPlan = plan.collect({ case _: UnresolvedRelation => 1 
}).sum
 val countRelationsInPlanAndSubqueries =
-  plan.collectInPlanAndSubqueries({ case _: UnresolvedRelation => 1 }).sum
+  plan.collectWithSubqueries({ case _: UnresolvedRelation => 1 }).sum
 
 assert(countRelationsInPlan == 2)
 assert(countRelationsInPlanAndSubqueries == 5)
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala
index e482bc9..e1b9c8f 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala
@@ -87,7 +87,7 @@ object CollectMetricsExec {
* Recursively collect all collected metrics from a query tree.
*/
   def collect(plan: SparkPlan): Map[String, Row] = {
-val metrics = plan.collectInPlanAndSubqueries {
+val metrics = plan.collectWithSubqueries {
   case collector: CollectMetricsExec => collector.name -> 
collector.collectedMetrics
 }
 metrics.toMap
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala
index baa9f5e..cdf9ea4 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala
@@ -1234,7 +1234,7 @@ abstract class DynamicPartitionPruningSuiteBase
 
   val plan = df.queryExecution.executedPlan
   val countSubqueryBroadcasts =
-plan.collectInPlanAndSubqueries({ case _: SubqueryBroadcastExec => 1 
}).sum
+plan.collectWithSubqueries({ case _: SubqueryBroadcastExec => 1 }).sum
 
   a
