[spark] 01/01: Preparing development version 2.4.2-SNAPSHOT

2019-02-20 Thread dbtsai
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 0926f49f407201384dd020073b41e7023465b7e9
Author: DB Tsai 
AuthorDate: Thu Feb 21 00:46:07 2019 +

Preparing development version 2.4.2-SNAPSHOT
---
 R/pkg/DESCRIPTION  | 2 +-
 assembly/pom.xml   | 2 +-
 common/kvstore/pom.xml | 2 +-
 common/network-common/pom.xml  | 2 +-
 common/network-shuffle/pom.xml | 2 +-
 common/network-yarn/pom.xml| 2 +-
 common/sketch/pom.xml  | 2 +-
 common/tags/pom.xml| 2 +-
 common/unsafe/pom.xml  | 2 +-
 core/pom.xml   | 2 +-
 docs/_config.yml   | 4 ++--
 examples/pom.xml   | 2 +-
 external/avro/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/flume-assembly/pom.xml| 2 +-
 external/flume-sink/pom.xml| 2 +-
 external/flume/pom.xml | 2 +-
 external/kafka-0-10-assembly/pom.xml   | 2 +-
 external/kafka-0-10-sql/pom.xml| 2 +-
 external/kafka-0-10/pom.xml| 2 +-
 external/kafka-0-8-assembly/pom.xml| 2 +-
 external/kafka-0-8/pom.xml | 2 +-
 external/kinesis-asl-assembly/pom.xml  | 2 +-
 external/kinesis-asl/pom.xml   | 2 +-
 external/spark-ganglia-lgpl/pom.xml| 2 +-
 graphx/pom.xml | 2 +-
 hadoop-cloud/pom.xml   | 2 +-
 launcher/pom.xml   | 2 +-
 mllib-local/pom.xml| 2 +-
 mllib/pom.xml  | 2 +-
 pom.xml| 2 +-
 python/pyspark/version.py  | 2 +-
 repl/pom.xml   | 2 +-
 resource-managers/kubernetes/core/pom.xml  | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml | 2 +-
 resource-managers/mesos/pom.xml| 2 +-
 resource-managers/yarn/pom.xml | 2 +-
 sql/catalyst/pom.xml   | 2 +-
 sql/core/pom.xml   | 2 +-
 sql/hive-thriftserver/pom.xml  | 2 +-
 sql/hive/pom.xml   | 2 +-
 streaming/pom.xml  | 2 +-
 tools/pom.xml  | 2 +-
 43 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 714b6f1..2361289 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.4.1
+Version: 2.4.2
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 8e11fd6..cdf 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.1
+2.4.2-SNAPSHOT
 ../pom.xml
   
 
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index f0eee07..092f85b 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.1
+2.4.2-SNAPSHOT
 ../../pom.xml
   
 
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 8c8bdf4..5236fd6 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.1
+2.4.2-SNAPSHOT
 ../../pom.xml
   
 
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 663f41d..b70dadf 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.1
+2.4.2-SNAPSHOT
 ../../pom.xml
   
 
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index 9acade1..e9ae143 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.1
+2.4.2-SNAPSHOT
 ../../pom.xml
   
 
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 1a31a39..2ae4fcb 100644
--- 

[spark] branch branch-2.4 updated (274142b -> 0926f49)

2019-02-20 Thread dbtsai
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a change to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 274142b  [SPARK-26859][SQL] Fix field writer index bug in 
non-vectorized ORC deserializer
 add 061185b  Preparing Spark release v2.4.1-rc3
 new 0926f49  Preparing development version 2.4.2-SNAPSHOT

The 1 revision listed above as "new" is entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:





[spark] 01/01: Preparing Spark release v2.4.1-rc3

2019-02-20 Thread dbtsai
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a commit to tag v2.4.1-rc3
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 061185b9b872a672e3d58f8bbe819f8f70b33f91
Author: DB Tsai 
AuthorDate: Thu Feb 21 00:45:49 2019 +

Preparing Spark release v2.4.1-rc3
---
 R/pkg/DESCRIPTION  | 2 +-
 assembly/pom.xml   | 2 +-
 common/kvstore/pom.xml | 2 +-
 common/network-common/pom.xml  | 2 +-
 common/network-shuffle/pom.xml | 2 +-
 common/network-yarn/pom.xml| 2 +-
 common/sketch/pom.xml  | 2 +-
 common/tags/pom.xml| 2 +-
 common/unsafe/pom.xml  | 2 +-
 core/pom.xml   | 2 +-
 docs/_config.yml   | 4 ++--
 examples/pom.xml   | 2 +-
 external/avro/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/flume-assembly/pom.xml| 2 +-
 external/flume-sink/pom.xml| 2 +-
 external/flume/pom.xml | 2 +-
 external/kafka-0-10-assembly/pom.xml   | 2 +-
 external/kafka-0-10-sql/pom.xml| 2 +-
 external/kafka-0-10/pom.xml| 2 +-
 external/kafka-0-8-assembly/pom.xml| 2 +-
 external/kafka-0-8/pom.xml | 2 +-
 external/kinesis-asl-assembly/pom.xml  | 2 +-
 external/kinesis-asl/pom.xml   | 2 +-
 external/spark-ganglia-lgpl/pom.xml| 2 +-
 graphx/pom.xml | 2 +-
 hadoop-cloud/pom.xml   | 2 +-
 launcher/pom.xml   | 2 +-
 mllib-local/pom.xml| 2 +-
 mllib/pom.xml  | 2 +-
 pom.xml| 2 +-
 python/pyspark/version.py  | 2 +-
 repl/pom.xml   | 2 +-
 resource-managers/kubernetes/core/pom.xml  | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml | 2 +-
 resource-managers/mesos/pom.xml| 2 +-
 resource-managers/yarn/pom.xml | 2 +-
 sql/catalyst/pom.xml   | 2 +-
 sql/core/pom.xml   | 2 +-
 sql/hive-thriftserver/pom.xml  | 2 +-
 sql/hive/pom.xml   | 2 +-
 streaming/pom.xml  | 2 +-
 tools/pom.xml  | 2 +-
 43 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 2361289..714b6f1 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.4.2
+Version: 2.4.1
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
diff --git a/assembly/pom.xml b/assembly/pom.xml
index cdf..8e11fd6 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.2-SNAPSHOT
+2.4.1
 ../pom.xml
   
 
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 092f85b..f0eee07 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.2-SNAPSHOT
+2.4.1
 ../../pom.xml
   
 
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 5236fd6..8c8bdf4 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.2-SNAPSHOT
+2.4.1
 ../../pom.xml
   
 
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index b70dadf..663f41d 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.2-SNAPSHOT
+2.4.1
 ../../pom.xml
   
 
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index e9ae143..9acade1 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.4.2-SNAPSHOT
+2.4.1
 ../../pom.xml
   
 
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 2ae4fcb..1a31a39 100644
--- a/common/sketch/pom.xml
+++ 

[spark] tag v2.4.1-rc3 created (now 061185b)

2019-02-20 Thread dbtsai
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a change to tag v2.4.1-rc3
in repository https://gitbox.apache.org/repos/asf/spark.git.


  at 061185b  (commit)
This tag includes the following new commits:

 new 061185b  Preparing Spark release v2.4.1-rc3

The 1 revision listed above as "new" is entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.






svn commit: r32577 - /dev/spark/KEYS

2019-02-20 Thread dbtsai
Author: dbtsai
Date: Thu Feb 21 00:41:41 2019
New Revision: 32577

Log:
Update KEYS

Modified:
dev/spark/KEYS

Modified: dev/spark/KEYS
==
--- dev/spark/KEYS (original)
+++ dev/spark/KEYS Thu Feb 21 00:41:41 2019
@@ -888,32 +888,30 @@ pqnGj9s6Uudh/FXfVN5MC0/pH/ySSACkXwCmKXAh
 =4noL
 -END PGP PUBLIC KEY BLOCK-
 
-pub   nistp521 2019-02-20 [SC]
-  A728E15E5059CCC2356EEECFE48ADA6526DF721D
+pub   nistp384 2019-02-21 [SC]
+  F8E155E845606E350C4147710359BC9965359766
 uid   [ultimate] DB Tsai 
-uid   [ultimate] DB Tsai 
 uid   [ultimate] DB Tsai 
+sub   nistp384 2019-02-21 [E]
 
 -BEGIN PGP PUBLIC KEY BLOCK-
 
-mJMEXG3fOxMFK4EEACMEIwQA01FGjsvr1wzKjR0wYXk13KmHocwmskJX7DlUkj/k
-rc+GwJreGef+KffH7IbPNYMLAOr8AyBmMhYu6n90u5+7KKoAM364IXZUDM2nxyCD
-On7tsihd3zCpc6ZzRCsz2tiAzkR48fVF2N93PvLF/MajBv7NJ2T/TDYTA6mb0nQ4
-F7MFvKu0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPojWBBMTCgA7AhsDBQsJ
-CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt
-4VoCGQEACgkQ5IraZSbfch0wxAIInTKRniXwKzGFdsj40uxe5OhjbXLAe6p3IKU9
-MgohUkUsCzuDG8USloLW6VTb/Z08FpXhVQrr+fcUW50+uKgNRcICCQGYZE8vYcvk
-oyaIuetIVss2OmY7d3U+Vc+Bnjh3xVXg1U/m4ztmPO208hIzu8nb/svotZTNX8Xo
-7d6Wnq/FfX70V7QaREIgVHNhaSA8ZF90c2FpQGFwcGxlLmNvbT6I0wQTEwoAOBYh
-BKco4V5QWczCNW7uz+SK2mUm33IdBQJcbeIPAhsDBQsJCAcCBhUKCQgLAgQWAgMB
-Ah4BAheAAAoJEOSK2mUm33IdG+0CCQGfZwg2LSfeftBRas66DPC0IBMhKiyKqdRJ
-MLGLRx5kgzfm5WBaTC53GOyualmk7Qslz/e7W+OfR+zfD/ozO5At9AIHUN0rj+6h
-TENQyv+/9TfCWIySm63fbk1HewBVe99TgNNcnPSGKplOLEChqkc1L2cVq83XbWey
-wnGTe6TX9Fn/jhC0G0RCIFRzYWkgPGRidHNhaUBhcGFjaGUub3JnPojTBBMTCgA4
-FiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt4iYCGwMFCwkIBwIGFQoJCAsCBBYC
-AwECHgECF4AACgkQ5IraZSbfch23pwII8HqQ7hwLKqpVUnO7CrTv6dtpeRpNSnGy
-Z2pLCxbXPBFfC1OVGGlGooKCDr/Sj1u8zin8YQ+GVl4Jgd44FNPP9Y0CCQEEBFyN
-3U1t3ZXxmWpjiRcUAsXrEMyJFbohFwK+FU3/TZWpfouj0JrVbUEJh6BTo7mzVyOj
-oNNkPvWRX+SnhI8A2Q==
-=qGJE
+mG8EXG3xSBMFK4EEACIDAwTFUItdmiASsQ34SfIfDCGtSSqpGQHlSfDB801cJRSK
+tAxO51Xu3E6BSpSTcImHzstwxGj7rkSDSHhiwZG18314+ykKVNHSuIFDdYEUi2aR
+UQ0RWSiGhVS+Eg51v07Zc2q0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPoiz
+BBMTCQA7AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAFiEE+OFV6EVgbjUMQUdx
+A1m8mWU1l2YFAlxt8YECGQEACgkQA1m8mWU1l2Y+tgGAqzmo6XJZ9D0HxKGmvqu3
+HF/DHN2rEkutYfoGk8Wv3I3ALVVa8Qnh8zikO5y4hL4tAX90IL/NDXHoUR2sdsbJ
+tf+Eidnqc6BARrJiMXueH5KvlmtM3nmL8f28+ht8J09SPLm0G0RCIFRzYWkgPGRi
+dHNhaUBhcGFjaGUub3JnPoiwBBMTCQA4FiEE+OFV6EVgbjUMQUdxA1m8mWU1l2YF
+Alxt8ZcCGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AACgkQA1m8mWU1l2bZFwF/
+b46arpjADavN31useQj7cEPPjaPLgTsTKmG51lYZaDZlOXZLFVXKp/S++2IqO3C+
+AX46fL0wL+yPZ1XJ8hz0vkDIzUstC6AH6EjtmD0x/JLOei4wFB/ke631GUVkHrQZ
+lsS4cwRcbfFIEgUrgQQAIgMDBGKOAw+vHLvIS0OQ+U8M4360AFE/anh7M/wn1bFW
+50IdrOOW0Ss19Rib9UzBBW9FHU1udMGJmVOH4aBFRTXr2VOTPcswI1ArG7DoB/bs
+R2uwKt67grmq3MqQj5EJNKUzFwMBCQmImAQYEwkAIBYhBPjhVehFYG41DEFHcQNZ
+vJllNZdmBQJcbfFIAhsMAAoJEANZvJllNZdmUowBf3uHuFhO3IVozWsAnEfB8wdN
+vGQkKNb5uKWrMUAAkeUuY7+AeFDiZFxIcdfrx7B7tgF/c/CS4sgVUAIpDbw/gsfl
+UwauMsN1CWpCIaMEFZYNo//B+auqTseNXMtqqGaIgTXv
+=NYIb
 -END PGP PUBLIC KEY BLOCK-






[spark] branch master updated: [SPARK-26824][SS] Fix the checkpoint location and _spark_metadata when it contains special chars

2019-02-20 Thread zsxwing
This is an automated email from the ASF dual-hosted git repository.

zsxwing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 77b99af  [SPARK-26824][SS] Fix the checkpoint location and 
_spark_metadata when it contains special chars
77b99af is described below

commit 77b99af57330cf2e5016a6acc69642d54041b041
Author: Shixiong Zhu 
AuthorDate: Wed Feb 20 15:44:20 2019 -0800

[SPARK-26824][SS] Fix the checkpoint location and _spark_metadata when it 
contains special chars

## What changes were proposed in this pull request?

When a user specifies a checkpoint location or a file sink output using a path
containing special chars that need to be escaped, the streaming query will
store the checkpoint and file sink metadata in the wrong place. In this PR, I
uploaded a checkpoint that was generated by the following code using Spark
2.4.0 to show this issue:

```
implicit val s = spark.sqlContext
val input = org.apache.spark.sql.execution.streaming.MemoryStream[Int]
input.addData(1, 2, 3)
val q = 
input.toDF.writeStream.format("parquet").option("checkpointLocation", ".../chk 
%#chk").start(".../output %#output")
q.stop()
```
Here is the structure of the directory:
```
sql/core/src/test/resources/structured-streaming/escaped-path-2.4.0
├── chk%252520%252525%252523chk
│   ├── commits
│   │   └── 0
│   ├── metadata
│   └── offsets
│   └── 0
├── output %#output
│   └── part-0-97f675a2-bb82-4201-8245-05f3dae4c372-c000.snappy.parquet
└── output%20%25%23output
└── _spark_metadata
└── 0
```

In this checkpoint, the user specified checkpoint location is `.../chk 
%#chk` but the real path to store the checkpoint is 
`.../chk%252520%252525%252523chk` (this is generated by escaping the original 
path three times). The user specified output path is `.../output %#output` but 
the path to store `_spark_metadata` is 
`.../output%20%25%23output/_spark_metadata` (this is generated by escaping the 
original path once). The data files are still in the correct path (such as 
`.../output %#ou [...]

This checkpoint will be used in unit tests in this PR.

The fix simply removes the improper `Path.toUri` calls.
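
The repeated escaping described above can be reproduced without any Spark
code; below is a minimal sketch using only `java.net.URI` (the path is
illustrative, and this is not the code path the fix touches):

```
import java.net.URI

// Each round trip through the multi-argument URI constructor percent-encodes
// the previous result again, which is how "chk %#chk" can end up as the
// triply escaped "chk%252520%252525%252523chk" seen in the listing above.
object EscapeDemo {
  def main(args: Array[String]): Unit = {
    val raw    = "/tmp/chk %#chk"
    val once   = new URI(null, null, raw, null).getRawPath    // /tmp/chk%20%25%23chk
    val twice  = new URI(null, null, once, null).getRawPath   // /tmp/chk%2520%2525%2523chk
    val thrice = new URI(null, null, twice, null).getRawPath  // /tmp/chk%252520%252525%252523chk
    Seq(raw, once, twice, thrice).foreach(println)
  }
}
```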

However, as users may not have read the release notes and be aware of this
checkpoint location change, if they upgrade Spark without moving the checkpoint
to the new location, their query will just start from scratch. In order not to
surprise users, this PR also adds a check to **detect the impacted paths and
throw an error** that includes the migration guide. This check can be turned
off by the internal SQL conf
`spark.sql.streaming.checkpoint.escapedPathCheck.enabled`. Here are ex [...]

- Streaming checkpoint error:
```
Error: we detected a possible problem with the location of your checkpoint 
and you
likely need to move it before restarting this query.

Earlier version of Spark incorrectly escaped paths when writing out 
checkpoints for
structured streaming. While this was corrected in Spark 3.0, it appears 
that your
query was started using an earlier version that incorrectly handled the 
checkpoint
path.

Correct Checkpoint Directory: /.../chk %#chk
Incorrect Checkpoint Directory: /.../chk%252520%252525%252523chk

Please move the data from the incorrect directory to the correct one, 
delete the
incorrect directory, and then restart this query. If you believe you are 
receiving
this message in error, you can disable it with the SQL conf
spark.sql.streaming.checkpoint.escapedPathCheck.enabled.
```

- File sink error (`_spark_metadata`):
```
Error: we detected a possible problem with the location of your 
"_spark_metadata"
directory and you likely need to move it before restarting this query.

Earlier version of Spark incorrectly escaped paths when writing out the
"_spark_metadata" directory for structured streaming. While this was 
corrected in
Spark 3.0, it appears that your query was started using an earlier version 
that
incorrectly handled the "_spark_metadata" path.

Correct "_spark_metadata" Directory: /.../output %#output/_spark_metadata
Incorrect "_spark_metadata" Directory: 
/.../output%20%25%23output/_spark_metadata

Please move the data from the incorrect directory to the correct one, 
delete the
incorrect directory, and then restart this query. If you believe you are 
receiving
this message in error, you can disable it with the SQL conf
spark.sql.streaming.checkpoint.escapedPathCheck.enabled.
```
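
For reference, the escape hatch named in both error messages can be set like
any other SQL conf; a minimal, hypothetical sketch (the local master and app
name are placeholders, not taken from the patch):

```
import org.apache.spark.sql.SparkSession

// Disable the escaped-path check mentioned in the error messages above. This
// only sketches setting the conf; whether disabling it is appropriate depends
// on the query and its checkpoint history.
object DisableEscapedPathCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
    spark.conf.set("spark.sql.streaming.checkpoint.escapedPathCheck.enabled", "false")
    spark.stop()
  }
}
```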

## How was this patch tested?

The new unit tests.

Closes #23733 from zsxwing/path-fix.

Authored-by: Shixiong Zhu 

svn commit: r32573 - /dev/spark/KEYS

2019-02-20 Thread dbtsai
Author: dbtsai
Date: Wed Feb 20 23:32:09 2019
New Revision: 32573

Log:
Update KEYS

Modified:
dev/spark/KEYS

Modified: dev/spark/KEYS
==
--- dev/spark/KEYS (original)
+++ dev/spark/KEYS Wed Feb 20 23:32:09 2019
@@ -887,3 +887,33 @@ pqnGj9s6Uudh/FXfVN5MC0/pH/ySSACkXwCmKXAh
 /G8Bpm/p4kbeqJtsx3t7nhPke7fG
 =4noL
 -END PGP PUBLIC KEY BLOCK-
+
+pub   nistp521 2019-02-20 [SC]
+  A728E15E5059CCC2356EEECFE48ADA6526DF721D
+uid   [ultimate] DB Tsai 
+uid   [ultimate] DB Tsai 
+uid   [ultimate] DB Tsai 
+
+-BEGIN PGP PUBLIC KEY BLOCK-
+
+mJMEXG3fOxMFK4EEACMEIwQA01FGjsvr1wzKjR0wYXk13KmHocwmskJX7DlUkj/k
+rc+GwJreGef+KffH7IbPNYMLAOr8AyBmMhYu6n90u5+7KKoAM364IXZUDM2nxyCD
+On7tsihd3zCpc6ZzRCsz2tiAzkR48fVF2N93PvLF/MajBv7NJ2T/TDYTA6mb0nQ4
+F7MFvKu0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPojWBBMTCgA7AhsDBQsJ
+CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt
+4VoCGQEACgkQ5IraZSbfch0wxAIInTKRniXwKzGFdsj40uxe5OhjbXLAe6p3IKU9
+MgohUkUsCzuDG8USloLW6VTb/Z08FpXhVQrr+fcUW50+uKgNRcICCQGYZE8vYcvk
+oyaIuetIVss2OmY7d3U+Vc+Bnjh3xVXg1U/m4ztmPO208hIzu8nb/svotZTNX8Xo
+7d6Wnq/FfX70V7QaREIgVHNhaSA8ZF90c2FpQGFwcGxlLmNvbT6I0wQTEwoAOBYh
+BKco4V5QWczCNW7uz+SK2mUm33IdBQJcbeIPAhsDBQsJCAcCBhUKCQgLAgQWAgMB
+Ah4BAheAAAoJEOSK2mUm33IdG+0CCQGfZwg2LSfeftBRas66DPC0IBMhKiyKqdRJ
+MLGLRx5kgzfm5WBaTC53GOyualmk7Qslz/e7W+OfR+zfD/ozO5At9AIHUN0rj+6h
+TENQyv+/9TfCWIySm63fbk1HewBVe99TgNNcnPSGKplOLEChqkc1L2cVq83XbWey
+wnGTe6TX9Fn/jhC0G0RCIFRzYWkgPGRidHNhaUBhcGFjaGUub3JnPojTBBMTCgA4
+FiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt4iYCGwMFCwkIBwIGFQoJCAsCBBYC
+AwECHgECF4AACgkQ5IraZSbfch23pwII8HqQ7hwLKqpVUnO7CrTv6dtpeRpNSnGy
+Z2pLCxbXPBFfC1OVGGlGooKCDr/Sj1u8zin8YQ+GVl4Jgd44FNPP9Y0CCQEEBFyN
+3U1t3ZXxmWpjiRcUAsXrEMyJFbohFwK+FU3/TZWpfouj0JrVbUEJh6BTo7mzVyOj
+oNNkPvWRX+SnhI8A2Q==
+=qGJE
+-END PGP PUBLIC KEY BLOCK-






[spark] branch master updated: [SPARK-26892][CORE] Fix saveAsTextFile throws NullPointerException when null row present

2019-02-20 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 2153b31  [SPARK-26892][CORE] Fix saveAsTextFile throws 
NullPointerException when null row present
2153b31 is described below

commit 2153b316bda119ede8c80ceda522027a6581031b
Author: liupengcheng 
AuthorDate: Wed Feb 20 16:42:55 2019 -0600

[SPARK-26892][CORE] Fix saveAsTextFile throws NullPointerException when 
null row present

## What changes were proposed in this pull request?

Currently, RDD.saveAsTextFile may throw a NullPointerException when a null row
is present.
```
scala> sc.parallelize(Seq(1,null),1).saveAsTextFile("/tmp/foobar.dat")
19/02/15 21:39:17 ERROR Utils: Aborting task
java.lang.NullPointerException
at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$3(RDD.scala:1510)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at 
org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$executeTask$1(SparkHadoopWriter.scala:129)
at 
org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1352)
at 
org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:127)
at 
org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:425)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1318)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:428)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
```

This PR writes "Null" for null rows to avoid the NPE and fix the issue.
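
One part of the diff below also changes `Some(codec)` to `Option(codec)`; the
standard-library behavior that change relies on is sketched here (plain Scala,
no Spark, and the variable name is made up):

```
// Option(x) guards against null while Some(x) does not, so Option is the safe
// wrapper when the wrapped value may be null.
object OptionVsSome {
  def main(args: Array[String]): Unit = {
    val maybeNull: String = null
    println(Some(maybeNull))   // Some(null) -- the null slips through
    println(Option(maybeNull)) // None       -- the null is absorbed safely
  }
}
```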

## How was this patch tested?

NA

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #23799 from 
liupc/Fix-saveAsTextFile-throws-NullPointerException-when-null-row-present.

Lead-authored-by: liupengcheng 
Co-authored-by: Liupengcheng 
Signed-off-by: Sean Owen 
---
 .../org/apache/spark/rdd/PairRDDFunctions.scala|  2 +-
 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 32 +++---
 .../test/scala/org/apache/spark/FileSuite.scala|  8 ++
 3 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala 
b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 8b5f9bb..7f8064f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -1012,7 +1012,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   outputFormatClass: Class[_ <: OutputFormat[_, _]],
   codec: Class[_ <: CompressionCodec]): Unit = self.withScope {
 saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass,
-  new JobConf(self.context.hadoopConfiguration), Some(codec))
+  new JobConf(self.context.hadoopConfiguration), Option(codec))
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index d39f418..1b67e99 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1492,45 +1492,21 @@ abstract class RDD[T: ClassTag](
* Save this RDD as a text file, using string representations of elements.
*/
   def saveAsTextFile(path: String): Unit = withScope {
-// https://issues.apache.org/jira/browse/SPARK-2075
-//
-// NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot 
find an implicit
-// Ordering for it and will use the default `null`. However, it's a 
`Comparable[NullWritable]`
-// in Hadoop 2.+, so the compiler will call the implicit 
`Ordering.ordered` method to create an
-// Ordering for `NullWritable`. That's why the compiler will generate 
different anonymous
-// classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
-//
-// Therefore, here we provide an explicit Ordering `null` to make sure the 
compiler generate
-// same bytecodes for `saveAsTextFile`.
-val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
-val textClassTag = implicitly[ClassTag[Text]]
-val r = this.mapPartitions { iter =>
-  val text = new Text()
-  iter.map { x =>
-text.set(x.toString)
-(NullWritable.get(), text)
-  }
-}
-RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
-  

[spark] branch master updated: [SPARK-26877][YARN] Support user-level app staging directory in yarn mode when spark.yarn…

2019-02-20 Thread vanzin
This is an automated email from the ASF dual-hosted git repository.

vanzin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new eb6fd7e  [SPARK-26877][YARN] Support user-level app staging directory 
in yarn mode when spark.yarn…
eb6fd7e is described below

commit eb6fd7eab77d3d5b2e7e827a21b127b146e5c089
Author: Liupengcheng 
AuthorDate: Wed Feb 20 11:45:12 2019 -0800

[SPARK-26877][YARN] Support user-level app staging directory in yarn mode 
when spark.yarn…

Currently, when running applications in yarn mode, the app staging directory
is controlled by the `spark.yarn.stagingDir` config if specified, and this
directory does not separate different users, which can make file and quota
management inconvenient for users.

Sometimes there might be an unexpected increase in staging files; two possible
reasons are:
1. The `spark.yarn.preserve.staging.files` setting can be misused by users.
2. A cron task constantly starting new applications on a non-existent yarn
queue (wrong configuration).

But currently it is not easy to find out which user owns the most HDFS files
or space. What's more, even if we wanted to set an HDFS name quota or space
quota for each user to limit the growth, that would be impossible.

So I propose adding per-user subdirectories under this app staging directory,
which is clearer; a small sketch of the resulting layout follows.
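
A minimal sketch of the proposed layout (assuming `hadoop-common` on the
classpath; the directory and user name are illustrative, not taken from the
patch):

```
import org.apache.hadoop.fs.Path

// Append a per-user subdirectory under the configured staging directory, so
// each user's staging files live under their own path.
object StagingDirLayout {
  def main(args: Array[String]): Unit = {
    val stagingDir = "/spark-staging"  // what spark.yarn.stagingDir might point to
    val user       = "alice"           // in the patch this comes from UserGroupInformation
    val appStagingBaseDir = new Path(stagingDir, user)
    println(appStagingBaseDir)         // /spark-staging/alice
  }
}
```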

existing UT

Closes #23786 from liupc/Support-user-level-app-staging-dir.

Authored-by: Liupengcheng 
Signed-off-by: Marcelo Vanzin 
---
 .../yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
 
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 6ca81fb..e0dba8c 100644
--- 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ 
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -177,7 +177,8 @@ private[spark] class Client(
 
   // The app staging dir based on the STAGING_DIR configuration if 
configured
   // otherwise based on the users home directory.
-  val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
+  val appStagingBaseDir = sparkConf.get(STAGING_DIR)
+.map { new Path(_, 
UserGroupInformation.getCurrentUser.getShortUserName) }
 .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
   stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
 





[spark] branch master updated: [SPARK-26900][SQL] Simplify truncation to quarter of year

2019-02-20 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 331ac60  [SPARK-26900][SQL] Simplify truncation to quarter of year
331ac60 is described below

commit 331ac60f2818f191efb583ba4e73467f2a9d7d17
Author: Maxim Gekk 
AuthorDate: Wed Feb 20 08:55:08 2019 -0600

[SPARK-26900][SQL] Simplify truncation to quarter of year

## What changes were proposed in this pull request?

In the PR, I propose to simplify timestamp truncation to the quarter of the
year by using the *java.time* API directly. The `LocalDate` instance can be
truncated to the start of its quarter by adjusting the chrono field
`IsoFields.DAY_OF_QUARTER`.
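
The adjustment is the same one the diff below applies inside `DateTimeUtils`;
a minimal standalone sketch with plain `java.time` (the sample date is
arbitrary):

```
import java.time.LocalDate
import java.time.temporal.IsoFields

// Setting DAY_OF_QUARTER to 1 truncates a date to the first day of its quarter.
object QuarterTruncation {
  def main(args: Array[String]): Unit = {
    val d = LocalDate.of(2019, 2, 20)
    val quarterStart = d.`with`(IsoFields.DAY_OF_QUARTER, 1L)
    println(quarterStart) // 2019-01-01
  }
}
```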

## How was this patch tested?

This was checked by the existing test suite `DateTimeUtilsSuite`.

Closes #23808 from MaxGekk/date-quarter-of-year.

Authored-by: Maxim Gekk 
Signed-off-by: Sean Owen 
---
 .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 14 +++---
 sql/core/benchmarks/DateTimeBenchmark-results.txt  |  6 +++---
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index b537695..28fb6a9 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -719,17 +719,9 @@ object DateTimeUtils {
 daysToMillis(prevMonday, timeZone)
   case TRUNC_TO_QUARTER =>
 val dDays = millisToDays(millis, timeZone)
-val month = getQuarter(dDays) match {
-  case 1 => Month.JANUARY
-  case 2 => Month.APRIL
-  case 3 => Month.JULY
-  case 4 => Month.OCTOBER
-}
-millis = daysToMillis(truncDate(dDays, TRUNC_TO_MONTH), timeZone)
-val instant = Instant.ofEpochMilli(millis)
-val localDateTime = LocalDateTime.ofInstant(instant, timeZone.toZoneId)
-val truncated = localDateTime.withMonth(month.getValue)
-truncated.atZone(timeZone.toZoneId).toInstant.toEpochMilli
+val daysOfQuarter = LocalDate.ofEpochDay(dDays)
+  .`with`(IsoFields.DAY_OF_QUARTER, 1L).toEpochDay.toInt
+daysToMillis(daysOfQuarter, timeZone)
   case _ =>
 // caller make sure that this should never be reached
 sys.error(s"Invalid trunc level: $level")
diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt 
b/sql/core/benchmarks/DateTimeBenchmark-results.txt
index 8bbe310..bc824af 100644
--- a/sql/core/benchmarks/DateTimeBenchmark-results.txt
+++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt
@@ -324,12 +324,12 @@ date_trunc WEEK: Best/Avg 
Time(ms)Rate(M/s)   Per Ro
 date_trunc WEEK wholestage off 794 /  797 12.6 
 79.4   1.0X
 date_trunc WEEK wholestage on  754 /  761 13.3 
 75.4   1.1X
 
-Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-ea-b03 on Mac OS X 10.14.2
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.3
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 date_trunc QUARTER:  Best/Avg Time(ms)Rate(M/s)   Per 
Row(ns)   Relative
 

-date_trunc QUARTER wholestage off 5261 / 5271  1.9 
526.1   1.0X
-date_trunc QUARTER wholestage on  5145 / 5151  1.9 
514.5   1.0X
+date_trunc QUARTER wholestage off 1465 / 1467  6.8 
146.5   1.0X
+date_trunc QUARTER wholestage on  1419 / 1423  7.0 
141.9   1.0X
 
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-ea-b03 on Mac OS X 10.14.2
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz





[spark] branch master updated: [SPARK-22798][PYTHON][ML] Add multiple column support to PySpark StringIndexer

2019-02-20 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 74e9e1c  [SPARK-22798][PYTHON][ML] Add multiple column support to 
PySpark StringIndexer
74e9e1c is described below

commit 74e9e1c192f00920b69aa47813e2ac2f4e9b4325
Author: Huaxin Gao 
AuthorDate: Wed Feb 20 08:52:46 2019 -0600

[SPARK-22798][PYTHON][ML] Add multiple column support to PySpark 
StringIndexer

## What changes were proposed in this pull request?

Add multiple column support to PySpark StringIndexer

## How was this patch tested?

Add doctest

Closes #23741 from huaxingao/spark-22798.

Authored-by: Huaxin Gao 
Signed-off-by: Sean Owen 
---
 python/pyspark/ml/feature.py| 60 -
 python/pyspark/ml/tests/test_wrapper.py |  8 -
 python/pyspark/ml/wrapper.py| 22 ++--
 3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 23d56c8..0d1e9bd 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2290,7 +2290,8 @@ class StandardScalerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
 return self._call_java("mean")
 
 
-class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, 
HasOutputCol):
+class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, 
HasOutputCol,
+   HasInputCols, HasOutputCols):
 """
 Params for :py:attr:`StringIndexer` and :py:attr:`StringIndexerModel`.
 """
@@ -2371,16 +2372,37 @@ class StringIndexer(JavaEstimator, 
_StringIndexerParams, JavaMLReadable, JavaMLW
 >>> sorted(set([(i[0], i[1]) for i in result.select(result.id, 
result.indexed).collect()]),
 ... key=lambda x: x[0])
 [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]
+>>> testData = sc.parallelize([Row(id=0, label1="a", label2="e"),
+...Row(id=1, label1="b", label2="f"),
+...Row(id=2, label1="c", label2="e"),
+...Row(id=3, label1="a", label2="f"),
+...Row(id=4, label1="a", label2="f"),
+...Row(id=5, label1="c", label2="f")], 3)
+>>> multiRowDf = spark.createDataFrame(testData)
+>>> inputs = ["label1", "label2"]
+>>> outputs = ["index1", "index2"]
+>>> stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)
+>>> model = stringIndexer.fit(multiRowDf)
+>>> result = model.transform(multiRowDf)
+>>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, 
result.index1,
+... result.index2).collect()]), key=lambda x: x[0])
+[(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 
0.0), (5, 1.0, 0.0)]
+>>> fromlabelsModel = StringIndexerModel.from_arrays_of_labels([["a", "b", 
"c"], ["e", "f"]],
+... inputCols=inputs, outputCols=outputs)
+>>> result = fromlabelsModel.transform(multiRowDf)
+>>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, 
result.index1,
+... result.index2).collect()]), key=lambda x: x[0])
+[(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 
1.0), (5, 2.0, 1.0)]
 
 .. versionadded:: 1.4.0
 """
 
 @keyword_only
-def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
- stringOrderType="frequencyDesc"):
+def __init__(self, inputCol=None, outputCol=None, inputCols=None, 
outputCols=None,
+ handleInvalid="error", stringOrderType="frequencyDesc"):
 """
-__init__(self, inputCol=None, outputCol=None, handleInvalid="error", \
- stringOrderType="frequencyDesc")
+__init__(self, inputCol=None, outputCol=None, inputCols=None, 
outputCols=None, \
+ handleInvalid="error", stringOrderType="frequencyDesc")
 """
 super(StringIndexer, self).__init__()
 self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
@@ -2389,11 +2411,11 @@ class StringIndexer(JavaEstimator, 
_StringIndexerParams, JavaMLReadable, JavaMLW
 
 @keyword_only
 @since("1.4.0")
-def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
-  stringOrderType="frequencyDesc"):
+def setParams(self, inputCol=None, outputCol=None, inputCols=None, 
outputCols=None,
+  handleInvalid="error", stringOrderType="frequencyDesc"):
 """
-setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \
-  stringOrderType="frequencyDesc")
+setParams(self, inputCol=None, outputCol=None, inputCols=None, 
outputCols=None, \
+  

[spark] branch branch-2.4 updated: [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

2019-02-20 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new 274142b  [SPARK-26859][SQL] Fix field writer index bug in 
non-vectorized ORC deserializer
274142b is described below

commit 274142be08eb3a4239046d7f7260c7284ed041c2
Author: Ivan Vergiliev 
AuthorDate: Wed Feb 20 21:49:38 2019 +0800

[SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC 
deserializer

## What changes were proposed in this pull request?

This happens in a schema evolution use case only when a user specifies the
schema manually and uses the non-vectorized ORC deserializer code path.

There is a bug in `OrcDeserializer.scala` that results in `null`s being set at
the wrong column position, and in state from previous records remaining
uncleared in subsequent records. There are more details on when exactly the bug
gets triggered and what the outcome is in the
[JIRA issue](https://jira.apache.org/jira/browse/SPARK-26859).

The high-level summary is that this bug results in severe data correctness
issues, but fortunately the set of conditions needed to expose the bug is
complicated and keeps the surface area somewhat small.

This change fixes the problem and adds a respective test.
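
A minimal sketch of the indexing mismatch in plain Scala (column names and ids
are made up; this is not the actual `OrcDeserializer` code):

```
// With a missing middle column, filtering before mapping compacts the writer
// array, so the writer built for target column 2 ends up at slot 1. Keeping
// one slot per target column (null for missing ones) preserves the alignment.
object FieldWriterIndexDemo {
  def main(args: Array[String]): Unit = {
    val requestedColIds = Array(0, -1, 1)      // column "b" is missing from the file
    val requiredSchema  = Seq("a", "b", "c")

    val compacted = requiredSchema.zipWithIndex
      .filterNot { case (_, i) => requestedColIds(i) == -1 }
      .map { case (f, i) => s"writer($f) built for target index $i" }
    compacted.zipWithIndex.foreach { case (w, slot) => println(s"buggy slot $slot -> $w") }

    val aligned = requiredSchema.zipWithIndex.map { case (f, i) =>
      if (requestedColIds(i) == -1) null else s"writer($f)"
    }
    aligned.zipWithIndex.foreach { case (w, slot) => println(s"fixed slot $slot -> $w") }
  }
}
```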

## How was this patch tested?

Pass the Jenkins with the newly added test cases.

Closes #23766 from IvanVergiliev/fix-orc-deserializer.

Lead-authored-by: Ivan Vergiliev 
Co-authored-by: Dongjoon Hyun 
Signed-off-by: Wenchen Fan 
(cherry picked from commit 096552ae4d6fcef5e20c54384a2687db41ba2fa1)
Signed-off-by: Wenchen Fan 
---
 .../datasources/orc/OrcDeserializer.scala  | 34 +++
 .../execution/datasources/ReadSchemaSuite.scala|  6 
 .../sql/execution/datasources/ReadSchemaTest.scala | 39 +-
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index 4ecc54b..decd5c5 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -37,28 +37,34 @@ class OrcDeserializer(
 
   private val resultRow = new 
SpecificInternalRow(requiredSchema.map(_.dataType))
 
+  // `fieldWriters(index)` is
+  // - null if the respective source column is missing, since the output value
+  //   is always null in this case
+  // - a function that updates target column `index` otherwise.
   private val fieldWriters: Array[WritableComparable[_] => Unit] = {
 requiredSchema.zipWithIndex
-  // The value of missing columns are always null, do not need writers.
-  .filterNot { case (_, index) => requestedColIds(index) == -1 }
   .map { case (f, index) =>
-val writer = newWriter(f.dataType, new RowUpdater(resultRow))
-(value: WritableComparable[_]) => writer(index, value)
+if (requestedColIds(index) == -1) {
+  null
+} else {
+  val writer = newWriter(f.dataType, new RowUpdater(resultRow))
+  (value: WritableComparable[_]) => writer(index, value)
+}
   }.toArray
   }
 
-  private val validColIds = requestedColIds.filterNot(_ == -1)
-
   def deserialize(orcStruct: OrcStruct): InternalRow = {
-var i = 0
-while (i < validColIds.length) {
-  val value = orcStruct.getFieldValue(validColIds(i))
-  if (value == null) {
-resultRow.setNullAt(i)
-  } else {
-fieldWriters(i)(value)
+var targetColumnIndex = 0
+while (targetColumnIndex < fieldWriters.length) {
+  if (fieldWriters(targetColumnIndex) != null) {
+val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
+if (value == null) {
+  resultRow.setNullAt(targetColumnIndex)
+} else {
+  fieldWriters(targetColumnIndex)(value)
+}
   }
-  i += 1
+  targetColumnIndex += 1
 }
 resultRow
   }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
index 23c58e1..de234c1 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
@@ -72,6 +72,7 @@ class HeaderCSVReadSchemaSuite
 
 class JsonReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with HideColumnInTheMiddleTest
   with ChangePositionTest
   with IntegralTypeTest
@@ -84,6 +85,7 @@ class 

[spark] branch master updated: [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

2019-02-20 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 096552a  [SPARK-26859][SQL] Fix field writer index bug in 
non-vectorized ORC deserializer
096552a is described below

commit 096552ae4d6fcef5e20c54384a2687db41ba2fa1
Author: Ivan Vergiliev 
AuthorDate: Wed Feb 20 21:49:38 2019 +0800

[SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC 
deserializer

## What changes were proposed in this pull request?

This happens in a schema evolution use case only when a user specifies the
schema manually and uses the non-vectorized ORC deserializer code path.

There is a bug in `OrcDeserializer.scala` that results in `null`s being set at
the wrong column position, and in state from previous records remaining
uncleared in subsequent records. There are more details on when exactly the bug
gets triggered and what the outcome is in the
[JIRA issue](https://jira.apache.org/jira/browse/SPARK-26859).

The high-level summary is that this bug results in severe data correctness
issues, but fortunately the set of conditions needed to expose the bug is
complicated and keeps the surface area somewhat small.

This change fixes the problem and adds a respective test.

## How was this patch tested?

Pass the Jenkins with the newly added test cases.

Closes #23766 from IvanVergiliev/fix-orc-deserializer.

Lead-authored-by: Ivan Vergiliev 
Co-authored-by: Dongjoon Hyun 
Signed-off-by: Wenchen Fan 
---
 .../datasources/orc/OrcDeserializer.scala  | 34 +++
 .../execution/datasources/ReadSchemaSuite.scala|  6 
 .../sql/execution/datasources/ReadSchemaTest.scala | 39 +-
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index ee16b3a..62e1670 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -37,28 +37,34 @@ class OrcDeserializer(
 
   private val resultRow = new 
SpecificInternalRow(requiredSchema.map(_.dataType))
 
+  // `fieldWriters(index)` is
+  // - null if the respective source column is missing, since the output value
+  //   is always null in this case
+  // - a function that updates target column `index` otherwise.
   private val fieldWriters: Array[WritableComparable[_] => Unit] = {
 requiredSchema.zipWithIndex
-  // The value of missing columns are always null, do not need writers.
-  .filterNot { case (_, index) => requestedColIds(index) == -1 }
   .map { case (f, index) =>
-val writer = newWriter(f.dataType, new RowUpdater(resultRow))
-(value: WritableComparable[_]) => writer(index, value)
+if (requestedColIds(index) == -1) {
+  null
+} else {
+  val writer = newWriter(f.dataType, new RowUpdater(resultRow))
+  (value: WritableComparable[_]) => writer(index, value)
+}
   }.toArray
   }
 
-  private val validColIds = requestedColIds.filterNot(_ == -1)
-
   def deserialize(orcStruct: OrcStruct): InternalRow = {
-var i = 0
-while (i < validColIds.length) {
-  val value = orcStruct.getFieldValue(validColIds(i))
-  if (value == null) {
-resultRow.setNullAt(i)
-  } else {
-fieldWriters(i)(value)
+var targetColumnIndex = 0
+while (targetColumnIndex < fieldWriters.length) {
+  if (fieldWriters(targetColumnIndex) != null) {
+val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
+if (value == null) {
+  resultRow.setNullAt(targetColumnIndex)
+} else {
+  fieldWriters(targetColumnIndex)(value)
+}
   }
-  i += 1
+  targetColumnIndex += 1
 }
 resultRow
   }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
index 23c58e1..de234c1 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
@@ -72,6 +72,7 @@ class HeaderCSVReadSchemaSuite
 
 class JsonReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with HideColumnInTheMiddleTest
   with ChangePositionTest
   with IntegralTypeTest
@@ -84,6 +85,7 @@ class JsonReadSchemaSuite
 
 class OrcReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with