[spark] 01/01: Preparing development version 2.4.2-SNAPSHOT
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 0926f49f407201384dd020073b41e7023465b7e9
Author: DB Tsai
AuthorDate: Thu Feb 21 00:46:07 2019 +

    Preparing development version 2.4.2-SNAPSHOT
---
 R/pkg/DESCRIPTION                                       | 2 +-
 assembly/pom.xml                                        | 2 +-
 common/kvstore/pom.xml                                  | 2 +-
 common/network-common/pom.xml                           | 2 +-
 common/network-shuffle/pom.xml                          | 2 +-
 common/network-yarn/pom.xml                             | 2 +-
 common/sketch/pom.xml                                   | 2 +-
 common/tags/pom.xml                                     | 2 +-
 common/unsafe/pom.xml                                   | 2 +-
 core/pom.xml                                            | 2 +-
 docs/_config.yml                                        | 4 ++--
 examples/pom.xml                                        | 2 +-
 external/avro/pom.xml                                   | 2 +-
 external/docker-integration-tests/pom.xml               | 2 +-
 external/flume-assembly/pom.xml                         | 2 +-
 external/flume-sink/pom.xml                             | 2 +-
 external/flume/pom.xml                                  | 2 +-
 external/kafka-0-10-assembly/pom.xml                    | 2 +-
 external/kafka-0-10-sql/pom.xml                         | 2 +-
 external/kafka-0-10/pom.xml                             | 2 +-
 external/kafka-0-8-assembly/pom.xml                     | 2 +-
 external/kafka-0-8/pom.xml                              | 2 +-
 external/kinesis-asl-assembly/pom.xml                   | 2 +-
 external/kinesis-asl/pom.xml                            | 2 +-
 external/spark-ganglia-lgpl/pom.xml                     | 2 +-
 graphx/pom.xml                                          | 2 +-
 hadoop-cloud/pom.xml                                    | 2 +-
 launcher/pom.xml                                        | 2 +-
 mllib-local/pom.xml                                     | 2 +-
 mllib/pom.xml                                           | 2 +-
 pom.xml                                                 | 2 +-
 python/pyspark/version.py                               | 2 +-
 repl/pom.xml                                            | 2 +-
 resource-managers/kubernetes/core/pom.xml               | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml  | 2 +-
 resource-managers/mesos/pom.xml                         | 2 +-
 resource-managers/yarn/pom.xml                          | 2 +-
 sql/catalyst/pom.xml                                    | 2 +-
 sql/core/pom.xml                                        | 2 +-
 sql/hive-thriftserver/pom.xml                           | 2 +-
 sql/hive/pom.xml                                        | 2 +-
 streaming/pom.xml                                       | 2 +-
 tools/pom.xml                                           | 2 +-
 43 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 714b6f1..2361289 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.4.1
+Version: 2.4.2
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 8e11fd6..cdf 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.1
+    2.4.2-SNAPSHOT
     ../pom.xml
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index f0eee07..092f85b 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.1
+    2.4.2-SNAPSHOT
     ../../pom.xml
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 8c8bdf4..5236fd6 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.1
+    2.4.2-SNAPSHOT
     ../../pom.xml
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 663f41d..b70dadf 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.1
+    2.4.2-SNAPSHOT
     ../../pom.xml
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index 9acade1..e9ae143 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.1
+    2.4.2-SNAPSHOT
     ../../pom.xml
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 1a31a39..2ae4fcb 100644
---
[spark] branch branch-2.4 updated (274142b -> 0926f49)
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a change to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git.

 from 274142b  [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer
  add 061185b  Preparing Spark release v2.4.1-rc3
  new 0926f49  Preparing development version 2.4.2-SNAPSHOT

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.

Summary of changes:
[spark] 01/01: Preparing Spark release v2.4.1-rc3
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a commit to tag v2.4.1-rc3
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 061185b9b872a672e3d58f8bbe819f8f70b33f91
Author: DB Tsai
AuthorDate: Thu Feb 21 00:45:49 2019 +

    Preparing Spark release v2.4.1-rc3
---
 R/pkg/DESCRIPTION                                       | 2 +-
 assembly/pom.xml                                        | 2 +-
 common/kvstore/pom.xml                                  | 2 +-
 common/network-common/pom.xml                           | 2 +-
 common/network-shuffle/pom.xml                          | 2 +-
 common/network-yarn/pom.xml                             | 2 +-
 common/sketch/pom.xml                                   | 2 +-
 common/tags/pom.xml                                     | 2 +-
 common/unsafe/pom.xml                                   | 2 +-
 core/pom.xml                                            | 2 +-
 docs/_config.yml                                        | 4 ++--
 examples/pom.xml                                        | 2 +-
 external/avro/pom.xml                                   | 2 +-
 external/docker-integration-tests/pom.xml               | 2 +-
 external/flume-assembly/pom.xml                         | 2 +-
 external/flume-sink/pom.xml                             | 2 +-
 external/flume/pom.xml                                  | 2 +-
 external/kafka-0-10-assembly/pom.xml                    | 2 +-
 external/kafka-0-10-sql/pom.xml                         | 2 +-
 external/kafka-0-10/pom.xml                             | 2 +-
 external/kafka-0-8-assembly/pom.xml                     | 2 +-
 external/kafka-0-8/pom.xml                              | 2 +-
 external/kinesis-asl-assembly/pom.xml                   | 2 +-
 external/kinesis-asl/pom.xml                            | 2 +-
 external/spark-ganglia-lgpl/pom.xml                     | 2 +-
 graphx/pom.xml                                          | 2 +-
 hadoop-cloud/pom.xml                                    | 2 +-
 launcher/pom.xml                                        | 2 +-
 mllib-local/pom.xml                                     | 2 +-
 mllib/pom.xml                                           | 2 +-
 pom.xml                                                 | 2 +-
 python/pyspark/version.py                               | 2 +-
 repl/pom.xml                                            | 2 +-
 resource-managers/kubernetes/core/pom.xml               | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml  | 2 +-
 resource-managers/mesos/pom.xml                         | 2 +-
 resource-managers/yarn/pom.xml                          | 2 +-
 sql/catalyst/pom.xml                                    | 2 +-
 sql/core/pom.xml                                        | 2 +-
 sql/hive-thriftserver/pom.xml                           | 2 +-
 sql/hive/pom.xml                                        | 2 +-
 streaming/pom.xml                                       | 2 +-
 tools/pom.xml                                           | 2 +-
 43 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 2361289..714b6f1 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.4.2
+Version: 2.4.1
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
diff --git a/assembly/pom.xml b/assembly/pom.xml
index cdf..8e11fd6 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.2-SNAPSHOT
+    2.4.1
     ../pom.xml
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 092f85b..f0eee07 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.2-SNAPSHOT
+    2.4.1
     ../../pom.xml
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 5236fd6..8c8bdf4 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.2-SNAPSHOT
+    2.4.1
     ../../pom.xml
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index b70dadf..663f41d 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.2-SNAPSHOT
+    2.4.1
     ../../pom.xml
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index e9ae143..9acade1 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
     org.apache.spark
     spark-parent_2.11
-    2.4.2-SNAPSHOT
+    2.4.1
     ../../pom.xml
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 2ae4fcb..1a31a39 100644
--- a/common/sketch/pom.xml
+++
[spark] tag v2.4.1-rc3 created (now 061185b)
This is an automated email from the ASF dual-hosted git repository.

dbtsai pushed a change to tag v2.4.1-rc3
in repository https://gitbox.apache.org/repos/asf/spark.git.

 at 061185b (commit)

This tag includes the following new commits:

  new 061185b  Preparing Spark release v2.4.1-rc3

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
svn commit: r32577 - /dev/spark/KEYS
Author: dbtsai
Date: Thu Feb 21 00:41:41 2019
New Revision: 32577

Log:
Update KEYS

Modified:
    dev/spark/KEYS

Modified: dev/spark/KEYS
==
--- dev/spark/KEYS (original)
+++ dev/spark/KEYS Thu Feb 21 00:41:41 2019
@@ -888,32 +888,30 @@
 pqnGj9s6Uudh/FXfVN5MC0/pH/ySSACkXwCmKXAh
 =4noL
 -END PGP PUBLIC KEY BLOCK-
-pub nistp521 2019-02-20 [SC]
- A728E15E5059CCC2356EEECFE48ADA6526DF721D
+pub nistp384 2019-02-21 [SC]
+ F8E155E845606E350C4147710359BC9965359766
 uid [ultimate] DB Tsai
-uid [ultimate] DB Tsai
 uid [ultimate] DB Tsai
+sub nistp384 2019-02-21 [E]
 -BEGIN PGP PUBLIC KEY BLOCK-
-mJMEXG3fOxMFK4EEACMEIwQA01FGjsvr1wzKjR0wYXk13KmHocwmskJX7DlUkj/k
-rc+GwJreGef+KffH7IbPNYMLAOr8AyBmMhYu6n90u5+7KKoAM364IXZUDM2nxyCD
-On7tsihd3zCpc6ZzRCsz2tiAzkR48fVF2N93PvLF/MajBv7NJ2T/TDYTA6mb0nQ4
-F7MFvKu0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPojWBBMTCgA7AhsDBQsJ
-CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt
-4VoCGQEACgkQ5IraZSbfch0wxAIInTKRniXwKzGFdsj40uxe5OhjbXLAe6p3IKU9
-MgohUkUsCzuDG8USloLW6VTb/Z08FpXhVQrr+fcUW50+uKgNRcICCQGYZE8vYcvk
-oyaIuetIVss2OmY7d3U+Vc+Bnjh3xVXg1U/m4ztmPO208hIzu8nb/svotZTNX8Xo
-7d6Wnq/FfX70V7QaREIgVHNhaSA8ZF90c2FpQGFwcGxlLmNvbT6I0wQTEwoAOBYh
-BKco4V5QWczCNW7uz+SK2mUm33IdBQJcbeIPAhsDBQsJCAcCBhUKCQgLAgQWAgMB
-Ah4BAheAAAoJEOSK2mUm33IdG+0CCQGfZwg2LSfeftBRas66DPC0IBMhKiyKqdRJ
-MLGLRx5kgzfm5WBaTC53GOyualmk7Qslz/e7W+OfR+zfD/ozO5At9AIHUN0rj+6h
-TENQyv+/9TfCWIySm63fbk1HewBVe99TgNNcnPSGKplOLEChqkc1L2cVq83XbWey
-wnGTe6TX9Fn/jhC0G0RCIFRzYWkgPGRidHNhaUBhcGFjaGUub3JnPojTBBMTCgA4
-FiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt4iYCGwMFCwkIBwIGFQoJCAsCBBYC
-AwECHgECF4AACgkQ5IraZSbfch23pwII8HqQ7hwLKqpVUnO7CrTv6dtpeRpNSnGy
-Z2pLCxbXPBFfC1OVGGlGooKCDr/Sj1u8zin8YQ+GVl4Jgd44FNPP9Y0CCQEEBFyN
-3U1t3ZXxmWpjiRcUAsXrEMyJFbohFwK+FU3/TZWpfouj0JrVbUEJh6BTo7mzVyOj
-oNNkPvWRX+SnhI8A2Q==
-=qGJE
+mG8EXG3xSBMFK4EEACIDAwTFUItdmiASsQ34SfIfDCGtSSqpGQHlSfDB801cJRSK
+tAxO51Xu3E6BSpSTcImHzstwxGj7rkSDSHhiwZG18314+ykKVNHSuIFDdYEUi2aR
+UQ0RWSiGhVS+Eg51v07Zc2q0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPoiz
+BBMTCQA7AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAFiEE+OFV6EVgbjUMQUdx
+A1m8mWU1l2YFAlxt8YECGQEACgkQA1m8mWU1l2Y+tgGAqzmo6XJZ9D0HxKGmvqu3
+HF/DHN2rEkutYfoGk8Wv3I3ALVVa8Qnh8zikO5y4hL4tAX90IL/NDXHoUR2sdsbJ
+tf+Eidnqc6BARrJiMXueH5KvlmtM3nmL8f28+ht8J09SPLm0G0RCIFRzYWkgPGRi
+dHNhaUBhcGFjaGUub3JnPoiwBBMTCQA4FiEE+OFV6EVgbjUMQUdxA1m8mWU1l2YF
+Alxt8ZcCGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AACgkQA1m8mWU1l2bZFwF/
+b46arpjADavN31useQj7cEPPjaPLgTsTKmG51lYZaDZlOXZLFVXKp/S++2IqO3C+
+AX46fL0wL+yPZ1XJ8hz0vkDIzUstC6AH6EjtmD0x/JLOei4wFB/ke631GUVkHrQZ
+lsS4cwRcbfFIEgUrgQQAIgMDBGKOAw+vHLvIS0OQ+U8M4360AFE/anh7M/wn1bFW
+50IdrOOW0Ss19Rib9UzBBW9FHU1udMGJmVOH4aBFRTXr2VOTPcswI1ArG7DoB/bs
+R2uwKt67grmq3MqQj5EJNKUzFwMBCQmImAQYEwkAIBYhBPjhVehFYG41DEFHcQNZ
+vJllNZdmBQJcbfFIAhsMAAoJEANZvJllNZdmUowBf3uHuFhO3IVozWsAnEfB8wdN
+vGQkKNb5uKWrMUAAkeUuY7+AeFDiZFxIcdfrx7B7tgF/c/CS4sgVUAIpDbw/gsfl
+UwauMsN1CWpCIaMEFZYNo//B+auqTseNXMtqqGaIgTXv
+=NYIb
 -END PGP PUBLIC KEY BLOCK-
[spark] branch master updated: [SPARK-26824][SS] Fix the checkpoint location and _spark_metadata when it contains special chars
This is an automated email from the ASF dual-hosted git repository.

zsxwing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 77b99af  [SPARK-26824][SS] Fix the checkpoint location and _spark_metadata when it contains special chars

77b99af is described below

commit 77b99af57330cf2e5016a6acc69642d54041b041
Author: Shixiong Zhu
AuthorDate: Wed Feb 20 15:44:20 2019 -0800

    [SPARK-26824][SS] Fix the checkpoint location and _spark_metadata when it contains special chars

    ## What changes were proposed in this pull request?

    When a user specifies a checkpoint location or a file sink output using a path containing special chars that need to be escaped in a path, the streaming query will store checkpoint and file sink metadata in a wrong place. In this PR, I uploaded a checkpoint that was generated by the following code using Spark 2.4.0 to show this issue:

    ```
    implicit val s = spark.sqlContext
    val input = org.apache.spark.sql.execution.streaming.MemoryStream[Int]
    input.addData(1, 2, 3)
    val q = input.toDF.writeStream.format("parquet").option("checkpointLocation", ".../chk %#chk").start(".../output %#output")
    q.stop()
    ```

    Here is the structure of the directory:

    ```
    sql/core/src/test/resources/structured-streaming/escaped-path-2.4.0
    ├── chk%252520%252525%252523chk
    │   ├── commits
    │   │   └── 0
    │   ├── metadata
    │   └── offsets
    │       └── 0
    ├── output %#output
    │   └── part-0-97f675a2-bb82-4201-8245-05f3dae4c372-c000.snappy.parquet
    └── output%20%25%23output
        └── _spark_metadata
            └── 0
    ```

    In this checkpoint, the user specified checkpoint location is `.../chk %#chk` but the real path to store the checkpoint is `.../chk%252520%252525%252523chk` (this is generated by escaping the original path three times). The user specified output path is `.../output %#output` but the path to store `_spark_metadata` is `.../output%20%25%23output/_spark_metadata` (this is generated by escaping the original path once). The data files are still in the correct path (such as `.../output %#ou [...]

    This checkpoint will be used in unit tests in this PR.

    The fix simply removes the improper `Path.toUri` calls. However, since a user may not read the release note and may not be aware of this checkpoint location change, if they upgrade Spark without moving the checkpoint to the new location, their query will just start from scratch. In order not to surprise users, this PR also adds a check to **detect the impacted paths and throw an error** that includes the migration guide. This check can be turned off by an internal sql conf `spark.sql.streaming.checkpoint.escapedPathCheck.enabled`. Here are ex [...]

    - Streaming checkpoint error:
    ```
    Error: we detected a possible problem with the location of your checkpoint and you
    likely need to move it before restarting this query.

    Earlier version of Spark incorrectly escaped paths when writing out checkpoints for
    structured streaming. While this was corrected in Spark 3.0, it appears that your query
    was started using an earlier version that incorrectly handled the checkpoint path.

    Correct Checkpoint Directory: /.../chk %#chk
    Incorrect Checkpoint Directory: /.../chk%252520%252525%252523chk

    Please move the data from the incorrect directory to the correct one, delete the
    incorrect directory, and then restart this query. If you believe you are receiving
    this message in error, you can disable it with the SQL conf
    spark.sql.streaming.checkpoint.escapedPathCheck.enabled.
    ```

    - File sink error (`_spark_metadata`):
    ```
    Error: we detected a possible problem with the location of your "_spark_metadata"
    directory and you likely need to move it before restarting this query.

    Earlier version of Spark incorrectly escaped paths when writing out the
    "_spark_metadata" directory for structured streaming. While this was corrected in
    Spark 3.0, it appears that your query was started using an earlier version that
    incorrectly handled the "_spark_metadata" path.

    Correct "_spark_metadata" Directory: /.../output %#output/_spark_metadata
    Incorrect "_spark_metadata" Directory: /.../output%20%25%23output/_spark_metadata

    Please move the data from the incorrect directory to the correct one, delete the
    incorrect directory, and then restart this query. If you believe you are receiving
    this message in error, you can disable it with the SQL conf
    spark.sql.streaming.checkpoint.escapedPathCheck.enabled.
    ```

    ## How was this patch tested?

    The new unit tests.

    Closes #23733 from zsxwing/path-fix.

    Authored-by: Shixiong Zhu
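For readers who want to see the escaping mechanism in isolation: the following standalone Scala sketch is illustrative only (plain JDK `java.net.URI`, not the Spark code path) and shows how each round trip through a URI constructor escapes the percent signs again, which is how a path like `chk %#chk` ends up as `chk%252520%252525%252523chk` after three passes.

```
import java.net.URI

object EscapeDemo {
  def main(args: Array[String]): Unit = {
    // The multi-argument URI constructor percent-encodes characters that are
    // illegal in a path component, and always quotes '%' itself.
    val once = new URI("file", null, "/tmp/chk %#chk", null).getRawPath
    println(once)   // /tmp/chk%20%25%23chk

    // Passing the already-escaped string through the constructor again
    // re-escapes the '%' signs: %20 -> %2520, %25 -> %2525, %23 -> %2523.
    val twice = new URI("file", null, once, null).getRawPath
    println(twice)  // /tmp/chk%2520%2525%2523chk

    val thrice = new URI("file", null, twice, null).getRawPath
    println(thrice) // /tmp/chk%252520%252525%252523chk
  }
}
```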
svn commit: r32573 - /dev/spark/KEYS
Author: dbtsai
Date: Wed Feb 20 23:32:09 2019
New Revision: 32573

Log:
Update KEYS

Modified:
    dev/spark/KEYS

Modified: dev/spark/KEYS
==
--- dev/spark/KEYS (original)
+++ dev/spark/KEYS Wed Feb 20 23:32:09 2019
@@ -887,3 +887,33 @@
 /G8Bpm/p4kbeqJtsx3t7nhPke7fG
 =4noL
 -END PGP PUBLIC KEY BLOCK-
+
+pub nistp521 2019-02-20 [SC]
+ A728E15E5059CCC2356EEECFE48ADA6526DF721D
+uid [ultimate] DB Tsai
+uid [ultimate] DB Tsai
+uid [ultimate] DB Tsai
+
+-BEGIN PGP PUBLIC KEY BLOCK-
+
+mJMEXG3fOxMFK4EEACMEIwQA01FGjsvr1wzKjR0wYXk13KmHocwmskJX7DlUkj/k
+rc+GwJreGef+KffH7IbPNYMLAOr8AyBmMhYu6n90u5+7KKoAM364IXZUDM2nxyCD
+On7tsihd3zCpc6ZzRCsz2tiAzkR48fVF2N93PvLF/MajBv7NJ2T/TDYTA6mb0nQ4
+F7MFvKu0G0RCIFRzYWkgPGRidHNhaUBkYnRzYWkuY29tPojWBBMTCgA7AhsDBQsJ
+CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt
+4VoCGQEACgkQ5IraZSbfch0wxAIInTKRniXwKzGFdsj40uxe5OhjbXLAe6p3IKU9
+MgohUkUsCzuDG8USloLW6VTb/Z08FpXhVQrr+fcUW50+uKgNRcICCQGYZE8vYcvk
+oyaIuetIVss2OmY7d3U+Vc+Bnjh3xVXg1U/m4ztmPO208hIzu8nb/svotZTNX8Xo
+7d6Wnq/FfX70V7QaREIgVHNhaSA8ZF90c2FpQGFwcGxlLmNvbT6I0wQTEwoAOBYh
+BKco4V5QWczCNW7uz+SK2mUm33IdBQJcbeIPAhsDBQsJCAcCBhUKCQgLAgQWAgMB
+Ah4BAheAAAoJEOSK2mUm33IdG+0CCQGfZwg2LSfeftBRas66DPC0IBMhKiyKqdRJ
+MLGLRx5kgzfm5WBaTC53GOyualmk7Qslz/e7W+OfR+zfD/ozO5At9AIHUN0rj+6h
+TENQyv+/9TfCWIySm63fbk1HewBVe99TgNNcnPSGKplOLEChqkc1L2cVq83XbWey
+wnGTe6TX9Fn/jhC0G0RCIFRzYWkgPGRidHNhaUBhcGFjaGUub3JnPojTBBMTCgA4
+FiEEpyjhXlBZzMI1bu7P5IraZSbfch0FAlxt4iYCGwMFCwkIBwIGFQoJCAsCBBYC
+AwECHgECF4AACgkQ5IraZSbfch23pwII8HqQ7hwLKqpVUnO7CrTv6dtpeRpNSnGy
+Z2pLCxbXPBFfC1OVGGlGooKCDr/Sj1u8zin8YQ+GVl4Jgd44FNPP9Y0CCQEEBFyN
+3U1t3ZXxmWpjiRcUAsXrEMyJFbohFwK+FU3/TZWpfouj0JrVbUEJh6BTo7mzVyOj
+oNNkPvWRX+SnhI8A2Q==
+=qGJE
+-END PGP PUBLIC KEY BLOCK-
[spark] branch master updated: [SPARK-26892][CORE] Fix saveAsTextFile throws NullPointerException when null row present
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 2153b31  [SPARK-26892][CORE] Fix saveAsTextFile throws NullPointerException when null row present

2153b31 is described below

commit 2153b316bda119ede8c80ceda522027a6581031b
Author: liupengcheng
AuthorDate: Wed Feb 20 16:42:55 2019 -0600

    [SPARK-26892][CORE] Fix saveAsTextFile throws NullPointerException when null row present

    ## What changes were proposed in this pull request?

    Currently, RDD.saveAsTextFile may throw a NullPointerException when a null row is present.

    ```
    scala> sc.parallelize(Seq(1,null),1).saveAsTextFile("/tmp/foobar.dat")
    19/02/15 21:39:17 ERROR Utils: Aborting task
    java.lang.NullPointerException
        at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$3(RDD.scala:1510)
        at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
        at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$executeTask$1(SparkHadoopWriter.scala:129)
        at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1352)
        at org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:127)
        at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
        at org.apache.spark.scheduler.Task.run(Task.scala:121)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:425)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1318)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:428)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
    ```

    This PR writes "Null" for null rows to avoid the NPE.

    ## How was this patch tested?

    NA

    Please review http://spark.apache.org/contributing.html before opening a pull request.

    Closes #23799 from liupc/Fix-saveAsTextFile-throws-NullPointerException-when-null-row-present.

    Lead-authored-by: liupengcheng
    Co-authored-by: Liupengcheng
    Signed-off-by: Sean Owen
---
 .../org/apache/spark/rdd/PairRDDFunctions.scala    |  2 +-
 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 32 +++---
 .../test/scala/org/apache/spark/FileSuite.scala    |  8 ++
 3 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 8b5f9bb..7f8064f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -1012,7 +1012,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       outputFormatClass: Class[_ <: OutputFormat[_, _]],
       codec: Class[_ <: CompressionCodec]): Unit = self.withScope {
     saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass,
-      new JobConf(self.context.hadoopConfiguration), Some(codec))
+      new JobConf(self.context.hadoopConfiguration), Option(codec))
   }

   /**
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index d39f418..1b67e99 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1492,45 +1492,21 @@ abstract class RDD[T: ClassTag](
    * Save this RDD as a text file, using string representations of elements.
    */
   def saveAsTextFile(path: String): Unit = withScope {
-    // https://issues.apache.org/jira/browse/SPARK-2075
-    //
-    // NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot find an implicit
-    // Ordering for it and will use the default `null`. However, it's a `Comparable[NullWritable]`
-    // in Hadoop 2.+, so the compiler will call the implicit `Ordering.ordered` method to create an
-    // Ordering for `NullWritable`. That's why the compiler will generate different anonymous
-    // classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
-    //
-    // Therefore, here we provide an explicit Ordering `null` to make sure the compiler generate
-    // same bytecodes for `saveAsTextFile`.
-    val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
-    val textClassTag = implicitly[ClassTag[Text]]
-    val r = this.mapPartitions { iter =>
-      val text = new Text()
-      iter.map { x =>
-        text.set(x.toString)
-        (NullWritable.get(), text)
-      }
-    }
-    RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
-
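For users on releases that do not carry this fix, a guard in user code avoids the same NPE. The sketch below is illustrative only; it is not the patch itself (the actual change to `saveAsTextFile` is in the diff above), and the output path is hypothetical.

```
import org.apache.spark.{SparkConf, SparkContext}

object SaveWithNullRows {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[1]").setAppName("save-null-rows"))
    try {
      sc.parallelize(Seq[Any](1, null), 1)
        // x.toString throws an NPE on null elements; convert defensively instead.
        .map(x => if (x == null) "null" else x.toString)
        .saveAsTextFile("/tmp/foobar-fixed.dat") // hypothetical path
    } finally {
      sc.stop()
    }
  }
}
```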
[spark] branch master updated: [SPARK-26877][YARN] Support user-level app staging directory in yarn mode when spark.yarn…
This is an automated email from the ASF dual-hosted git repository.

vanzin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new eb6fd7e  [SPARK-26877][YARN] Support user-level app staging directory in yarn mode when spark.yarn…

eb6fd7e is described below

commit eb6fd7eab77d3d5b2e7e827a21b127b146e5c089
Author: Liupengcheng
AuthorDate: Wed Feb 20 11:45:12 2019 -0800

    [SPARK-26877][YARN] Support user-level app staging directory in yarn mode when spark.yarn…

    Currently, when running applications in yarn mode, the app staging directory is controlled by the `spark.yarn.stagingDir` config if specified, and this directory does not separate different users, which can be inconvenient for file and quota management.

    Sometimes there can be an unexpected increase in staging files; two possible reasons are:
    1. The `spark.yarn.preserve.staging.files` setting can be misused by users
    2. A cron task constantly starting new applications on a non-existent yarn queue (wrong configuration)

    With a shared directory it is not easy to find out which user owns the most HDFS files or space, and it is impossible to set a per-user HDFS name or space quota to limit the growth. So I propose to add per-user subdirectories under this app staging directory, which is clearer.

    Tested with existing UT.

    Closes #23786 from liupc/Support-user-level-app-staging-dir.

    Authored-by: Liupengcheng
    Signed-off-by: Marcelo Vanzin
---
 .../yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 6ca81fb..e0dba8c 100644
--- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -177,7 +177,8 @@ private[spark] class Client(

     // The app staging dir based on the STAGING_DIR configuration if configured
     // otherwise based on the users home directory.
-    val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
+    val appStagingBaseDir = sparkConf.get(STAGING_DIR)
+      .map { new Path(_, UserGroupInformation.getCurrentUser.getShortUserName) }
       .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
     stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
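A minimal sketch of the path composition the one-line change introduces (the object name and example paths are illustrative, not Spark code): the configured staging root gains a per-user subdirectory derived from the short user name, with the user's home directory as the fallback when the config is unset.

```
import org.apache.hadoop.fs.Path
import org.apache.hadoop.security.UserGroupInformation

object StagingDirDemo {
  // <spark.yarn.stagingDir>/<short user name>, or the user's home dir if unset.
  def appStagingBaseDir(stagingDirConf: Option[String], homeDir: Path): Path =
    stagingDirConf
      .map(root => new Path(root, UserGroupInformation.getCurrentUser.getShortUserName))
      .getOrElse(homeDir)

  def main(args: Array[String]): Unit = {
    // e.g. /spark-staging/alice when the client runs as user "alice"
    println(appStagingBaseDir(Some("/spark-staging"), new Path("/user/alice")))
  }
}
```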
[spark] branch master updated: [SPARK-26900][SQL] Simplify truncation to quarter of year
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 331ac60  [SPARK-26900][SQL] Simplify truncation to quarter of year

331ac60 is described below

commit 331ac60f2818f191efb583ba4e73467f2a9d7d17
Author: Maxim Gekk
AuthorDate: Wed Feb 20 08:55:08 2019 -0600

    [SPARK-26900][SQL] Simplify truncation to quarter of year

    ## What changes were proposed in this pull request?

    In the PR, I propose to simplify timestamp truncation to quarter of year by using *java.time* API directly. The `LocalDate` instance can be truncated to a quarter timestamp via adjusting by the chrono field `IsoFields.DAY_OF_QUARTER`.

    ## How was this patch tested?

    This was checked by existing test suite - `DateTimeUtilsSuite`.

    Closes #23808 from MaxGekk/date-quarter-of-year.

    Authored-by: Maxim Gekk
    Signed-off-by: Sean Owen
---
 .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 14 +++---
 sql/core/benchmarks/DateTimeBenchmark-results.txt          |  6 +++---
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index b537695..28fb6a9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -719,17 +719,9 @@ object DateTimeUtils {
         daysToMillis(prevMonday, timeZone)
       case TRUNC_TO_QUARTER =>
         val dDays = millisToDays(millis, timeZone)
-        val month = getQuarter(dDays) match {
-          case 1 => Month.JANUARY
-          case 2 => Month.APRIL
-          case 3 => Month.JULY
-          case 4 => Month.OCTOBER
-        }
-        millis = daysToMillis(truncDate(dDays, TRUNC_TO_MONTH), timeZone)
-        val instant = Instant.ofEpochMilli(millis)
-        val localDateTime = LocalDateTime.ofInstant(instant, timeZone.toZoneId)
-        val truncated = localDateTime.withMonth(month.getValue)
-        truncated.atZone(timeZone.toZoneId).toInstant.toEpochMilli
+        val daysOfQuarter = LocalDate.ofEpochDay(dDays)
+          .`with`(IsoFields.DAY_OF_QUARTER, 1L).toEpochDay.toInt
+        daysToMillis(daysOfQuarter, timeZone)
       case _ =>
         // caller make sure that this should never be reached
         sys.error(s"Invalid trunc level: $level")
diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt
index 8bbe310..bc824af 100644
--- a/sql/core/benchmarks/DateTimeBenchmark-results.txt
+++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt
@@ -324,12 +324,12 @@ date_trunc WEEK:                  Best/Avg Time(ms)    Rate(M/s)   Per Ro
 date_trunc WEEK wholestage off                 794 /  797         12.6          79.4       1.0X
 date_trunc WEEK wholestage on                  754 /  761         13.3          75.4       1.1X

-Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-ea-b03 on Mac OS X 10.14.2
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.3
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz

 date_trunc QUARTER:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative

-date_trunc QUARTER wholestage off             5261 / 5271          1.9         526.1       1.0X
-date_trunc QUARTER wholestage on              5145 / 5151          1.9         514.5       1.0X
+date_trunc QUARTER wholestage off             1465 / 1467          6.8         146.5       1.0X
+date_trunc QUARTER wholestage on              1419 / 1423          7.0         141.9       1.0X

 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-ea-b03 on Mac OS X 10.14.2
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
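The `IsoFields` adjustment the patch relies on can be tried in isolation with plain `java.time` (a standalone sketch, not the Spark code): setting `DAY_OF_QUARTER` to 1 snaps any date back to the first day of its quarter.

```
import java.time.LocalDate
import java.time.temporal.IsoFields

object QuarterTruncDemo {
  def main(args: Array[String]): Unit = {
    val d = LocalDate.of(2019, 2, 20)
    // Adjusting DAY_OF_QUARTER to 1 moves the date to the first day of its quarter.
    val firstDayOfQuarter = d.`with`(IsoFields.DAY_OF_QUARTER, 1L)
    println(firstDayOfQuarter)            // 2019-01-01
    println(firstDayOfQuarter.toEpochDay) // day count since 1970-01-01, the unit daysToMillis consumes
  }
}
```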
[spark] branch master updated: [SPARK-22798][PYTHON][ML] Add multiple column support to PySpark StringIndexer
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 74e9e1c  [SPARK-22798][PYTHON][ML] Add multiple column support to PySpark StringIndexer

74e9e1c is described below

commit 74e9e1c192f00920b69aa47813e2ac2f4e9b4325
Author: Huaxin Gao
AuthorDate: Wed Feb 20 08:52:46 2019 -0600

    [SPARK-22798][PYTHON][ML] Add multiple column support to PySpark StringIndexer

    ## What changes were proposed in this pull request?

    Add multiple column support to PySpark StringIndexer

    ## How was this patch tested?

    Add doctest

    Closes #23741 from huaxingao/spark-22798.

    Authored-by: Huaxin Gao
    Signed-off-by: Sean Owen
---
 python/pyspark/ml/feature.py            | 60 -
 python/pyspark/ml/tests/test_wrapper.py |  8 -
 python/pyspark/ml/wrapper.py            | 22 ++--
 3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 23d56c8..0d1e9bd 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2290,7 +2290,8 @@ class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
         return self._call_java("mean")


-class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, HasOutputCol):
+class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, HasOutputCol,
+                           HasInputCols, HasOutputCols):
     """
     Params for :py:attr:`StringIndexer` and :py:attr:`StringIndexerModel`.
     """
@@ -2371,16 +2372,37 @@ class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLW
     >>> sorted(set([(i[0], i[1]) for i in result.select(result.id, result.indexed).collect()]),
     ...     key=lambda x: x[0])
     [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)]
+    >>> testData = sc.parallelize([Row(id=0, label1="a", label2="e"),
+    ...                            Row(id=1, label1="b", label2="f"),
+    ...                            Row(id=2, label1="c", label2="e"),
+    ...                            Row(id=3, label1="a", label2="f"),
+    ...                            Row(id=4, label1="a", label2="f"),
+    ...                            Row(id=5, label1="c", label2="f")], 3)
+    >>> multiRowDf = spark.createDataFrame(testData)
+    >>> inputs = ["label1", "label2"]
+    >>> outputs = ["index1", "index2"]
+    >>> stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)
+    >>> model = stringIndexer.fit(multiRowDf)
+    >>> result = model.transform(multiRowDf)
+    >>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,
+    ...     result.index2).collect()]), key=lambda x: x[0])
+    [(0, 0.0, 1.0), (1, 2.0, 0.0), (2, 1.0, 1.0), (3, 0.0, 0.0), (4, 0.0, 0.0), (5, 1.0, 0.0)]
+    >>> fromlabelsModel = StringIndexerModel.from_arrays_of_labels([["a", "b", "c"], ["e", "f"]],
+    ...     inputCols=inputs, outputCols=outputs)
+    >>> result = fromlabelsModel.transform(multiRowDf)
+    >>> sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,
+    ...     result.index2).collect()]), key=lambda x: x[0])
+    [(0, 0.0, 0.0), (1, 1.0, 1.0), (2, 2.0, 0.0), (3, 0.0, 1.0), (4, 0.0, 1.0), (5, 2.0, 1.0)]

     .. versionadded:: 1.4.0
     """

     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
-                 stringOrderType="frequencyDesc"):
+    def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
+                 handleInvalid="error", stringOrderType="frequencyDesc"):
         """
-        __init__(self, inputCol=None, outputCol=None, handleInvalid="error", \
-                 stringOrderType="frequencyDesc")
+        __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \
+                 handleInvalid="error", stringOrderType="frequencyDesc")
         """
         super(StringIndexer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)

@@ -2389,11 +2411,11 @@ class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLW

     @keyword_only
     @since("1.4.0")
-    def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
-                  stringOrderType="frequencyDesc"):
+    def setParams(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
+                  handleInvalid="error", stringOrderType="frequencyDesc"):
         """
-        setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \
-                  stringOrderType="frequencyDesc")
+        setParams(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None, \
+
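Since this change only exposes multi-column support that already exists on the Scala side, the equivalent Scala usage looks roughly like the sketch below. The column names are made up for illustration, and the `setInputCols`/`setOutputCols` setters are assumed from the Scala `StringIndexer` that this Python wrapper delegates to.

```
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession

object MultiColumnStringIndexerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
    import spark.implicits._

    val df = Seq((0, "a", "e"), (1, "b", "f"), (2, "c", "e")).toDF("id", "label1", "label2")

    // One estimator indexes several string columns in a single pass.
    val indexer = new StringIndexer()
      .setInputCols(Array("label1", "label2"))
      .setOutputCols(Array("index1", "index2"))

    indexer.fit(df).transform(df).show()
    spark.stop()
  }
}
```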
[spark] branch branch-2.4 updated: [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 274142b  [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

274142b is described below

commit 274142be08eb3a4239046d7f7260c7284ed041c2
Author: Ivan Vergiliev
AuthorDate: Wed Feb 20 21:49:38 2019 +0800

    [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

    ## What changes were proposed in this pull request?

    This happens in a schema evolution use case only when a user specifies the schema manually and uses the non-vectorized ORC deserializer code path. There is a bug in `OrcDeserializer.scala` that results in `null`s being set at the wrong column position, and in state from previous records remaining uncleared in subsequent records. There are more details on when exactly the bug gets triggered and what the outcome is in the [JIRA issue](https://jira.apache.org/jira/browse/SPARK-26859).

    The high-level summary is that this bug results in severe data correctness issues, but fortunately the set of conditions to expose the bug are complicated and make the surface area somewhat small.

    This change fixes the problem and adds a respective test.

    ## How was this patch tested?

    Pass the Jenkins with the newly added test cases.

    Closes #23766 from IvanVergiliev/fix-orc-deserializer.

    Lead-authored-by: Ivan Vergiliev
    Co-authored-by: Dongjoon Hyun
    Signed-off-by: Wenchen Fan
    (cherry picked from commit 096552ae4d6fcef5e20c54384a2687db41ba2fa1)
    Signed-off-by: Wenchen Fan
---
 .../datasources/orc/OrcDeserializer.scala          | 34 +++
 .../execution/datasources/ReadSchemaSuite.scala    |  6
 .../sql/execution/datasources/ReadSchemaTest.scala | 39 +-
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index 4ecc54b..decd5c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -37,28 +37,34 @@ class OrcDeserializer(

   private val resultRow = new SpecificInternalRow(requiredSchema.map(_.dataType))

+  // `fieldWriters(index)` is
+  // - null if the respective source column is missing, since the output value
+  //   is always null in this case
+  // - a function that updates target column `index` otherwise.
   private val fieldWriters: Array[WritableComparable[_] => Unit] = {
     requiredSchema.zipWithIndex
-      // The value of missing columns are always null, do not need writers.
-      .filterNot { case (_, index) => requestedColIds(index) == -1 }
       .map { case (f, index) =>
-        val writer = newWriter(f.dataType, new RowUpdater(resultRow))
-        (value: WritableComparable[_]) => writer(index, value)
+        if (requestedColIds(index) == -1) {
+          null
+        } else {
+          val writer = newWriter(f.dataType, new RowUpdater(resultRow))
+          (value: WritableComparable[_]) => writer(index, value)
+        }
       }.toArray
   }

-  private val validColIds = requestedColIds.filterNot(_ == -1)
-
   def deserialize(orcStruct: OrcStruct): InternalRow = {
-    var i = 0
-    while (i < validColIds.length) {
-      val value = orcStruct.getFieldValue(validColIds(i))
-      if (value == null) {
-        resultRow.setNullAt(i)
-      } else {
-        fieldWriters(i)(value)
+    var targetColumnIndex = 0
+    while (targetColumnIndex < fieldWriters.length) {
+      if (fieldWriters(targetColumnIndex) != null) {
+        val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
+        if (value == null) {
+          resultRow.setNullAt(targetColumnIndex)
+        } else {
+          fieldWriters(targetColumnIndex)(value)
+        }
       }
-      i += 1
+      targetColumnIndex += 1
     }
     resultRow
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
index 23c58e1..de234c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
@@ -72,6 +72,7 @@ class HeaderCSVReadSchemaSuite

 class JsonReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with HideColumnInTheMiddleTest
   with ChangePositionTest
   with IntegralTypeTest
@@ -84,6 +85,7 @@ class
[spark] branch master updated: [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 096552a  [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

096552a is described below

commit 096552ae4d6fcef5e20c54384a2687db41ba2fa1
Author: Ivan Vergiliev
AuthorDate: Wed Feb 20 21:49:38 2019 +0800

    [SPARK-26859][SQL] Fix field writer index bug in non-vectorized ORC deserializer

    ## What changes were proposed in this pull request?

    This happens in a schema evolution use case only when a user specifies the schema manually and uses the non-vectorized ORC deserializer code path. There is a bug in `OrcDeserializer.scala` that results in `null`s being set at the wrong column position, and in state from previous records remaining uncleared in subsequent records. There are more details on when exactly the bug gets triggered and what the outcome is in the [JIRA issue](https://jira.apache.org/jira/browse/SPARK-26859).

    The high-level summary is that this bug results in severe data correctness issues, but fortunately the set of conditions to expose the bug are complicated and make the surface area somewhat small.

    This change fixes the problem and adds a respective test.

    ## How was this patch tested?

    Pass the Jenkins with the newly added test cases.

    Closes #23766 from IvanVergiliev/fix-orc-deserializer.

    Lead-authored-by: Ivan Vergiliev
    Co-authored-by: Dongjoon Hyun
    Signed-off-by: Wenchen Fan
---
 .../datasources/orc/OrcDeserializer.scala          | 34 +++
 .../execution/datasources/ReadSchemaSuite.scala    |  6
 .../sql/execution/datasources/ReadSchemaTest.scala | 39 +-
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index ee16b3a..62e1670 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -37,28 +37,34 @@ class OrcDeserializer(

   private val resultRow = new SpecificInternalRow(requiredSchema.map(_.dataType))

+  // `fieldWriters(index)` is
+  // - null if the respective source column is missing, since the output value
+  //   is always null in this case
+  // - a function that updates target column `index` otherwise.
   private val fieldWriters: Array[WritableComparable[_] => Unit] = {
     requiredSchema.zipWithIndex
-      // The value of missing columns are always null, do not need writers.
-      .filterNot { case (_, index) => requestedColIds(index) == -1 }
      .map { case (f, index) =>
-        val writer = newWriter(f.dataType, new RowUpdater(resultRow))
-        (value: WritableComparable[_]) => writer(index, value)
+        if (requestedColIds(index) == -1) {
+          null
+        } else {
+          val writer = newWriter(f.dataType, new RowUpdater(resultRow))
+          (value: WritableComparable[_]) => writer(index, value)
+        }
      }.toArray
   }

-  private val validColIds = requestedColIds.filterNot(_ == -1)
-
   def deserialize(orcStruct: OrcStruct): InternalRow = {
-    var i = 0
-    while (i < validColIds.length) {
-      val value = orcStruct.getFieldValue(validColIds(i))
-      if (value == null) {
-        resultRow.setNullAt(i)
-      } else {
-        fieldWriters(i)(value)
+    var targetColumnIndex = 0
+    while (targetColumnIndex < fieldWriters.length) {
+      if (fieldWriters(targetColumnIndex) != null) {
+        val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
+        if (value == null) {
+          resultRow.setNullAt(targetColumnIndex)
+        } else {
+          fieldWriters(targetColumnIndex)(value)
+        }
       }
-      i += 1
+      targetColumnIndex += 1
     }
     resultRow
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
index 23c58e1..de234c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala
@@ -72,6 +72,7 @@ class HeaderCSVReadSchemaSuite

 class JsonReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with HideColumnInTheMiddleTest
   with ChangePositionTest
   with IntegralTypeTest
@@ -84,6 +85,7 @@ class JsonReadSchemaSuite

 class OrcReadSchemaSuite
   extends ReadSchemaSuite
+  with AddColumnIntoTheMiddleTest
   with
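To make the indexing scheme in this patch easier to follow, here is a tiny self-contained sketch (toy types, not the real `OrcDeserializer`): the writer array is indexed by target column position, entries for missing source columns stay `null`, and present columns are read via `requestedColIds(target)` rather than a compacted counter, which is exactly the alignment the old `filterNot`-based code lost.

```
object FieldWriterIndexDemo {
  def main(args: Array[String]): Unit = {
    // Target schema has 3 columns; column 1 is missing from the file (-1),
    // columns 0 and 2 map to source positions 0 and 1 respectively.
    val requestedColIds = Array(0, -1, 1)
    val sourceRow: Array[AnyRef] = Array("a", "b")            // values by source position
    val resultRow = new Array[AnyRef](requestedColIds.length) // values by target position

    // One slot per target column; null means "source column missing, leave output null".
    val fieldWriters: Array[AnyRef => Unit] =
      requestedColIds.zipWithIndex.map { case (srcId, target) =>
        val writer: AnyRef => Unit =
          if (srcId == -1) null else (value: AnyRef) => resultRow(target) = value
        writer
      }

    var target = 0
    while (target < fieldWriters.length) {
      if (fieldWriters(target) != null) {
        // Read by source id, write by target position.
        fieldWriters(target)(sourceRow(requestedColIds(target)))
      }
      target += 1
    }
    println(resultRow.mkString("[", ", ", "]")) // [a, null, b]
  }
}
```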