svn commit: r52092 - in /dev/spark/v3.2.1-rc2-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _site/api/java/org/apache/parqu
Author: huaxingao Date: Sat Jan 15 08:52:14 2022 New Revision: 52092 Log: Apache Spark v3.2.1-rc2 docs [This commit notification would consist of 2355 parts, which exceeds the limit of 50, so it was shortened to a summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37854][CORE] Replace type check with pattern matching in Spark code
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c7c51bc [SPARK-37854][CORE] Replace type check with pattern matching in Spark code c7c51bc is described below commit c7c51bcab5cb067d36bccf789e0e4ad7f37ffb7c Author: yangjie01 AuthorDate: Sat Jan 15 08:54:16 2022 -0600 [SPARK-37854][CORE] Replace type check with pattern matching in Spark code ### What changes were proposed in this pull request? There are many method use `isInstanceOf + asInstanceOf` for type conversion in Spark code now, the main change of this pr is replace `type check` with `pattern matching` for code simplification. ### Why are the changes needed? Code simplification ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GA Closes #35154 from LuciferYang/SPARK-37854. Authored-by: yangjie01 Signed-off-by: Sean Owen --- .../main/scala/org/apache/spark/TestUtils.scala| 36 ++-- .../main/scala/org/apache/spark/api/r/SerDe.scala | 12 ++-- .../spark/internal/config/ConfigBuilder.scala | 18 +++--- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 64 +++--- .../main/scala/org/apache/spark/rdd/PipedRDD.scala | 7 ++- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 8 ++- .../main/scala/org/apache/spark/util/Utils.scala | 38 ++--- .../storage/ShuffleBlockFetcherIteratorSuite.scala | 10 ++-- .../org/apache/spark/util/FileAppenderSuite.scala | 17 +++--- .../scala/org/apache/spark/util/UtilsSuite.scala | 19 --- .../apache/spark/examples/mllib/LDAExample.scala | 11 ++-- .../spark/mllib/api/python/PythonMLLibAPI.scala| 12 ++-- .../expressions/aggregate/Percentile.scala | 14 ++--- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 7 +-- .../sql/catalyst/encoders/RowEncoderSuite.scala| 11 ++-- .../sql/execution/columnar/ColumnAccessor.scala| 10 ++-- 
.../spark/sql/execution/columnar/ColumnType.scala | 50 + .../sql/execution/datasources/FileScanRDD.scala| 19 --- .../org/apache/spark/sql/jdbc/H2Dialect.scala | 30 +- .../spark/sql/SparkSessionExtensionSuite.scala | 57 +-- .../sql/execution/joins/BroadcastJoinSuite.scala | 13 ++--- .../apache/spark/sql/streaming/StreamTest.scala| 6 +- .../sql/hive/client/IsolatedClientLoader.scala | 12 ++-- .../spark/streaming/scheduler/JobGenerator.scala | 10 ++-- .../org/apache/spark/streaming/util/StateMap.scala | 21 +++ 25 files changed, 263 insertions(+), 249 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 20159af..d2af955 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -337,22 +337,26 @@ private[spark] object TestUtils { connection.setRequestMethod(method) headers.foreach { case (k, v) => connection.setRequestProperty(k, v) } -// Disable cert and host name validation for HTTPS tests. -if (connection.isInstanceOf[HttpsURLConnection]) { - val sslCtx = SSLContext.getInstance("SSL") - val trustManager = new X509TrustManager { -override def getAcceptedIssuers(): Array[X509Certificate] = null -override def checkClientTrusted(x509Certificates: Array[X509Certificate], -s: String): Unit = {} -override def checkServerTrusted(x509Certificates: Array[X509Certificate], -s: String): Unit = {} - } - val verifier = new HostnameVerifier() { -override def verify(hostname: String, session: SSLSession): Boolean = true - } - sslCtx.init(null, Array(trustManager), new SecureRandom()) - connection.asInstanceOf[HttpsURLConnection].setSSLSocketFactory(sslCtx.getSocketFactory()) - connection.asInstanceOf[HttpsURLConnection].setHostnameVerifier(verifier) +connection match { + // Disable cert and host name validation for HTTPS tests. 
+ case httpConnection: HttpsURLConnection => +val sslCtx = SSLContext.getInstance("SSL") +val trustManager = new X509TrustManager { + override def getAcceptedIssuers: Array[X509Certificate] = null + + override def checkClientTrusted(x509Certificates: Array[X509Certificate], + s: String): Unit = {} + + override def checkServerTrusted(x509Certificates: Array[X509Certificate], + s: String): Unit = {} +} +val verifier = new HostnameVerifier() { + override def verify(hostname: String, session: SSLSession): Boolean = true +} +sslCtx.init(null, Array(trustManager), new SecureRandom()) +httpConnection
[spark] branch master updated: [SPARK-37862][SQL] RecordBinaryComparator should fast skip the check of aligning with unaligned platform
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 8ae9707 [SPARK-37862][SQL] RecordBinaryComparator should fast skip the check of aligning with unaligned platform 8ae9707 is described below commit 8ae970790814a0080713857261a3b1c2e2b01dd7 Author: ulysses-you AuthorDate: Sat Jan 15 08:59:56 2022 -0600 [SPARK-37862][SQL] RecordBinaryComparator should fast skip the check of aligning with unaligned platform ### What changes were proposed in this pull request? `RecordBinaryComparator` compare the entire row, so it need to check if the platform is unaligned. #35078 had given the perf number to show the benefits. So this PR aims to do the same thing that fast skip the check of aligning with unaligned platform. ### Why are the changes needed? Improve the performance. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Pass CI. And the perf number should be same with #35078 Closes #35161 from ulysses-you/unaligned. 
Authored-by: ulysses-you Signed-off-by: Sean Owen --- .../java/org/apache/spark/sql/execution/RecordBinaryComparator.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java b/sql/core/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java index 1f24340..e91873a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java @@ -24,6 +24,7 @@ import java.nio.ByteOrder; public final class RecordBinaryComparator extends RecordComparator { + private static final boolean UNALIGNED = Platform.unaligned(); private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); @@ -41,7 +42,7 @@ public final class RecordBinaryComparator extends RecordComparator { // we have guaranteed `leftLen` == `rightLen`. // check if stars align and we can get both offsets to be aligned -if ((leftOff % 8) == (rightOff % 8)) { +if (!UNALIGNED && ((leftOff % 8) == (rightOff % 8))) { while ((leftOff + i) % 8 != 0 && i < leftLen) { final int v1 = Platform.getByte(leftObj, leftOff + i); final int v2 = Platform.getByte(rightObj, rightOff + i); @@ -52,7 +53,7 @@ public final class RecordBinaryComparator extends RecordComparator { } } // for architectures that support unaligned accesses, chew it up 8 bytes at a time -if (Platform.unaligned() || (((leftOff + i) % 8 == 0) && ((rightOff + i) % 8 == 0))) { +if (UNALIGNED || (((leftOff + i) % 8 == 0) && ((rightOff + i) % 8 == 0))) { while (i <= leftLen - 8) { long v1 = Platform.getLong(leftObj, leftOff + i); long v2 = Platform.getLong(rightObj, rightOff + i); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37876][CORE][SQL] Move `SpecificParquetRecordReaderBase.listDirectory` to `TestUtils`
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7614472 [SPARK-37876][CORE][SQL] Move `SpecificParquetRecordReaderBase.listDirectory` to `TestUtils` 7614472 is described below commit 7614472950cb57ffefa0a51dd1163103c5d42df6 Author: yangjie01 AuthorDate: Sat Jan 15 09:01:55 2022 -0600 [SPARK-37876][CORE][SQL] Move `SpecificParquetRecordReaderBase.listDirectory` to `TestUtils` ### What changes were proposed in this pull request? `SpecificParquetRecordReaderBase.listDirectory` is used to return the list of files at `path` recursively and the result will skips files that are ignored normally by MapReduce. This method is only used by tests in Spark now and the tests also includes non-parquet test scenario, such as `OrcColumnarBatchReaderSuite`. So this pr move this method from `SpecificParquetRecordReaderBase` to `TestUtils` to make it as a test method. ### Why are the changes needed? Refactoring: move test method to `TestUtils`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GA Closes #35177 from LuciferYang/list-directory. 
Authored-by: yangjie01 Signed-off-by: Sean Owen --- .../src/main/scala/org/apache/spark/TestUtils.scala | 15 +++ .../parquet/SpecificParquetRecordReaderBase.java| 21 - .../benchmark/DataSourceReadBenchmark.scala | 11 ++- .../orc/OrcColumnarBatchReaderSuite.scala | 4 ++-- .../datasources/parquet/ParquetEncodingSuite.scala | 11 ++- .../datasources/parquet/ParquetIOSuite.scala| 6 +++--- .../execution/datasources/parquet/ParquetTest.scala | 3 ++- .../spark/sql/test/DataFrameReaderWriterSuite.scala | 5 ++--- 8 files changed, 36 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index d2af955..505b3ab 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -446,6 +446,21 @@ private[spark] object TestUtils { current ++ current.filter(_.isDirectory).flatMap(recursiveList) } + /** + * Returns the list of files at 'path' recursively. This skips files that are ignored normally + * by MapReduce. + */ + def listDirectory(path: File): Array[String] = { +val result = ArrayBuffer.empty[String] +if (path.isDirectory) { + path.listFiles.foreach(f => result.appendAll(listDirectory(f))) +} else { + val c = path.getName.charAt(0) + if (c != '.' && c != '_') result.append(path.getAbsolutePath) +} +result.toArray + } + /** Creates a temp JSON file that contains the input JSON record. 
*/ def createTempJsonFile(dir: File, prefix: String, jsonValue: JValue): String = { val file = File.createTempFile(prefix, ".json", dir) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index e1a0607..07e35c1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -19,10 +19,8 @@ package org.apache.spark.sql.execution.datasources.parquet; import java.io.Closeable; -import java.io.File; import java.io.IOException; import java.lang.reflect.InvocationTargetException; -import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -122,25 +120,6 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReader listDirectory(File path) { -List result = new ArrayList<>(); -if (path.isDirectory()) { - for (File f: path.listFiles()) { -result.addAll(listDirectory(f)); - } -} else { - char c = path.getName().charAt(0); - if (c != '.' && c != '_') { -result.add(path.getAbsolutePath()); - } -} -return result; - } - - /** * Initializes the reader to read the file at `path` with `columns` projected. If columns is * null, all the columns are projected. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index 31cee48..5094cdf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ben
[spark] branch master updated: [SPARK-37920][BUILD] Remove tab character and trailing space in pom.xml
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 482439f [SPARK-37920][BUILD] Remove tab character and trailing space in pom.xml 482439f is described below commit 482439ff4620be9d30b36aa32a26722be9f4a30e Author: stczwd AuthorDate: Sat Jan 15 15:58:56 2022 -0800 [SPARK-37920][BUILD] Remove tab character and trailing space in pom.xml ### Why are the changes needed? There are some tabs in pom.xml, which don't seem to be standardized. This pr tries to modify this problem. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? origin tests Closes #35218 from stczwd/SPARK-37920. Authored-by: stczwd Signed-off-by: Dongjoon Hyun --- pom.xml | 70 - 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 61d576c..07a8861 100644 --- a/pom.xml +++ b/pom.xml @@ -2756,39 +2756,39 @@ - - org.codehaus.mojo - build-helper-maven-plugin - 3.2.0 - - - module-timestamp-property - validate - - timestamp-property - - - module.build.timestamp - ${maven.build.timestamp.format} - current - America/Los_Angeles - - - - local-timestamp-property - validate - - timestamp-property - - - local.build.timestamp - ${maven.build.timestamp.format} - build - America/Los_Angeles - - - - + + org.codehaus.mojo + build-helper-maven-plugin + 3.2.0 + + + module-timestamp-property + validate + +timestamp-property + + +module.build.timestamp +${maven.build.timestamp.format} +current +America/Los_Angeles + + + + local-timestamp-property + validate + +timestamp-property + + +local.build.timestamp +${maven.build.timestamp.format} +build +America/Los_Angeles + + + + net.alchim31.maven scala-maven-plugin @@ -3564,9 +3564,9 @@ scala-2.12 - 2.12.15 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37921][TESTS] Update OrcReadBenchmark to use Hive ORC reader as the basis
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4c59a83 [SPARK-37921][TESTS] Update OrcReadBenchmark to use Hive ORC reader as the basis 4c59a83 is described below commit 4c59a830a6a235400d0184fb6ce24c9e054d3e4b Author: William Hyun AuthorDate: Sat Jan 15 21:52:31 2022 -0800 [SPARK-37921][TESTS] Update OrcReadBenchmark to use Hive ORC reader as the basis ### What changes were proposed in this pull request? This PR aims to update `OrcReadBenchmark` to use Hive ORC reader as the basis for comparison. ### Why are the changes needed? This will improve the visibility of native ORC reader's improvement because currently the new improvements are shown as `1.0x`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually review. Closes #35219 from williamhyun/benchmark. 
Authored-by: William Hyun Signed-off-by: Dongjoon Hyun --- .../benchmarks/OrcReadBenchmark-jdk11-results.txt | 188 - .../benchmarks/OrcReadBenchmark-jdk17-results.txt | 188 - sql/hive/benchmarks/OrcReadBenchmark-results.txt | 232 ++--- .../spark/sql/hive/orc/OrcReadBenchmark.scala | 74 +++ 4 files changed, 341 insertions(+), 341 deletions(-) diff --git a/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt index 3f9e63f..f9ab5dd 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt @@ -2,221 +2,221 @@ SQL Single Numeric Column Scan -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative -Native ORC MR 1064 1070 9 14.8 67.6 1.0X -Native ORC Vectorized 237326 73 66.3 15.1 4.5X -Hive built-in ORC 1232 1330 139 12.8 78.3 0.9X +Hive built-in ORC 1137 1138 1 13.8 72.3 1.0X +Native ORC MR 962982 17 16.3 61.2 1.2X +Native ORC Vectorized 225298 65 69.9 14.3 5.1X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative -Native ORC MR 947 1056 155 16.6 60.2 1.0X -Native ORC Vectorized 232311 56 67.7 14.8 4.1X -Hive built-in ORC 1317 1330 19 11.9 83.7 0.7X +Hive built-in ORC 1250 1253 4 12.6 79.5 1.0X +Native ORC MR 1038 1135 136 15.1 66.0 1.2X +Native ORC Vectorized 232307 47 67.9 14.7 5.4X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best 
Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative -Native ORC MR 964 1070 150 16.3 61.3 1.0X -Native ORC Vectorized 275304 32 57.2 17.5 3.5X -Hive built-in ORC