svn commit: r28120 - in /dev/spark/2.3.3-SNAPSHOT-2018_07_14_22_01-f9a2b0a-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sun Jul 15 05:15:27 2018 New Revision: 28120 Log: Apache Spark 2.3.3-SNAPSHOT-2018_07_14_22_01-f9a2b0a docs [This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-24807][CORE] Adding files/jars twice: output a warning and add a note
Repository: spark Updated Branches: refs/heads/master 3e7dc8296 -> 69993217f

[SPARK-24807][CORE] Adding files/jars twice: output a warning and add a note

## What changes were proposed in this pull request?

In this PR, I propose to output a warning if the `addFile()` or `addJar()` methods are called more than once for the same path. Currently, overwriting of already added files is not supported. The new comments and warning reflect the existing behaviour.

Author: Maxim Gekk

Closes #21771 from MaxGekk/warning-on-adding-file.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69993217 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69993217 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69993217 Branch: refs/heads/master Commit: 69993217fc4f5e5e41a297702389e86fe534dc2f Parents: 3e7dc82 Author: Maxim Gekk Authored: Sat Jul 14 22:07:49 2018 -0700 Committer: Xiao Li Committed: Sat Jul 14 22:07:49 2018 -0700

-- R/pkg/R/context.R | 2 ++ core/src/main/scala/org/apache/spark/SparkContext.scala | 12 .../org/apache/spark/api/java/JavaSparkContext.scala| 6 ++ python/pyspark/context.py | 4 4 files changed, 24 insertions(+) --

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 8ec727d..3e996a5 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -305,6 +305,8 @@ setCheckpointDirSC <- function(sc, dirName) { #' Currently directories are only supported for Hadoop-supported filesystems. #' Refer Hadoop-supported filesystems at \url{https://wiki.apache.org/hadoop/HCFS}. #' +#' Note: A path can be added only once. Subsequent additions of the same path are ignored. +#' #' @rdname spark.addFile #' @param path The path of the file to be added #' @param recursive Whether to add files recursively from the path. Default is FALSE.

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 74bfb5d..531384a 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1496,6 +1496,8 @@ class SparkContext(config: SparkConf) extends Logging { * @param path can be either a local file, a file in HDFS (or other Hadoop-supported * filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, * use `SparkFiles.get(fileName)` to find its download location. + * + * @note A path can be added only once. Subsequent additions of the same path are ignored. */ def addFile(path: String): Unit = { addFile(path, false) @@ -1516,6 +1518,8 @@ class SparkContext(config: SparkConf) extends Logging { * use `SparkFiles.get(fileName)` to find its download location. * @param recursive if true, a directory can be given in `path`. Currently directories are * only supported for Hadoop-supported filesystems. + * + * @note A path can be added only once. Subsequent additions of the same path are ignored.
*/ def addFile(path: String, recursive: Boolean): Unit = { val uri = new Path(path).toUri @@ -1555,6 +1559,9 @@ class SparkContext(config: SparkConf) extends Logging { Utils.fetchFile(uri.toString, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConfiguration, timestamp, useCache = false) postEnvironmentUpdate() +} else { + logWarning(s"The path $path has been added already. Overwriting of added paths " + + "is not supported in the current version.") } } @@ -1803,6 +1810,8 @@ class SparkContext(config: SparkConf) extends Logging { * * @param path can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), * an HTTP, HTTPS or FTP URI, or local:/path for a file on every worker node. + * + * @note A path can be added only once. Subsequent additions of the same path are ignored. */ def addJar(path: String) { def addJarFile(file: File): String = { @@ -1849,6 +1858,9 @@ class SparkContext(config: SparkConf) extends Logging { if (addedJars.putIfAbsent(key, timestamp).isEmpty) { logInfo(s"Added JAR $path at $key with timestamp $timestamp") postEnvironmentUpdate() +} else { + logWarning(s"The jar $path has been added already. Overwriting of added jars " + +"is not supported in
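To make the documented behaviour concrete, here is a minimal sketch (local master; the file path is hypothetical) of what the patch describes: the second `addFile()` call with the same path is a no-op and, with this change, logs the warning shown above.

```scala
import org.apache.spark.{SparkConf, SparkContext, SparkFiles}

object AddFileTwiceDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("add-file-twice")
    val sc = new SparkContext(conf)

    val path = "/tmp/lookup.txt"  // hypothetical file assumed to exist locally
    sc.addFile(path)              // first call: registers the file for distribution
    sc.addFile(path)              // second call is ignored; after this patch it logs
                                  // "The path /tmp/lookup.txt has been added already. ..."

    // Tasks resolve the distributed copy through SparkFiles, as the docs note:
    sc.parallelize(1 to 2).foreach(_ => println(SparkFiles.get("lookup.txt")))

    sc.stop()
  }
}
```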
spark git commit: [SPARK-24776][SQL] Avro unit test: deduplicate code and replace deprecated methods
Repository: spark Updated Branches: refs/heads/master 43e4e851b -> 3e7dc8296

[SPARK-24776][SQL] Avro unit test: deduplicate code and replace deprecated methods

## What changes were proposed in this pull request?

Improve the Avro unit tests:
1. Use QueryTest/SharedSQLContext/SQLTestUtils instead of the duplicated test utils.
2. Replace deprecated methods.

This is a follow-up PR for #21760; that PR passed the pull request tests but failed in: https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Compile/job/spark-master-compile-maven-hadoop-2.6/7842/ This PR fixes it.

## How was this patch tested?

Unit tests. Compiled with different commands:
```
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-2.6 -Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn compile test-compile
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-2.7 -Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn compile test-compile
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-3.1 -Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn compile test-compile
```

Author: Gengliang Wang

Closes #21768 from gengliangwang/improve_avro_test.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e7dc829 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e7dc829 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e7dc829 Branch: refs/heads/master Commit: 3e7dc82960fd3339eee16d83df66761ae6e3fe3d Parents: 43e4e85 Author: Gengliang Wang Authored: Sat Jul 14 21:36:56 2018 -0700 Committer: Xiao Li Committed: Sat Jul 14 21:36:56 2018 -0700

-- .../org/apache/spark/sql/avro/AvroSuite.scala | 98 ++-- .../org/apache/spark/sql/avro/TestUtils.scala | 156 --- 2 files changed, 45 insertions(+), 209 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/3e7dc829/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala -- diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index c6c1e40..4f94d82 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -31,32 +31,24 @@ import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord} import org.apache.avro.generic.GenericData.{EnumSymbol, Fixed} import org.apache.commons.io.FileUtils -import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.avro.SchemaConverters.IncompatibleSchemaException +import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.types._ -class AvroSuite extends SparkFunSuite { +class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val episodesFile = "src/test/resources/episodes.avro" val testFile = "src/test/resources/test.avro" - private var spark: SparkSession = _ - override protected def beforeAll(): Unit = { super.beforeAll() -spark = SparkSession.builder() - .master("local[2]") - .appName("AvroSuite") - .config("spark.sql.files.maxPartitionBytes", 1024) - .getOrCreate() - } - - override protected def afterAll(): Unit = { -try { - spark.sparkContext.stop() -} finally { - super.afterAll() -} +spark.conf.set("spark.sql.files.maxPartitionBytes", 1024) + } + + def checkReloadMatchesSaved(originalFile: String, newFile: String): Unit = { +val originalEntries = spark.read.avro(testFile).collect() +val newEntries = spark.read.avro(newFile) +checkAnswer(newEntries, originalEntries) } test("reading from multiple paths") { @@ -68,7 +60,7 @@ class AvroSuite extends SparkFunSuite { val df = spark.read.avro(episodesFile) val fields = List("title", "air_date", "doctor") for (field <- fields) { - TestUtils.withTempDir { dir => + withTempPath { dir => val outputDir = s"$dir/${UUID.randomUUID}" df.write.partitionBy(field).avro(outputDir) val input = spark.read.avro(outputDir) @@ -82,12 +74,12 @@ class AvroSuite extends SparkFunSuite { test("request no fields") { val df = spark.read.avro(episodesFile) -df.registerTempTable("avro_table") +df.createOrReplaceTempView("avro_table") assert(spark.sql("select count(*) from avro_table").collect().head === Row(8)) } test("convert formats") { -TestUtils.withTempDir { dir => +withTempPath { dir => val df = spark.read.avro(episodesFile) df.write.parquet(dir.getCanonicalPath)
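In practice the refactored suite follows the pattern sketched below. The class and data here are placeholders rather than the real Avro fixtures (running it also assumes the spark-sql test jar on the classpath), but the mixins and helpers — `QueryTest`, `SharedSQLContext`, `SQLTestUtils`, `withTempPath`, `checkAnswer`, `createOrReplaceTempView` — are the ones the diff adopts.

```scala
import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}

// Placeholder suite illustrating the refactored pattern; not part of the commit.
class RoundTripSuite extends QueryTest with SharedSQLContext with SQLTestUtils {

  test("round-trip through a temp path") {
    val df = spark.range(10).toDF("id")
    withTempPath { dir =>  // replaces the hand-rolled TestUtils.withTempDir
      df.write.parquet(dir.getCanonicalPath)
      // checkAnswer comes from QueryTest and compares full result sets.
      checkAnswer(spark.read.parquet(dir.getCanonicalPath), df)
    }
  }

  test("count through a temp view") {
    // createOrReplaceTempView replaces the deprecated registerTempTable.
    spark.range(8).toDF("id").createOrReplaceTempView("t")
    checkAnswer(spark.sql("SELECT count(*) FROM t"), Row(8L))
  }
}
```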
spark git commit: [SPARK-24718][SQL] Timestamp support pushdown to parquet data source
Repository: spark Updated Branches: refs/heads/master 8aceb961c -> 43e4e851b

[SPARK-24718][SQL] Timestamp support pushdown to parquet data source

## What changes were proposed in this pull request?

Support pushing down `Timestamp` filters to the Parquet data source. Only the `TIMESTAMP_MICROS` and `TIMESTAMP_MILLIS` types support push-down.

## How was this patch tested?

Unit tests and benchmark tests.

Author: Yuming Wang

Closes #21741 from wangyum/SPARK-24718.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43e4e851 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43e4e851 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43e4e851 Branch: refs/heads/master Commit: 43e4e851b642bbee535d22e1b9e72ec6b99f6ed4 Parents: 8aceb96 Author: Yuming Wang Authored: Sun Jul 15 11:13:49 2018 +0800 Committer: hyukjinkwon Committed: Sun Jul 15 11:13:49 2018 +0800

-- .../org/apache/spark/sql/internal/SQLConf.scala | 11 ++ .../FilterPushdownBenchmark-results.txt | 124 +++ .../datasources/parquet/ParquetFileFormat.scala | 3 +- .../datasources/parquet/ParquetFilters.scala| 59 - .../benchmark/FilterPushdownBenchmark.scala | 37 +- .../parquet/ParquetFilterSuite.scala| 74 ++- 6 files changed, 301 insertions(+), 7 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/43e4e851/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 699e939..07d33fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -378,6 +378,15 @@ object SQLConf { .booleanConf .createWithDefault(true) + val PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED = +buildConf("spark.sql.parquet.filterPushdown.timestamp") + .doc("If true, enables Parquet filter push-down optimization for Timestamp. " + +"This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is " + +"enabled and Timestamp stored as TIMESTAMP_MICROS or TIMESTAMP_MILLIS type.") +.internal() +.booleanConf +.createWithDefault(true) + val PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED = buildConf("spark.sql.parquet.filterPushdown.string.startsWith") .doc("If true, enables Parquet filter push-down optimization for string startsWith function. 
" + @@ -1494,6 +1503,8 @@ class SQLConf extends Serializable with Logging { def parquetFilterPushDownDate: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DATE_ENABLED) + def parquetFilterPushDownTimestamp: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED) + def parquetFilterPushDownStringStartWith: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED) http://git-wip-us.apache.org/repos/asf/spark/blob/43e4e851/sql/core/benchmarks/FilterPushdownBenchmark-results.txt -- diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index c44908b..4f38cc4 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -578,3 +578,127 @@ Native ORC Vectorized 11622 / 12196 1.4 7 Native ORC Vectorized (Pushdown)11377 / 11654 1.4 723.3 1.0X + +Pushdown benchmark for Timestamp + + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 timestamp stored as INT96 row (value = CAST(7864320 AS timestamp)): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Parquet Vectorized4784 / 4956 3.3 304.2 1.0X +Parquet Vectorized (Pushdown) 4838 / 4917 3.3 307.6 1.0X +Native ORC Vectorized 3923 / 4173 4.0 249.4 1.2X +Native ORC Vectorized (Pushdown) 894 / 943 17.6 56.8 5.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10%
svn commit: r28118 - /dev/spark/v2.3.2-rc3-bin/
Author: jshao Date: Sun Jul 15 03:04:30 2018 New Revision: 28118 Log: Apache Spark v2.3.2-rc3 Added: dev/spark/v2.3.2-rc3-bin/ dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz (with props) dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz (with props) dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz (with props) dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz.asc dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz.sha512 dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz (with props) dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz.asc dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz.sha512 dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz (with props) dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz.asc dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz.sha512 dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz (with props) dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz.asc dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz.sha512 Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz == Binary file - no diff available. Propchange: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz -- svn:mime-type = application/octet-stream Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc == --- dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc (added) +++ dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc Sun Jul 15 03:04:30 2018 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIcBAABCgAGBQJbSrCkAAoJENsLIaASlz/QxHIP/2+V6MiBci4mepIYNEA5M4A8 +n5hRYXbPDkK6i/tPlCtvdeW8XkcSJejzk+lPJdjQCxfqOsWCiGal42siV7XY/x96 ++08b2XXzLOZ65RHSMJdE3M+qXgEs+kthCSg2Q/mZcssu83BqwNah0JTUIKi3oSmz +11EnY1Pie4VCUn/ASdUPvmWeDTYpuziZnekMjI9B6WFx/gXHBOz8+6gJTpq6Eyq1 +VKrYCMtMtN6mYXh0yYqtYIXTQgA4/DJsmt2BVDrKOWweHkua3hBDNNBQWxB3kR2l +tXawlYtIUxOERKL0lwatDqMoXIj7euEs0EfEPaZrYuulGN3s4yrwcOAuOo2Am5uA +ltCFxlDwVulXPPMbkhv2RIQ4wGjSuMdW6mq94DJCG2SaE5HYgI7yh1MC+iyRu3Ib +Y4xyQPApEj5HcoL0N/HX2FtcZAcf13CFqsfc6jZ+CWLT6xW57LO/mupn84jPHgao +3s2d5l6c5uc3b5vZCmcpI9uy2B4Ts2W4Q39xKqlm6BEARqXDhYKQH09mLTsgLb9K +xdFNKjrb99nSu1yqkJXrw9B95oaPPIGiIPmklfcxLcJ1sHiej7qmmuw511MJOhlJ +Czngh9HPepjxIO0j3LaH0yT18gj7qy+Y9cqd2YD1LPnaYu3kIjgp8oP2draMwlxa +/Z+tLGY+16MsFghl3uQe +=FJmQ +-END PGP SIGNATURE- Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 == --- dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 (added) +++ dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 Sun Jul 15 03:04:30 2018 @@ -0,0 +1,3 @@ +SparkR_2.3.2.tar.gz: 5D5225F0 8C8E27C8 579DABC7 5CDF37C8 024F2DE2 069583E9 + 843781A9 30B501AF C72924AD C82DA242 2017D86A 26D0CE9C + 4F1BFDDB B35D7FB2 42F2A6C1 055EA0E8 Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz == Binary file - no diff available. 
Propchange: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz -- svn:mime-type = application/octet-stream Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc == --- dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc (added) +++ dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc Sun Jul 15 03:04:30 2018 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIcBAABCgAGBQJbSrUeAAoJENsLIaASlz/QqWAP/i1Z/e5HINjqYcVYOjMHFVv2 +h50ezR7aL4sKFvqh5POaNDoXv3GP+4cRsdlOziLBEV2JxnE+Dnu8H4L8Y+SdH52x +sC+8XbNzBGOJSh0XYQ4Ez8LnMlhCro4n5RgaJJIsRRzbuFvSKakNGL7lW1kxUHQX +DiqXJ+wA5oRLYeITGE8YLjLgYgPwE8oC92WHmi/RWg5ES6dGmzF09X+7ccAmfHxt +zOE6ARtLRJ2aeMw9s0t2DLfSznP8dsNXDz0xPHggWdJNmhLrkQfeBN5AZCPq8hwS +manPzxX9Gb8UFjkRnljds+rGuVW29zVAmWL7rfi8Uv3QSv9oP5ZHyWbiYMDEI2/v +R/EzwT/Gjk1NWk+W5RGVevNMJ/xMy0XfVdlzrkc8Svi8m93ojhxyJMNhVQdFN2PJ +rJoqHHSH8ev4/W1GnW4oUwr06dAewTsmLOa/tSvdVjEk0BwNrsM6GwPOTZB3tPB6 +unfDdgOEvlAq2CRN9GnKftAruprk77fmE4frE10sg5Jms1ohLN503NTJx4gTBW7Y +qCkdYrusPgmW8F6aA865jUylPg/BhIRXE5H4kPnnzgwNYzPjGYdMnuksb3Ls9Mr9 +BxvbkLToe9goSIjGbj+iy/5VGem7tKsKwiDlUN5StAqH+M3ZlEW2hpcILY4cM/VH +KthInMLfa7Ofa9bEZMb7 +=jTWe +-END PGP SIGNATURE- Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 == --- dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 (added) +++ dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 Sun Jul 15 03:04:30 2018 @@ -0,0 +1,3 @@
[1/2] spark git commit: Preparing Spark release v2.3.2-rc3
Repository: spark Updated Branches: refs/heads/branch-2.3 9cf375f5b -> f9a2b0a87 Preparing Spark release v2.3.2-rc3 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3726dad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3726dad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3726dad Branch: refs/heads/branch-2.3 Commit: b3726dadcf2997f20231873ec6e057dba433ae64 Parents: 9cf375f Author: Saisai Shao Authored: Sun Jul 15 01:56:00 2018 + Committer: Saisai Shao Committed: Sun Jul 15 01:56:00 2018 + -- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml| 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml| 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6ec4966..8df2635 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.3 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index f8b15cc..57485fc 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.3.3-SNAPSHOT +2.3.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/kvstore/pom.xml -- diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e412a47..53e58c2 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.3.3-SNAPSHOT +2.3.2 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d8f9a3d..d05647c 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.3.3-SNAPSHOT +2.3.2 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a1a4f87..8d46761 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml
[2/2] spark git commit: Preparing development version 2.3.3-SNAPSHOT
Preparing development version 2.3.3-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9a2b0a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9a2b0a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9a2b0a8 Branch: refs/heads/branch-2.3 Commit: f9a2b0a878f05131d76959236243e7f5caffeb96 Parents: b3726da Author: Saisai Shao Authored: Sun Jul 15 01:56:15 2018 + Committer: Saisai Shao Committed: Sun Jul 15 01:56:15 2018 + -- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml| 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml| 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df2635..6ec4966 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc..f8b15cc 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.3.2 +2.3.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/kvstore/pom.xml -- diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2..e412a47 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.3.2 +2.3.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c..d8f9a3d 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.3.2 +2.3.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761..a1a4f87 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.3.2-rc3 [created] b3726dadc
svn commit: r28117 - in /dev/spark/2.4.0-SNAPSHOT-2018_07_14_16_01-8aceb96-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Jul 14 23:15:42 2018 New Revision: 28117 Log: Apache Spark 2.4.0-SNAPSHOT-2018_07_14_16_01-8aceb96 docs [This commit notification would consist of 1467 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-24754][ML] Minhash integer overflow
Repository: spark Updated Branches: refs/heads/master e1de34113 -> 8aceb961c

[SPARK-24754][ML] Minhash integer overflow

## What changes were proposed in this pull request?

Use longs in calculating min hash to avoid bias due to int overflow.

## How was this patch tested?

Existing tests.

Author: Sean Owen

Closes #21750 from srowen/SPARK-24754.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8aceb961 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8aceb961 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8aceb961 Branch: refs/heads/master Commit: 8aceb961c3b8e462c6002dbe03be61b4fe194f47 Parents: e1de341 Author: Sean Owen Authored: Sat Jul 14 15:59:17 2018 -0500 Committer: Sean Owen Committed: Sat Jul 14 15:59:17 2018 -0500

-- .../main/scala/org/apache/spark/ml/feature/MinHashLSH.scala| 2 +- python/pyspark/ml/feature.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index a67a3b0..a043033 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -66,7 +66,7 @@ class MinHashLSHModel private[ml]( val elemsList = elems.toSparse.indices.toList val hashValues = randCoefficients.map { case (a, b) => elemsList.map { elem: Int => - ((1 + elem) * a + b) % MinHashLSH.HASH_PRIME + ((1L + elem) * a + b) % MinHashLSH.HASH_PRIME }.min.toDouble } // TODO: Output vectors of dimension numHashFunctions in SPARK-18450

http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 14800d4..ddba738 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1294,14 +1294,14 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, >>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345) >>> model = mh.fit(df) >>> model.transform(df).head() -Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([-1638925... +Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668... >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),), ... (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),), ... (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)] >>> df2 = spark.createDataFrame(data2, ["id", "features"]) >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0]) >>> model.approxNearestNeighbors(df2, key, 1).collect() -[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([-163892... +[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668... >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select( ... col("datasetA.id").alias("idA"), ... col("datasetB.id").alias("idB"), @@ -1309,8 +1309,8 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, +---+---+---+ |idA|idB|JaccardDistance| +---+---+---+ -| 1| 4|0.5| | 0| 5|0.5| +| 1| 4|0.5| +---+---+---+ ... >>> mhPath = temp_path + "/mh"
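The one-character fix (`1` to `1L`) is easy to miss, so the standalone sketch below (coefficient values are illustrative; the modulus matches `MinHashLSH.HASH_PRIME`) shows the bias it removes: with `Int` arithmetic the product `(1 + elem) * a` can wrap around before the modulo, yielding a negative "minimum", while promoting to `Long` keeps the computation exact.

```scala
object MinHashOverflowDemo {
  val HashPrime = 2038074743  // Spark's MinHashLSH.HASH_PRIME

  def main(args: Array[String]): Unit = {
    // Illustrative coefficient chosen so that the Int product overflows;
    // MinHashLSH draws its random coefficients below HASH_PRIME, so values
    // of this magnitude can occur.
    val a = 2000000000
    val b = 12345
    val elem = 1

    val intHash  = ((1 + elem) * a + b) % HashPrime   // 2 * 2e9 wraps past Int.MaxValue
    val longHash = ((1L + elem) * a + b) % HashPrime  // exact 64-bit arithmetic

    println(s"Int arithmetic:  $intHash")   // -294954951: negative, biased hash
    println(s"Long arithmetic: $longHash")  // 1961937602: the intended value
  }
}
```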
svn commit: r28112 - in /dev/spark/2.4.0-SNAPSHOT-2018_07_14_04_02-e1de341-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Jul 14 11:17:47 2018 New Revision: 28112 Log: Apache Spark 2.4.0-SNAPSHOT-2018_07_14_04_02-e1de341 docs [This commit notification would consist of 1467 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-17091][SQL] Add rule to convert IN predicate to equivalent Parquet filter
Repository: spark Updated Branches: refs/heads/master f1a99ad58 -> e1de34113

[SPARK-17091][SQL] Add rule to convert IN predicate to equivalent Parquet filter

## What changes were proposed in this pull request?

The original PR is: https://github.com/apache/spark/pull/18424

Add a new optimizer rule to convert an IN predicate to an equivalent Parquet filter and add `spark.sql.parquet.pushdown.inFilterThreshold` to control the limit thresholds. Different data types have different limit thresholds; the data is copied here for reference:

Type | limit threshold
-- | --
string | 370
int | 210
long | 285
double | 270
float | 220
decimal | Won't provide better performance before [SPARK-24549](https://issues.apache.org/jira/browse/SPARK-24549)

## How was this patch tested?

Unit tests and manual tests.

Author: Yuming Wang

Closes #21603 from wangyum/SPARK-17091.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e1de3411 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e1de3411 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e1de3411 Branch: refs/heads/master Commit: e1de34113e057707dfc5ff54a8109b3ec7c16dfb Parents: f1a99ad Author: Yuming Wang Authored: Sat Jul 14 17:50:54 2018 +0800 Committer: hyukjinkwon Committed: Sat Jul 14 17:50:54 2018 +0800

-- .../org/apache/spark/sql/internal/SQLConf.scala | 15 +++ .../FilterPushdownBenchmark-results.txt | 96 ++-- .../datasources/parquet/ParquetFileFormat.scala | 15 ++- .../datasources/parquet/ParquetFilters.scala| 20 +++- .../parquet/ParquetFilterSuite.scala| 66 +- 5 files changed, 153 insertions(+), 59 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/e1de3411/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 14dd528..699e939 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -386,6 +386,18 @@ object SQLConf { .booleanConf .createWithDefault(true) + val PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD = +buildConf("spark.sql.parquet.pushdown.inFilterThreshold") + .doc("The maximum number of values to filter push-down optimization for IN predicate. " + +"Large threshold won't necessarily provide much better performance. " + +"The experiment argued that 300 is the limit threshold. " + +"By setting this value to 0 this feature can be disabled. 
" + +"This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is enabled.") + .internal() + .intConf + .checkValue(threshold => threshold >= 0, "The threshold must not be negative.") + .createWithDefault(10) + val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat") .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " + "versions, when converting Parquet schema to Spark SQL schema and vice versa.") @@ -1485,6 +1497,9 @@ class SQLConf extends Serializable with Logging { def parquetFilterPushDownStringStartWith: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED) + def parquetFilterPushDownInFilterThreshold: Int = +getConf(PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD) + def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED) def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH) http://git-wip-us.apache.org/repos/asf/spark/blob/e1de3411/sql/core/benchmarks/FilterPushdownBenchmark-results.txt -- diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index 110669b..c44908b 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -417,120 +417,120 @@ Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz InSet -> InFilters (values count: 5, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -Parquet Vectorized7477 / 7587 2.1 475.4 1.0X -Parquet Vectorized (Pushdown) 7862 / 8346 2.0 499.9 1.0X -Native ORC Vectorized 6447 / 7021 2.4 409.9 1.2X -Native ORC Vectorized