svn commit: r28120 - in /dev/spark/2.3.3-SNAPSHOT-2018_07_14_22_01-f9a2b0a-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-07-14 Thread pwendell
Author: pwendell
Date: Sun Jul 15 05:15:27 2018
New Revision: 28120

Log:
Apache Spark 2.3.3-SNAPSHOT-2018_07_14_22_01-f9a2b0a docs


[This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-24807][CORE] Adding files/jars twice: output a warning and add a note

2018-07-14 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 3e7dc8296 -> 69993217f


[SPARK-24807][CORE] Adding files/jars twice: output a warning and add a note

## What changes were proposed in this pull request?

In this PR, I propose to output a warning if the `addFile()` or `addJar()` 
methods are called more than once for the same path. Currently, overwriting of 
already added files is not supported. The new comments and the warning reflect 
the existing behaviour.
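
For reference, a minimal self-contained sketch of the behaviour described above (the path `/tmp/lookup.txt` is purely illustrative and assumed to exist):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object AddFileTwiceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("add-file-twice").setMaster("local[2]"))

    // First call registers the file for distribution to executors.
    sc.addFile("/tmp/lookup.txt")

    // A second call with the same path is ignored; with this change a warning
    // is logged, since overwriting an already added file is not supported.
    sc.addFile("/tmp/lookup.txt")

    sc.stop()
  }
}
```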

Author: Maxim Gekk 

Closes #21771 from MaxGekk/warning-on-adding-file.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69993217
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69993217
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69993217

Branch: refs/heads/master
Commit: 69993217fc4f5e5e41a297702389e86fe534dc2f
Parents: 3e7dc82
Author: Maxim Gekk 
Authored: Sat Jul 14 22:07:49 2018 -0700
Committer: Xiao Li 
Committed: Sat Jul 14 22:07:49 2018 -0700

--
 R/pkg/R/context.R   |  2 ++
 core/src/main/scala/org/apache/spark/SparkContext.scala | 12 
 .../org/apache/spark/api/java/JavaSparkContext.scala|  6 ++
 python/pyspark/context.py   |  4 
 4 files changed, 24 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 8ec727d..3e996a5 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -305,6 +305,8 @@ setCheckpointDirSC <- function(sc, dirName) {
 #' Currently directories are only supported for Hadoop-supported filesystems.
#' Refer Hadoop-supported filesystems at \url{https://wiki.apache.org/hadoop/HCFS}.
 #'
+#' Note: A path can be added only once. Subsequent additions of the same path are ignored.
+#'
 #' @rdname spark.addFile
 #' @param path The path of the file to be added
#' @param recursive Whether to add files recursively from the path. Default is FALSE.

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 74bfb5d..531384a 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1496,6 +1496,8 @@ class SparkContext(config: SparkConf) extends Logging {
* @param path can be either a local file, a file in HDFS (or other Hadoop-supported
* filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
* use `SparkFiles.get(fileName)` to find its download location.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
*/
   def addFile(path: String): Unit = {
 addFile(path, false)
@@ -1516,6 +1518,8 @@ class SparkContext(config: SparkConf) extends Logging {
* use `SparkFiles.get(fileName)` to find its download location.
* @param recursive if true, a directory can be given in `path`. Currently directories are
* only supported for Hadoop-supported filesystems.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
*/
   def addFile(path: String, recursive: Boolean): Unit = {
 val uri = new Path(path).toUri
@@ -1555,6 +1559,9 @@ class SparkContext(config: SparkConf) extends Logging {
   Utils.fetchFile(uri.toString, new File(SparkFiles.getRootDirectory()), conf,
 env.securityManager, hadoopConfiguration, timestamp, useCache = false)
   postEnvironmentUpdate()
+} else {
+  logWarning(s"The path $path has been added already. Overwriting of added paths " +
+   "is not supported in the current version.")
 }
   }
 
@@ -1803,6 +1810,8 @@ class SparkContext(config: SparkConf) extends Logging {
*
* @param path can be either a local file, a file in HDFS (or other Hadoop-supported filesystems),
* an HTTP, HTTPS or FTP URI, or local:/path for a file on every worker node.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
*/
   def addJar(path: String) {
 def addJarFile(file: File): String = {
@@ -1849,6 +1858,9 @@ class SparkContext(config: SparkConf) extends Logging {
 if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
   logInfo(s"Added JAR $path at $key with timestamp $timestamp")
   postEnvironmentUpdate()
+} else {
+  logWarning(s"The jar $path has been added already. Overwriting of added jars " +
+"is not supported in 

spark git commit: [SPARK-24776][SQL] Avro unit test: deduplicate code and replace deprecated methods

2018-07-14 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 43e4e851b -> 3e7dc8296


[SPARK-24776][SQL] Avro unit test: deduplicate code and replace deprecated 
methods

## What changes were proposed in this pull request?

Improve the Avro unit tests:
1. use QueryTest/SharedSQLContext/SQLTestUtils instead of the duplicated test utils.
2. replace deprecated methods (a small sketch follows below).
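
As a generic, self-contained sketch of the deprecated-API replacement in item 2 (the data here is made up; the actual suite works on Avro files, as shown in the diff below):

```scala
import org.apache.spark.sql.SparkSession

object DeprecatedApiSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Deprecated since Spark 2.0, still used via the old duplicated test utils:
    // df.registerTempTable("t")

    // Non-deprecated replacement used after this patch:
    df.createOrReplaceTempView("t")
    spark.sql("SELECT count(*) FROM t").show()

    spark.stop()
  }
}
```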

This is a follow-up to PR #21760, which passed the pull request tests but failed in:
https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Compile/job/spark-master-compile-maven-hadoop-2.6/7842/

This PR fixes that.
## How was this patch tested?
Unit tests.
Compilation with different commands:

```
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-2.6 
-Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn  compile 
test-compile
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-2.7 
-Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn  compile 
test-compile
./build/mvn --force -DzincPort=3643 -DskipTests -Phadoop-3.1 
-Phive-thriftserver -Pkinesis-asl -Pspark-ganglia-lgpl -Pmesos -Pyarn  compile 
test-compile

```

Author: Gengliang Wang 

Closes #21768 from gengliangwang/improve_avro_test.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e7dc829
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e7dc829
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e7dc829

Branch: refs/heads/master
Commit: 3e7dc82960fd3339eee16d83df66761ae6e3fe3d
Parents: 43e4e85
Author: Gengliang Wang 
Authored: Sat Jul 14 21:36:56 2018 -0700
Committer: Xiao Li 
Committed: Sat Jul 14 21:36:56 2018 -0700

--
 .../org/apache/spark/sql/avro/AvroSuite.scala   |  98 ++--
 .../org/apache/spark/sql/avro/TestUtils.scala   | 156 ---
 2 files changed, 45 insertions(+), 209 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3e7dc829/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
--
diff --git 
a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala 
b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
index c6c1e40..4f94d82 100644
--- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
+++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
@@ -31,32 +31,24 @@ import org.apache.avro.generic.{GenericData, 
GenericDatumWriter, GenericRecord}
 import org.apache.avro.generic.GenericData.{EnumSymbol, Fixed}
 import org.apache.commons.io.FileUtils
 
-import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
 import org.apache.spark.sql.avro.SchemaConverters.IncompatibleSchemaException
+import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
 import org.apache.spark.sql.types._
 
-class AvroSuite extends SparkFunSuite {
+class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   val episodesFile = "src/test/resources/episodes.avro"
   val testFile = "src/test/resources/test.avro"
 
-  private var spark: SparkSession = _
-
   override protected def beforeAll(): Unit = {
 super.beforeAll()
-spark = SparkSession.builder()
-  .master("local[2]")
-  .appName("AvroSuite")
-  .config("spark.sql.files.maxPartitionBytes", 1024)
-  .getOrCreate()
-  }
-
-  override protected def afterAll(): Unit = {
-try {
-  spark.sparkContext.stop()
-} finally {
-  super.afterAll()
-}
+spark.conf.set("spark.sql.files.maxPartitionBytes", 1024)
+  }
+
+  def checkReloadMatchesSaved(originalFile: String, newFile: String): Unit = {
+val originalEntries = spark.read.avro(testFile).collect()
+val newEntries = spark.read.avro(newFile)
+checkAnswer(newEntries, originalEntries)
   }
 
   test("reading from multiple paths") {
@@ -68,7 +60,7 @@ class AvroSuite extends SparkFunSuite {
 val df = spark.read.avro(episodesFile)
 val fields = List("title", "air_date", "doctor")
 for (field <- fields) {
-  TestUtils.withTempDir { dir =>
+  withTempPath { dir =>
 val outputDir = s"$dir/${UUID.randomUUID}"
 df.write.partitionBy(field).avro(outputDir)
 val input = spark.read.avro(outputDir)
@@ -82,12 +74,12 @@ class AvroSuite extends SparkFunSuite {
 
   test("request no fields") {
 val df = spark.read.avro(episodesFile)
-df.registerTempTable("avro_table")
+df.createOrReplaceTempView("avro_table")
 assert(spark.sql("select count(*) from avro_table").collect().head === Row(8))
   }
 
   test("convert formats") {
-TestUtils.withTempDir { dir =>
+withTempPath { dir =>
   val df = spark.read.avro(episodesFile)
   df.write.parquet(dir.getCanonicalPath)
   

spark git commit: [SPARK-24718][SQL] Timestamp support pushdown to parquet data source

2018-07-14 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/master 8aceb961c -> 43e4e851b


[SPARK-24718][SQL] Timestamp support pushdown to parquet data source

## What changes were proposed in this pull request?

Support push down of `Timestamp` filters to the Parquet data source.
Only `TIMESTAMP_MICROS` and `TIMESTAMP_MILLIS` timestamps support push down.
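
A rough sketch of how the new push down could be exercised from a `spark-shell` style session (the output path is illustrative; `spark.sql.parquet.outputTimestampType` is assumed to be available, as in Spark 2.3+):

```scala
// Store timestamps as TIMESTAMP_MICROS so the filter is eligible for
// push down (INT96-encoded timestamps are not).
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
spark.conf.set("spark.sql.parquet.filterPushdown.timestamp", "true")

spark.range(0, 1000000)
  .selectExpr("CAST(id AS timestamp) AS ts")
  .write.mode("overwrite").parquet("/tmp/ts_parquet")

// The timestamp predicate can now be evaluated by the Parquet reader itself;
// explain() should list it under PushedFilters.
val df = spark.read.parquet("/tmp/ts_parquet")
  .where("ts > CAST('1970-01-05 00:00:00' AS timestamp)")
df.explain()
df.count()
```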

## How was this patch tested?

unit tests and benchmark tests

Author: Yuming Wang 

Closes #21741 from wangyum/SPARK-24718.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43e4e851
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43e4e851
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43e4e851

Branch: refs/heads/master
Commit: 43e4e851b642bbee535d22e1b9e72ec6b99f6ed4
Parents: 8aceb96
Author: Yuming Wang 
Authored: Sun Jul 15 11:13:49 2018 +0800
Committer: hyukjinkwon 
Committed: Sun Jul 15 11:13:49 2018 +0800

--
 .../org/apache/spark/sql/internal/SQLConf.scala |  11 ++
 .../FilterPushdownBenchmark-results.txt | 124 +++
 .../datasources/parquet/ParquetFileFormat.scala |   3 +-
 .../datasources/parquet/ParquetFilters.scala|  59 -
 .../benchmark/FilterPushdownBenchmark.scala |  37 +-
 .../parquet/ParquetFilterSuite.scala|  74 ++-
 6 files changed, 301 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/43e4e851/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 699e939..07d33fa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -378,6 +378,15 @@ object SQLConf {
 .booleanConf
 .createWithDefault(true)
 
+  val PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED =
+buildConf("spark.sql.parquet.filterPushdown.timestamp")
+  .doc("If true, enables Parquet filter push-down optimization for Timestamp. " +
+"This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is " +
+"enabled and Timestamp stored as TIMESTAMP_MICROS or TIMESTAMP_MILLIS type.")
+.internal()
+.booleanConf
+.createWithDefault(true)
+
   val PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED =
 buildConf("spark.sql.parquet.filterPushdown.string.startsWith")
 .doc("If true, enables Parquet filter push-down optimization for string 
startsWith function. " +
@@ -1494,6 +1503,8 @@ class SQLConf extends Serializable with Logging {
 
   def parquetFilterPushDownDate: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DATE_ENABLED)
 
+  def parquetFilterPushDownTimestamp: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED)
+
   def parquetFilterPushDownStringStartWith: Boolean =
 getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/43e4e851/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
--
diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt 
b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
index c44908b..4f38cc4 100644
--- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
+++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
@@ -578,3 +578,127 @@ Native ORC Vectorized   11622 / 12196 
 1.4 7
 Native ORC Vectorized (Pushdown)11377 / 11654  1.4 
723.3   1.0X
 
 
+
+Pushdown benchmark for Timestamp
+
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6
+Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
+
+Select 1 timestamp stored as INT96 row (value = CAST(7864320 AS timestamp)): Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+
+Parquet Vectorized                            4784 / 4956          3.3       304.2       1.0X
+Parquet Vectorized (Pushdown)                 4838 / 4917          3.3       307.6       1.0X
+Native ORC Vectorized                         3923 / 4173          4.0       249.4       1.2X
+Native ORC Vectorized (Pushdown)               894 /  943         17.6        56.8       5.4X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6
+Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
+
+Select 10% 

svn commit: r28118 - /dev/spark/v2.3.2-rc3-bin/

2018-07-14 Thread jshao
Author: jshao
Date: Sun Jul 15 03:04:30 2018
New Revision: 28118

Log:
Apache Spark v2.3.2-rc3

Added:
dev/spark/v2.3.2-rc3-bin/
dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz   (with props)
dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc
dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512
dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz   (with props)
dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc
dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz   (with props)
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz.asc
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.6.tgz.sha512
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz   (with props)
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz.asc
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-hadoop2.7.tgz.sha512
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz   (with props)
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz.asc
dev/spark/v2.3.2-rc3-bin/spark-2.3.2-bin-without-hadoop.tgz.sha512
dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz   (with props)
dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz.asc
dev/spark/v2.3.2-rc3-bin/spark-2.3.2.tgz.sha512

Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz
==
Binary file - no diff available.

Propchange: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc
==
--- dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc (added)
+++ dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.asc Sun Jul 15 03:04:30 2018
@@ -0,0 +1,16 @@
+-BEGIN PGP SIGNATURE-
+
+iQIcBAABCgAGBQJbSrCkAAoJENsLIaASlz/QxHIP/2+V6MiBci4mepIYNEA5M4A8
+n5hRYXbPDkK6i/tPlCtvdeW8XkcSJejzk+lPJdjQCxfqOsWCiGal42siV7XY/x96
++08b2XXzLOZ65RHSMJdE3M+qXgEs+kthCSg2Q/mZcssu83BqwNah0JTUIKi3oSmz
+11EnY1Pie4VCUn/ASdUPvmWeDTYpuziZnekMjI9B6WFx/gXHBOz8+6gJTpq6Eyq1
+VKrYCMtMtN6mYXh0yYqtYIXTQgA4/DJsmt2BVDrKOWweHkua3hBDNNBQWxB3kR2l
+tXawlYtIUxOERKL0lwatDqMoXIj7euEs0EfEPaZrYuulGN3s4yrwcOAuOo2Am5uA
+ltCFxlDwVulXPPMbkhv2RIQ4wGjSuMdW6mq94DJCG2SaE5HYgI7yh1MC+iyRu3Ib
+Y4xyQPApEj5HcoL0N/HX2FtcZAcf13CFqsfc6jZ+CWLT6xW57LO/mupn84jPHgao
+3s2d5l6c5uc3b5vZCmcpI9uy2B4Ts2W4Q39xKqlm6BEARqXDhYKQH09mLTsgLb9K
+xdFNKjrb99nSu1yqkJXrw9B95oaPPIGiIPmklfcxLcJ1sHiej7qmmuw511MJOhlJ
+Czngh9HPepjxIO0j3LaH0yT18gj7qy+Y9cqd2YD1LPnaYu3kIjgp8oP2draMwlxa
+/Z+tLGY+16MsFghl3uQe
+=FJmQ
+-END PGP SIGNATURE-

Added: dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512
==
--- dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 (added)
+++ dev/spark/v2.3.2-rc3-bin/SparkR_2.3.2.tar.gz.sha512 Sun Jul 15 03:04:30 2018
@@ -0,0 +1,3 @@
+SparkR_2.3.2.tar.gz: 5D5225F0 8C8E27C8 579DABC7 5CDF37C8 024F2DE2 069583E9
+ 843781A9 30B501AF C72924AD C82DA242 2017D86A 26D0CE9C
+ 4F1BFDDB B35D7FB2 42F2A6C1 055EA0E8

Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz
==
Binary file - no diff available.

Propchange: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc
==
--- dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc (added)
+++ dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.asc Sun Jul 15 03:04:30 2018
@@ -0,0 +1,16 @@
+-BEGIN PGP SIGNATURE-
+
+iQIcBAABCgAGBQJbSrUeAAoJENsLIaASlz/QqWAP/i1Z/e5HINjqYcVYOjMHFVv2
+h50ezR7aL4sKFvqh5POaNDoXv3GP+4cRsdlOziLBEV2JxnE+Dnu8H4L8Y+SdH52x
+sC+8XbNzBGOJSh0XYQ4Ez8LnMlhCro4n5RgaJJIsRRzbuFvSKakNGL7lW1kxUHQX
+DiqXJ+wA5oRLYeITGE8YLjLgYgPwE8oC92WHmi/RWg5ES6dGmzF09X+7ccAmfHxt
+zOE6ARtLRJ2aeMw9s0t2DLfSznP8dsNXDz0xPHggWdJNmhLrkQfeBN5AZCPq8hwS
+manPzxX9Gb8UFjkRnljds+rGuVW29zVAmWL7rfi8Uv3QSv9oP5ZHyWbiYMDEI2/v
+R/EzwT/Gjk1NWk+W5RGVevNMJ/xMy0XfVdlzrkc8Svi8m93ojhxyJMNhVQdFN2PJ
+rJoqHHSH8ev4/W1GnW4oUwr06dAewTsmLOa/tSvdVjEk0BwNrsM6GwPOTZB3tPB6
+unfDdgOEvlAq2CRN9GnKftAruprk77fmE4frE10sg5Jms1ohLN503NTJx4gTBW7Y
+qCkdYrusPgmW8F6aA865jUylPg/BhIRXE5H4kPnnzgwNYzPjGYdMnuksb3Ls9Mr9
+BxvbkLToe9goSIjGbj+iy/5VGem7tKsKwiDlUN5StAqH+M3ZlEW2hpcILY4cM/VH
+KthInMLfa7Ofa9bEZMb7
+=jTWe
+-END PGP SIGNATURE-

Added: dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512
==
--- dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 (added)
+++ dev/spark/v2.3.2-rc3-bin/pyspark-2.3.2.tar.gz.sha512 Sun Jul 15 03:04:30 
2018
@@ -0,0 +1,3 @@

[1/2] spark git commit: Preparing Spark release v2.3.2-rc3

2018-07-14 Thread jshao
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 9cf375f5b -> f9a2b0a87


Preparing Spark release v2.3.2-rc3


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3726dad
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3726dad
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3726dad

Branch: refs/heads/branch-2.3
Commit: b3726dadcf2997f20231873ec6e057dba433ae64
Parents: 9cf375f
Author: Saisai Shao 
Authored: Sun Jul 15 01:56:00 2018 +
Committer: Saisai Shao 
Committed: Sun Jul 15 01:56:00 2018 +

--
 R/pkg/DESCRIPTION | 2 +-
 assembly/pom.xml  | 2 +-
 common/kvstore/pom.xml| 2 +-
 common/network-common/pom.xml | 2 +-
 common/network-shuffle/pom.xml| 2 +-
 common/network-yarn/pom.xml   | 2 +-
 common/sketch/pom.xml | 2 +-
 common/tags/pom.xml   | 2 +-
 common/unsafe/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 docs/_config.yml  | 4 ++--
 examples/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml | 2 +-
 external/flume-assembly/pom.xml   | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-0-10-assembly/pom.xml  | 2 +-
 external/kafka-0-10-sql/pom.xml   | 2 +-
 external/kafka-0-10/pom.xml   | 2 +-
 external/kafka-0-8-assembly/pom.xml   | 2 +-
 external/kafka-0-8/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml | 2 +-
 external/kinesis-asl/pom.xml  | 2 +-
 external/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml| 2 +-
 hadoop-cloud/pom.xml  | 2 +-
 launcher/pom.xml  | 2 +-
 mllib-local/pom.xml   | 2 +-
 mllib/pom.xml | 2 +-
 pom.xml   | 2 +-
 python/pyspark/version.py | 2 +-
 repl/pom.xml  | 2 +-
 resource-managers/kubernetes/core/pom.xml | 2 +-
 resource-managers/mesos/pom.xml   | 2 +-
 resource-managers/yarn/pom.xml| 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 41 files changed, 42 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 6ec4966..8df2635 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.3.3
+Version: 2.3.2
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),

http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index f8b15cc..57485fc 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.3-SNAPSHOT
+2.3.2
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/kvstore/pom.xml
--
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index e412a47..53e58c2 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.3-SNAPSHOT
+2.3.2
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index d8f9a3d..d05647c 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.3-SNAPSHOT
+2.3.2
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b3726dad/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index a1a4f87..8d46761 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml

[2/2] spark git commit: Preparing development version 2.3.3-SNAPSHOT

2018-07-14 Thread jshao
Preparing development version 2.3.3-SNAPSHOT


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9a2b0a8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9a2b0a8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9a2b0a8

Branch: refs/heads/branch-2.3
Commit: f9a2b0a878f05131d76959236243e7f5caffeb96
Parents: b3726da
Author: Saisai Shao 
Authored: Sun Jul 15 01:56:15 2018 +
Committer: Saisai Shao 
Committed: Sun Jul 15 01:56:15 2018 +

--
 R/pkg/DESCRIPTION | 2 +-
 assembly/pom.xml  | 2 +-
 common/kvstore/pom.xml| 2 +-
 common/network-common/pom.xml | 2 +-
 common/network-shuffle/pom.xml| 2 +-
 common/network-yarn/pom.xml   | 2 +-
 common/sketch/pom.xml | 2 +-
 common/tags/pom.xml   | 2 +-
 common/unsafe/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 docs/_config.yml  | 4 ++--
 examples/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml | 2 +-
 external/flume-assembly/pom.xml   | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-0-10-assembly/pom.xml  | 2 +-
 external/kafka-0-10-sql/pom.xml   | 2 +-
 external/kafka-0-10/pom.xml   | 2 +-
 external/kafka-0-8-assembly/pom.xml   | 2 +-
 external/kafka-0-8/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml | 2 +-
 external/kinesis-asl/pom.xml  | 2 +-
 external/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml| 2 +-
 hadoop-cloud/pom.xml  | 2 +-
 launcher/pom.xml  | 2 +-
 mllib-local/pom.xml   | 2 +-
 mllib/pom.xml | 2 +-
 pom.xml   | 2 +-
 python/pyspark/version.py | 2 +-
 repl/pom.xml  | 2 +-
 resource-managers/kubernetes/core/pom.xml | 2 +-
 resource-managers/mesos/pom.xml   | 2 +-
 resource-managers/yarn/pom.xml| 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 41 files changed, 42 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 8df2635..6ec4966 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.3.2
+Version: 2.3.3
 Title: R Frontend for Apache Spark
 Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),

http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 57485fc..f8b15cc 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.2
+2.3.3-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/kvstore/pom.xml
--
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 53e58c2..e412a47 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.2
+2.3.3-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index d05647c..d8f9a3d 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.3.2
+2.3.3-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f9a2b0a8/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 8d46761..a1a4f87 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-

[spark] Git Push Summary

2018-07-14 Thread jshao
Repository: spark
Updated Tags:  refs/tags/v2.3.2-rc3 [created] b3726dadc




svn commit: r28117 - in /dev/spark/2.4.0-SNAPSHOT-2018_07_14_16_01-8aceb96-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-07-14 Thread pwendell
Author: pwendell
Date: Sat Jul 14 23:15:42 2018
New Revision: 28117

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_07_14_16_01-8aceb96 docs


[This commit notification would consist of 1467 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-24754][ML] Minhash integer overflow

2018-07-14 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master e1de34113 -> 8aceb961c


[SPARK-24754][ML] Minhash integer overflow

## What changes were proposed in this pull request?

Use longs in calculating min hash to avoid bias due to int overflow.
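
For context, a small self-contained sketch of the overflow being avoided. The hash formula matches the one-line change in the diff below; the prime is taken from `MinHashLSH.HASH_PRIME` in the source, while the coefficient and element values are made up:

```scala
object MinHashOverflowSketch {
  def main(args: Array[String]): Unit = {
    val prime = 2038074743L   // same value as MinHashLSH.HASH_PRIME
    val a     = 2000000000    // hypothetical random coefficient
    val elem  = 5             // hypothetical element index of a sparse vector

    // Int arithmetic: (1 + elem) * a overflows and can go negative,
    // which biases the resulting min hash.
    val intHash = ((1 + elem) * a) % prime.toInt

    // Long arithmetic (the fix): the product stays in range.
    val longHash = ((1L + elem) * a) % prime

    println(s"Int math:  $intHash")    // negative due to overflow
    println(s"Long math: $longHash")   // non-negative hash value
  }
}
```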

## How was this patch tested?

Existing tests.

Author: Sean Owen 

Closes #21750 from srowen/SPARK-24754.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8aceb961
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8aceb961
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8aceb961

Branch: refs/heads/master
Commit: 8aceb961c3b8e462c6002dbe03be61b4fe194f47
Parents: e1de341
Author: Sean Owen 
Authored: Sat Jul 14 15:59:17 2018 -0500
Committer: Sean Owen 
Committed: Sat Jul 14 15:59:17 2018 -0500

--
 .../main/scala/org/apache/spark/ml/feature/MinHashLSH.scala| 2 +-
 python/pyspark/ml/feature.py   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
index a67a3b0..a043033 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -66,7 +66,7 @@ class MinHashLSHModel private[ml](
   val elemsList = elems.toSparse.indices.toList
   val hashValues = randCoefficients.map { case (a, b) =>
 elemsList.map { elem: Int =>
-  ((1 + elem) * a + b) % MinHashLSH.HASH_PRIME
+  ((1L + elem) * a + b) % MinHashLSH.HASH_PRIME
 }.min.toDouble
   }
   // TODO: Output vectors of dimension numHashFunctions in SPARK-18450

http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 14800d4..ddba738 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1294,14 +1294,14 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, 
HasOutputCol, HasSeed,
 >>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345)
 >>> model = mh.fit(df)
 >>> model.transform(df).head()
-Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([-1638925...
+Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...
 >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
 ...  (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
 ...  (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
 >>> df2 = spark.createDataFrame(data2, ["id", "features"])
 >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0])
 >>> model.approxNearestNeighbors(df2, key, 1).collect()
-[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([-163892...
+[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...
 >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select(
 ... col("datasetA.id").alias("idA"),
 ... col("datasetB.id").alias("idB"),
@@ -1309,8 +1309,8 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, 
HasOutputCol, HasSeed,
 +---+---+---+
 |idA|idB|JaccardDistance|
 +---+---+---+
-|  1|  4|0.5|
 |  0|  5|0.5|
+|  1|  4|0.5|
 +---+---+---+
 ...
 >>> mhPath = temp_path + "/mh"





svn commit: r28112 - in /dev/spark/2.4.0-SNAPSHOT-2018_07_14_04_02-e1de341-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-07-14 Thread pwendell
Author: pwendell
Date: Sat Jul 14 11:17:47 2018
New Revision: 28112

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_07_14_04_02-e1de341 docs


[This commit notification would consist of 1467 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-17091][SQL] Add rule to convert IN predicate to equivalent Parquet filter

2018-07-14 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/master f1a99ad58 -> e1de34113


[SPARK-17091][SQL] Add rule to convert IN predicate to equivalent Parquet filter

## What changes were proposed in this pull request?

The original pr is: https://github.com/apache/spark/pull/18424

Add a new optimizer rule to convert an IN predicate to an equivalent Parquet
filter, and add `spark.sql.parquet.pushdown.inFilterThreshold` to control the limit
threshold. Different data types have different limit thresholds; the measured data
is copied here for reference (a usage sketch follows the table):

Type | limit threshold
-- | --
string | 370
int | 210
long | 285
double | 270
float | 220
decimal | Won't provide better performance before [SPARK-24549](https://issues.apache.org/jira/browse/SPARK-24549)
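
A rough sketch of exercising the new rule from a `spark-shell` style session (the path and values are illustrative). Conceptually, an IN list no larger than the threshold is rewritten into an equivalent disjunction of equality filters that Parquet can evaluate:

```scala
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
// Lists longer than this threshold are not converted (default 10 in this patch).
spark.conf.set("spark.sql.parquet.pushdown.inFilterThreshold", "10")

spark.range(0, 1000000).write.mode("overwrite").parquet("/tmp/in_pushdown")

val df = spark.read.parquet("/tmp/in_pushdown").where("id IN (1, 5, 9)")
df.explain()   // PushedFilters should show the converted predicate
df.count()
```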

## How was this patch tested?
unit tests and manual tests

Author: Yuming Wang 

Closes #21603 from wangyum/SPARK-17091.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e1de3411
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e1de3411
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e1de3411

Branch: refs/heads/master
Commit: e1de34113e057707dfc5ff54a8109b3ec7c16dfb
Parents: f1a99ad
Author: Yuming Wang 
Authored: Sat Jul 14 17:50:54 2018 +0800
Committer: hyukjinkwon 
Committed: Sat Jul 14 17:50:54 2018 +0800

--
 .../org/apache/spark/sql/internal/SQLConf.scala | 15 +++
 .../FilterPushdownBenchmark-results.txt | 96 ++--
 .../datasources/parquet/ParquetFileFormat.scala | 15 ++-
 .../datasources/parquet/ParquetFilters.scala| 20 +++-
 .../parquet/ParquetFilterSuite.scala| 66 +-
 5 files changed, 153 insertions(+), 59 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e1de3411/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 14dd528..699e939 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -386,6 +386,18 @@ object SQLConf {
 .booleanConf
 .createWithDefault(true)
 
+  val PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD =
+buildConf("spark.sql.parquet.pushdown.inFilterThreshold")
+  .doc("The maximum number of values to filter push-down optimization for IN predicate. " +
+"Large threshold won't necessarily provide much better performance. " +
+"The experiment argued that 300 is the limit threshold. " +
+"By setting this value to 0 this feature can be disabled. " +
+"This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is enabled.")
+  .internal()
+  .intConf
+  .checkValue(threshold => threshold >= 0, "The threshold must not be negative.")
+  .createWithDefault(10)
+
   val PARQUET_WRITE_LEGACY_FORMAT = 
buildConf("spark.sql.parquet.writeLegacyFormat")
 .doc("Whether to be compatible with the legacy Parquet format adopted by 
Spark 1.4 and prior " +
   "versions, when converting Parquet schema to Spark SQL schema and vice 
versa.")
@@ -1485,6 +1497,9 @@ class SQLConf extends Serializable with Logging {
   def parquetFilterPushDownStringStartWith: Boolean =
 getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED)
 
+  def parquetFilterPushDownInFilterThreshold: Int =
+getConf(PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD)
+
   def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED)
 
   def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)

http://git-wip-us.apache.org/repos/asf/spark/blob/e1de3411/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
--
diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt 
b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
index 110669b..c44908b 100644
--- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
+++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt
@@ -417,120 +417,120 @@ Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
 
 InSet -> InFilters (values count: 5, distribution: 10): Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 

-Parquet Vectorized                            7477 / 7587          2.1       475.4       1.0X
-Parquet Vectorized (Pushdown)                 7862 / 8346          2.0       499.9       1.0X
-Native ORC Vectorized                         6447 / 7021          2.4       409.9       1.2X
-Native ORC Vectorized