spark git commit: [SPARK-24638][SQL] StringStartsWith support push down
Repository: spark Updated Branches: refs/heads/master f71e8da5e -> 03545ce6d [SPARK-24638][SQL] StringStartsWith support push down ## What changes were proposed in this pull request? `StringStartsWith` support push down. About 50% savings in compute time. ## How was this patch tested? unit tests, manual tests and performance test: ```scala cat < SPARK-24638.scala def benchmark(func: () => Unit): Long = { val start = System.currentTimeMillis() for(i <- 0 until 100) { func() } val end = System.currentTimeMillis() end - start } val path = "/tmp/spark/parquet/string/" spark.range(1000).selectExpr("concat(id, 'str', id) as id").coalesce(1).write.mode("overwrite").option("parquet.block.size", 1048576).parquet(path) val df = spark.read.parquet(path) spark.sql("set spark.sql.parquet.filterPushdown.string.startsWith=true") val pushdownEnable = benchmark(() => df.where("id like '98%'").count()) spark.sql("set spark.sql.parquet.filterPushdown.string.startsWith=false") val pushdownDisable = benchmark(() => df.where("id like '98%'").count()) val improvements = pushdownDisable - pushdownEnable println(s"improvements: $improvements") EOF bin/spark-shell -i SPARK-24638.scala ``` result: ```scala Loading SPARK-24638.scala... benchmark: (func: () => Unit)Long path: String = /tmp/spark/parquet/string/ df: org.apache.spark.sql.DataFrame = [id: string] res1: org.apache.spark.sql.DataFrame = [key: string, value: string] pushdownEnable: Long = 11608 res2: org.apache.spark.sql.DataFrame = [key: string, value: string] pushdownDisable: Long = 31981 improvements: Long = 20373 ``` Author: Yuming Wang Closes #21623 from wangyum/SPARK-24638. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03545ce6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03545ce6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03545ce6 Branch: refs/heads/master Commit: 03545ce6de08bd0ad685c5f59b73bc22dfc40887 Parents: f71e8da Author: Yuming Wang Authored: Sat Jun 30 13:58:50 2018 +0800 Committer: Wenchen Fan Committed: Sat Jun 30 13:58:50 2018 +0800 -- .../org/apache/spark/sql/internal/SQLConf.scala | 11 +++ .../datasources/parquet/ParquetFileFormat.scala | 4 +- .../datasources/parquet/ParquetFilters.scala| 35 +++- .../parquet/ParquetFilterSuite.scala| 84 +++- 4 files changed, 130 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/03545ce6/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e1752ff..da1c34c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -378,6 +378,14 @@ object SQLConf { .booleanConf .createWithDefault(true) + val PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED = +buildConf("spark.sql.parquet.filterPushdown.string.startsWith") +.doc("If true, enables Parquet filter push-down optimization for string startsWith function. " + + "This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is enabled.") +.internal() +.booleanConf +.createWithDefault(true) + val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat") .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " + "versions, when converting Parquet schema to Spark SQL schema and vice versa.") @@ -1459,6 +1467,9 @@ class SQLConf extends Serializable with Logging { def parquetFilterPushDownDate: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DATE_ENABLED) + def parquetFilterPushDownStringStartWith: Boolean = +getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED) + def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED) def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH) http://git-wip-us.apache.org/repos/asf/spark/blob/03545ce6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 9602a08..93de1fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.1.3 [created] b7eac07b9 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r27835 - in /dev/spark: v2.1.3-rc1-bin/ v2.1.3-rc1-docs/ v2.1.3-rc2-docs/
Author: vanzin Date: Fri Jun 29 21:37:13 2018 New Revision: 27835 Log: Remove Spark 2.1.3 RC directories. Removed: dev/spark/v2.1.3-rc1-bin/ dev/spark/v2.1.3-rc1-docs/ dev/spark/v2.1.3-rc2-docs/ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r27834 - /dev/spark/v2.1.3-rc2-bin/ /release/spark/spark-2.1.3/
Author: vanzin Date: Fri Jun 29 21:31:17 2018 New Revision: 27834 Log: Move Spark 2.1.3-rc2 to release area. Added: release/spark/spark-2.1.3/ - copied from r27833, dev/spark/v2.1.3-rc2-bin/ Removed: dev/spark/v2.1.3-rc2-bin/ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r27833 - in /dev/spark/2.4.0-SNAPSHOT-2018_06_29_12_01-f71e8da-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Fri Jun 29 19:15:55 2018 New Revision: 27833 Log: Apache Spark 2.4.0-SNAPSHOT-2018_06_29_12_01-f71e8da docs [This commit notification would consist of 1467 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-24566][CORE] Fix spark.storage.blockManagerSlaveTimeoutMs default config
Repository: spark Updated Branches: refs/heads/master f6e6899a8 -> f71e8da5e [SPARK-24566][CORE] Fix spark.storage.blockManagerSlaveTimeoutMs default config This PR use spark.network.timeout in place of spark.storage.blockManagerSlaveTimeoutMs when it is not configured, as configuration doc said manual test Author: xueyu <278006...@qq.com> Closes #21575 from xueyumusic/slaveTimeOutConfig. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f71e8da5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f71e8da5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f71e8da5 Branch: refs/heads/master Commit: f71e8da5efde96aacc89e59c6e27b71fffcbc25f Parents: f6e6899 Author: xueyu <278006...@qq.com> Authored: Fri Jun 29 10:44:17 2018 -0700 Committer: Shixiong Zhu Committed: Fri Jun 29 10:44:49 2018 -0700 -- core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala| 5 ++--- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f71e8da5/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala -- diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index ff960b3..bcbc8df 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -74,10 +74,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) // "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses // "milliseconds" - private val slaveTimeoutMs = -sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", "120s") private val executorTimeoutMs = -sc.conf.getTimeAsSeconds("spark.network.timeout", s"${slaveTimeoutMs}ms") * 1000 +sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", + s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s")}s") // "spark.network.timeoutInterval" uses "seconds", while // "spark.storage.blockManagerTimeoutIntervalMs" uses "milliseconds" http://git-wip-us.apache.org/repos/asf/spark/blob/f71e8da5/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala -- diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index d35bea4..1ce2f81 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -634,7 +634,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( slave.hostname, externalShufflePort, sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", - s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s") * 1000L}ms"), + s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s")}s"), sc.conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")) slave.shuffleRegistered = true } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org