[spark] branch master updated (cb0cddf -> 4664a08)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from cb0cddf [SPARK-21870][SQL] Split aggregation code into small functions add 4664a08 [SPARK-28968][ML] Add HasNumFeatures in the scala side No new revisions were added by this update. Summary of changes: .../apache/spark/ml/feature/FeatureHasher.scala| 19 ++-- .../org/apache/spark/ml/feature/HashingTF.scala| 20 - .../ml/param/shared/SharedParamsCodeGen.scala | 7 +++--- .../spark/ml/param/shared/sharedParams.scala | 25 +++--- project/MimaExcludes.scala | 6 ++ python/pyspark/ml/param/_shared_params_code_gen.py | 3 ++- python/pyspark/ml/param/shared.py | 5 +++-- 7 files changed, 43 insertions(+), 42 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (36f8e53 -> cb0cddf)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 36f8e53 [SPARK-28802][DOC][SQL] Document DESCRIBE DATABASE statement in SQL Reference add cb0cddf [SPARK-21870][SQL] Split aggregation code into small functions No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/dsl/package.scala| 8 +- .../expressions/codegen/CodeGenerator.scala| 51 .../catalyst/expressions/codegen/javaCode.scala| 5 +- .../sql/catalyst/expressions/nullExpressions.scala | 12 +- .../org/apache/spark/sql/internal/SQLConf.scala| 11 + .../execution/aggregate/HashAggregateExec.scala| 309 - .../sql/execution/WholeStageCodegenSuite.scala | 21 ++ 7 files changed, 348 insertions(+), 69 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-26046][SS] Add StreamingQueryManager.listListeners()
This is an automated email from the ASF dual-hosted git repository. jtorres pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3929d16 [SPARK-26046][SS] Add StreamingQueryManager.listListeners() 3929d16 is described below commit 3929d166043deb104dc3f3180ab43be54c50937d Author: Mukul Murthy AuthorDate: Thu Sep 5 14:27:54 2019 -0700 [SPARK-26046][SS] Add StreamingQueryManager.listListeners() ### What changes were proposed in this pull request? Add a listListeners() method to StreamingQueryManager that lists all StreamingQueryListeners that have been added to that manager. ### Why are the changes needed? While it's best practice to keep handles on all listeners added, it's still nice to have an API to be able to list what listeners have been added to a StreamingQueryManager. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Modified existing unit tests to use the new API instead of using reflection. Closes #25518 from mukulmurthy/26046-listener. 
Authored-by: Mukul Murthy Signed-off-by: Jose Torres --- .../spark/sql/streaming/StreamingQueryManager.scala | 10 ++ .../spark/sql/streaming/StreamingQueryListenerSuite.scala | 15 --- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index abee5f6..9765956 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -21,6 +21,7 @@ import java.util.UUID import java.util.concurrent.TimeUnit import javax.annotation.concurrent.GuardedBy +import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.hadoop.fs.Path @@ -199,6 +200,15 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo listenerBus.removeListener(listener) } + /** + * List all [[StreamingQueryListener]]s attached to this [[StreamingQueryManager]]. 
+ * + * @since 3.0.0 + */ + def listListeners(): Array[StreamingQueryListener] = { +listenerBus.listeners.asScala.toArray + } + /** Post a listener event */ private[sql] def postListenerEvent(event: StreamingQueryListener.Event): Unit = { listenerBus.post(event) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 43b..d964048 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -47,7 +47,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { after { spark.streams.active.foreach(_.stop()) assert(spark.streams.active.isEmpty) -assert(addedListeners().isEmpty) +assert(spark.streams.listListeners().isEmpty) // Make sure we don't leak any events to the next test spark.sparkContext.listenerBus.waitUntilEmpty(1) } @@ -223,7 +223,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(isListenerActive(listener1) === false) assert(isListenerActive(listener2)) } finally { - addedListeners().foreach(spark.streams.removeListener) + spark.streams.listListeners().foreach(spark.streams.removeListener) } } @@ -362,10 +362,10 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(session1.streams.ne(session2.streams)) withListenerAdded(collector1, session1) { - assert(addedListeners(session1).nonEmpty) + assert(session1.streams.listListeners().nonEmpty) withListenerAdded(collector2, session2) { -assert(addedListeners(session2).nonEmpty) +assert(session2.streams.listListeners().nonEmpty) // query on session1 should send events only to collector1 runQuery(session1) @@ -440,13 +440,6 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } } - private def addedListeners(session: 
SparkSession = spark): Array[StreamingQueryListener] = { -val listenerBusMethod = - PrivateMethod[StreamingQueryListenerBus]('listenerBus) -val listenerBus = session.streams invokePrivate listenerBusMethod() -listenerBus.listeners.toArray.map(_.asInstanceOf[StreamingQueryListener]) - } - /** Collects events from the StreamingQueryListener for testing */ class EventCollector extends StreamingQueryListener { // to catch
[spark] branch master updated: [SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that sometimes fail
This is an automated email from the ASF dual-hosted git repository. irashid pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 151b954 [SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that sometimes fail 151b954 is described below commit 151b954e52c66f78a72530ab69f38100101f6cb7 Author: Wing Yew Poon AuthorDate: Thu Sep 5 15:55:22 2019 -0500 [SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that sometimes fail ### What changes were proposed in this pull request? `ReplayListenerSuite` depends on a listener class to listen for replayed events. This class was implemented by extending `EventLoggingListener`. `EventLoggingListener` does not log executor metrics update events, but uses them to update internal state; on a stage completion event, it then logs stage executor metrics events using this internal state. As executor metrics update events do not get written to the event log, they do not get replayed. The internal state of the replay listene [...] We reimplement the replay listener to simply buffer each and every event it receives. This makes it a simpler yet better tool for verifying the events that get sent through the ReplayListenerBus. ### Why are the changes needed? As explained above. Tests sometimes fail due to events being received by the `EventLoggingListener` that do not get logged (and thus do not get replayed) but influence other events that get logged. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing unit tests. Closes #25673 from wypoon/SPARK-28770. 
Authored-by: Wing Yew Poon Signed-off-by: Imran Rashid --- .../spark/scheduler/ReplayListenerSuite.scala | 36 ++ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index e796137..d65b5cb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -21,12 +21,14 @@ import java.io._ import java.net.URI import java.util.concurrent.atomic.AtomicInteger +import scala.collection.mutable.ArrayBuffer + import org.apache.hadoop.fs.Path import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods._ import org.scalatest.BeforeAndAfter -import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.{CompressionCodec, LZ4CompressionCodec} import org.apache.spark.util.{JsonProtocol, JsonProtocolSuite, Utils} @@ -62,7 +64,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) val logData = fileSystem.open(logFilePath) -val eventMonster = new EventMonster(conf) +val eventMonster = new EventBufferingListener try { val replayer = new ReplayListenerBus() replayer.addListener(eventMonster) @@ -108,7 +110,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) val replayer = new ReplayListenerBus() -val eventMonster = new EventMonster(conf) +val eventMonster = new EventBufferingListener replayer.addListener(eventMonster) // Verify the replay returns the events given the input maybe truncated. 
@@ -145,7 +147,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) val logData = fileSystem.open(logFilePath) -val eventMonster = new EventMonster(conf) +val eventMonster = new EventBufferingListener try { val replayer = new ReplayListenerBus() replayer.addListener(eventMonster) @@ -207,7 +209,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp // Replay events val logData = EventLoggingListener.openEventLog(eventLog.getPath(), fileSystem) -val eventMonster = new EventMonster(conf) +val eventMonster = new EventBufferingListener try { val replayer = new ReplayListenerBus() replayer.addListener(eventMonster) @@ -219,11 +221,11 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp // Verify the same events are replayed in the same order assert(sc.eventLogger.isDefined) val originalEvents = sc.eventLogger.get.loggedEvents + .map(JsonProtocol.sparkEventFromJson(_)) val replayedEvents = eventMonster.loggedEvents +
svn commit: r35567 - /release/spark/spark-2.4.3/
Author: dongjoon Date: Thu Sep 5 15:56:04 2019 New Revision: 35567 Log: Remove previous 2.4.3 release from mirror network Removed: release/spark/spark-2.4.3/ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] [spark-website] maropu commented on issue #218: Add Gengliang Wang to committer list
maropu commented on issue #218: Add Gengliang Wang to committer list URL: https://github.com/apache/spark-website/pull/218#issuecomment-528382488 oh, congrats!! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (c81fd0c -> 0647906)
This is an automated email from the ASF dual-hosted git repository. yumwang pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from c81fd0c [SPARK-28974][SQL] centralize the Data Source V2 table capability checks add 0647906 [SPARK-28910][SQL] Prevent schema verification when connecting to in memory derby No new revisions were added by this update. Summary of changes: .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that partition column can be numeric, date or timestamp type
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new a1471f9 [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that partition column can be numeric, date or timestamp type a1471f9 is described below commit a1471f95a4b2ff2ff5c70403458d796429ee857a Author: Sean Owen AuthorDate: Thu Sep 5 18:32:45 2019 +0900 [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that partition column can be numeric, date or timestamp type ### What changes were proposed in this pull request? `DataFrameReader.jdbc()` accepts a partition column that is of numeric, date or timestamp type, according to the implementation in `JDBCRelation.scala`. Update the scaladoc accordingly, to match the documentation in `sql-data-sources-jdbc.md` too. ### Why are the changes needed? scaladoc is incorrect. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? N/A Closes #25687 from srowen/SPARK-28977. Authored-by: Sean Owen Signed-off-by: HyukjinKwon --- R/pkg/R/SQLContext.R | 3 ++- python/pyspark/sql/readwriter.py | 3 ++- sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index c819a7d..c281aa0 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -655,7 +655,8 @@ loadDF <- function(x = NULL, ...) { #' #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname} #' @param tableName the name of the table in the external database -#' @param partitionColumn the name of a column of integral type that will be used for partitioning +#' @param partitionColumn the name of a column of numeric, date, or timestamp type +#'that will be used for partitioning. 
#' @param lowerBound the minimum value of \code{partitionColumn} used to decide partition stride #' @param upperBound the maximum value of \code{partitionColumn} used to decide partition stride #' @param numPartitions the number of partitions, This, along with \code{lowerBound} (inclusive), diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index ea7cc80..4396699 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -526,7 +526,8 @@ class DataFrameReader(OptionUtils): :param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` :param table: the name of the table -:param column: the name of an integer column that will be used for partitioning; +:param column: the name of a column of numeric, date, or timestamp type + that will be used for partitioning; if this parameter is specified, then ``numPartitions``, ``lowerBound`` (inclusive), and ``upperBound`` (exclusive) will form partition strides for generated WHERE clause expressions used to split the column diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 85cd3f0..c71f871 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -248,7 +248,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * * @param url JDBC database url of the form `jdbc:subprotocol:subname`. * @param table Name of the table in the external database. - * @param columnName the name of a column of integral type that will be used for partitioning. + * @param columnName the name of a column of numeric, date, or timestamp type + * that will be used for partitioning. * @param lowerBound the minimum value of `columnName` used to decide partition stride. * @param upperBound the maximum value of `columnName` used to decide partition stride. 
* @param numPartitions the number of partitions. This, along with `lowerBound` (inclusive), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (103d50b -> c81fd0c)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 103d50b [SPARK-28272][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part3.sql' into UDF test base add c81fd0c [SPARK-28974][SQL] centralize the Data Source V2 table capability checks No new revisions were added by this update. Summary of changes: .../datasources/v2/DataSourceV2Strategy.scala | 4 + ...pportCheck.scala => TableCapabilityCheck.scala} | 52 +++-- .../datasources/v2/V2WriteSupportCheck.scala | 61 -- .../sql/internal/BaseSessionStateBuilder.scala | 5 +- .../v2/V2StreamingScanSupportCheckSuite.scala | 129 - .../spark/sql/sources/v2/InsertIntoTests.scala | 2 +- ...Suite.scala => TableCapabilityCheckSuite.scala} | 124 +--- .../spark/sql/hive/HiveSessionStateBuilder.scala | 5 +- 8 files changed, 157 insertions(+), 225 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/{V2StreamingScanSupportCheck.scala => TableCapabilityCheck.scala} (51%) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2WriteSupportCheck.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2StreamingScanSupportCheckSuite.scala rename sql/core/src/test/scala/org/apache/spark/sql/sources/v2/{V2WriteSupportCheckSuite.scala => TableCapabilityCheckSuite.scala} (51%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (be04c97 -> 103d50b)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from be04c97 [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base add 103d50b [SPARK-28272][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part3.sql' into UDF test base No new revisions were added by this update. Summary of changes: .../aggregates_part3.sql => udf/pgSQL/udf-aggregates_part3.sql} | 8 +--- .../pgSQL/udf-aggregates_part3.sql.out} | 8 2 files changed, 9 insertions(+), 7 deletions(-) copy sql/core/src/test/resources/sql-tests/inputs/{pgSQL/aggregates_part3.sql => udf/pgSQL/udf-aggregates_part3.sql} (98%) copy sql/core/src/test/resources/sql-tests/results/{pgSQL/aggregates_part3.sql.out => udf/pgSQL/udf-aggregates_part3.sql.out} (74%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new be04c97 [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base be04c97 is described below commit be04c972623df0c44d92ba55a9efadf59a27089e Author: HyukjinKwon AuthorDate: Thu Sep 5 18:34:44 2019 +0900 [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base ### What changes were proposed in this pull request? This PR proposes to port `pgSQL/aggregates_part4.sql` into UDF test base. Diff comparing to 'pgSQL/aggregates_part3.sql' ```diff ``` ### Why are the changes needed? To improve test coverage in UDFs. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manually tested via: ```bash build/sbt "sql/test-only *SQLQueryTestSuite -- -z udf/pgSQL/udf-aggregates_part4.sql" ``` as guided in https://issues.apache.org/jira/browse/SPARK-27921 Closes #25677 from HyukjinKwon/SPARK-28971. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon --- .../inputs/udf/pgSQL/udf-aggregates_part4.sql | 421 + .../results/udf/pgSQL/udf-aggregates_part4.sql.out | 5 + 2 files changed, 426 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql new file mode 100644 index 000..7c3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql @@ -0,0 +1,421 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- AGGREGATES [Part 4] +-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L607-L997 + +-- This test file was converted from pgSQL/aggregates_part4.sql. 
+ +-- [SPARK-27980] Ordered-Set Aggregate Functions +-- ordered-set aggregates + +-- select p, percentile_cont(p) within group (order by x::float8) +-- from generate_series(1,5) x, +-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p) +-- group by p order by p; + +-- select p, percentile_cont(p order by p) within group (order by x) -- error +-- from generate_series(1,5) x, +-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p) +-- group by p order by p; + +-- select p, sum() within group (order by x::float8) -- error +-- from generate_series(1,5) x, +-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p) +-- group by p order by p; + +-- select p, percentile_cont(p,p) -- error +-- from generate_series(1,5) x, +-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p) +-- group by p order by p; + +-- select percentile_cont(0.5) within group (order by b) from aggtest; +-- select percentile_cont(0.5) within group (order by b), sum(b) from aggtest; +-- select percentile_cont(0.5) within group (order by thousand) from tenk1; +-- select percentile_disc(0.5) within group (order by thousand) from tenk1; +-- [SPARK-28661] Hypothetical-Set Aggregate Functions +-- select rank(3) within group (order by x) +-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x); +-- select cume_dist(3) within group (order by x) +-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x); +-- select percent_rank(3) within group (order by x) +-- from (values (1),(1),(2),(2),(3),(3),(4),(5)) v(x); +-- select dense_rank(3) within group (order by x) +-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x); + +-- [SPARK-27980] Ordered-Set Aggregate Functions +-- select percentile_disc(array[0,0.1,0.25,0.5,0.75,0.9,1]) within group (order by thousand) +-- from tenk1; +-- select percentile_cont(array[0,0.25,0.5,0.75,1]) within group (order by thousand) +-- from tenk1; +-- select percentile_disc(array[[null,1,0.5],[0.75,0.25,null]]) 
within group (order by thousand) +-- from tenk1; +-- select percentile_cont(array[0,1,0.25,0.75,0.5,1,0.3,0.32,0.35,0.38,0.4]) within group (order by x) +-- from generate_series(1,6) x; + +-- [SPARK-27980] Ordered-Set Aggregate Functions +-- [SPARK-28382] Array Functions: unnest +-- select ten, mode() within group (order by string4) from tenk1 group by ten; + +-- select percentile_disc(array[0.25,0.5,0.75]) within group (order by x) +-- from unnest('{fred,jim,fred,jack,jill,fred,jill,jim,jim,sheila,jim,sheila}'::text[]) u(x); + +-- [SPARK-28669] System Information Functions +-- check collation propagates up in suitable cases: +-- select pg_collation_for(percentile_disc(1) within group (order by x collate "POSIX")) +-- from (values ('fred'),('jim')) v(x); + +--
[spark] branch master updated (f8bc91f -> 36559b6)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from f8bc91f [SPARK-28782][SQL] Generator support in aggregate expressions add 36559b6 [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that partition column can be numeric, date or timestamp type No new revisions were added by this update. Summary of changes: R/pkg/R/SQLContext.R | 3 ++- python/pyspark/sql/readwriter.py | 3 ++- sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (dde3931 -> f8bc91f)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from dde3931 [SPARK-28878][SQL] Remove extra project for DSv2 reads with columnar batches add f8bc91f [SPARK-28782][SQL] Generator support in aggregate expressions No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/analysis/Analyzer.scala | 52 ++ .../apache/spark/sql/GeneratorFunctionSuite.scala | 34 ++ 2 files changed, 86 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (b9edd44 -> dde3931)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from b9edd44 [SPARK-28964] Add the provider information to the table properties in saveAsTable add dde3931 [SPARK-28878][SQL] Remove extra project for DSv2 reads with columnar batches No new revisions were added by this update. Summary of changes: .../sql/execution/datasources/v2/DataSourceV2Strategy.scala | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (84a4d3a -> b9edd44)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 84a4d3a [SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses add b9edd44 [SPARK-28964] Add the provider information to the table properties in saveAsTable No new revisions were added by this update. Summary of changes: .../main/scala/org/apache/spark/sql/DataFrameWriter.scala | 10 -- .../v2/DataSourceV2DataFrameSessionCatalogSuite.scala | 13 - 2 files changed, 20 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses
This is an automated email from the ASF dual-hosted git repository. zsxwing pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 84a4d3a [SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses 84a4d3a is described below commit 84a4d3a17ccbf7e0cb75dffbbdc20a26715f7323 Author: Shixiong Zhu AuthorDate: Wed Sep 4 23:20:27 2019 -0700 [SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses ### What changes were proposed in this pull request? Use `KeyLock` added in #25612 to simplify `MapOutputTracker.getStatuses`. It also has some improvements after the refactoring: - `InterruptedException` is no longer swallowed. - When a shuffle block is fetched, we don't need to wake up unrelated sleeping threads. ### Why are the changes needed? `MapOutputTracker.getStatuses` is pretty hard to maintain right now because it has a special lock mechanism which we need to pay attention to whenever updating this method. As we can use `KeyLock` to hide the complexity of locking behind a dedicated lock class, it's better to refactor it to make it easy to understand and maintain. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing tests. Closes #25680 from zsxwing/getStatuses. 
Authored-by: Shixiong Zhu Signed-off-by: Shixiong Zhu --- .../scala/org/apache/spark/MapOutputTracker.scala | 50 +- 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 5c820f5..d878fc5 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -678,8 +678,11 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr val mapStatuses: Map[Int, Array[MapStatus]] = new ConcurrentHashMap[Int, Array[MapStatus]]().asScala - /** Remembers which map output locations are currently being fetched on an executor. */ - private val fetching = new HashSet[Int] + /** + * A [[KeyLock]] whose key is a shuffle id to ensure there is only one thread fetching + * the same shuffle block. + */ + private val fetchingLock = new KeyLock[Int] // Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result. override def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) @@ -707,51 +710,18 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") val startTimeNs = System.nanoTime() - var fetchedStatuses: Array[MapStatus] = null - fetching.synchronized { -// Someone else is fetching it; wait for them to be done -while (fetching.contains(shuffleId)) { - try { -fetching.wait() - } catch { -case e: InterruptedException => - } -} - -// Either while we waited the fetch happened successfully, or -// someone fetched it in between the get and the fetching.synchronized. 
-fetchedStatuses = mapStatuses.get(shuffleId).orNull + fetchingLock.withLock(shuffleId) { +var fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { - // We have to do the fetch, get others to wait for us. - fetching += shuffleId -} - } - - if (fetchedStatuses == null) { -// We won the race to fetch the statuses; do so -logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) -// This try-finally prevents hangs due to timeouts: -try { + logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) val fetchedBytes = askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId)) fetchedStatuses = MapOutputTracker.deserializeMapStatuses(fetchedBytes) logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) -} finally { - fetching.synchronized { -fetching -= shuffleId -fetching.notifyAll() - } } - } - logDebug(s"Fetching map output statuses for shuffle $shuffleId took " + -s"${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} ms") - - if (fetchedStatuses != null) { +logDebug(s"Fetching map output statuses for shuffle $shuffleId took " + + s"${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} ms") fetchedStatuses - } else { -logError("Missing all output locations for