[spark] branch master updated (cb0cddf -> 4664a08)

2019-09-05 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from cb0cddf  [SPARK-21870][SQL] Split aggregation code into small functions
 add 4664a08  [SPARK-28968][ML] Add HasNumFeatures in the scala side

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/ml/feature/FeatureHasher.scala| 19 ++--
 .../org/apache/spark/ml/feature/HashingTF.scala| 20 -
 .../ml/param/shared/SharedParamsCodeGen.scala  |  7 +++---
 .../spark/ml/param/shared/sharedParams.scala   | 25 +++---
 project/MimaExcludes.scala |  6 ++
 python/pyspark/ml/param/_shared_params_code_gen.py |  3 ++-
 python/pyspark/ml/param/shared.py  |  5 +++--
 7 files changed, 43 insertions(+), 42 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (36f8e53 -> cb0cddf)

2019-09-05 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 36f8e53  [SPARK-28802][DOC][SQL] Document DESCRIBE DATABASE statement 
in SQL Reference
 add cb0cddf  [SPARK-21870][SQL] Split aggregation code into small functions

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/sql/catalyst/dsl/package.scala|   8 +-
 .../expressions/codegen/CodeGenerator.scala|  51 
 .../catalyst/expressions/codegen/javaCode.scala|   5 +-
 .../sql/catalyst/expressions/nullExpressions.scala |  12 +-
 .../org/apache/spark/sql/internal/SQLConf.scala|  11 +
 .../execution/aggregate/HashAggregateExec.scala| 309 -
 .../sql/execution/WholeStageCodegenSuite.scala |  21 ++
 7 files changed, 348 insertions(+), 69 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-26046][SS] Add StreamingQueryManager.listListeners()

2019-09-05 Thread jtorres
This is an automated email from the ASF dual-hosted git repository.

jtorres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 3929d16  [SPARK-26046][SS] Add StreamingQueryManager.listListeners()
3929d16 is described below

commit 3929d166043deb104dc3f3180ab43be54c50937d
Author: Mukul Murthy 
AuthorDate: Thu Sep 5 14:27:54 2019 -0700

[SPARK-26046][SS] Add StreamingQueryManager.listListeners()

### What changes were proposed in this pull request?

Add a listListeners() method to StreamingQueryManager that lists all 
StreamingQueryListeners that have been added to that manager.

### Why are the changes needed?

While it is best practice to keep handles on all listeners that are added, it is 
still useful to have an API that lists the listeners currently registered with a 
StreamingQueryManager.

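A short usage sketch of the new API (a local session and a no-op listener, purely for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

val spark = SparkSession.builder().master("local[2]").appName("demo").getOrCreate()

// A no-op listener, just so there is something to register.
val listener = new StreamingQueryListener {
  override def onQueryStarted(event: QueryStartedEvent): Unit = {}
  override def onQueryProgress(event: QueryProgressEvent): Unit = {}
  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}
}

spark.streams.addListener(listener)

// The new API: enumerate every listener currently registered...
assert(spark.streams.listListeners().contains(listener))
// ...which also makes cleanup possible without keeping separate handles.
spark.streams.listListeners().foreach(spark.streams.removeListener)
```
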
### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Modified existing unit tests to use the new API instead of using reflection.

Closes #25518 from mukulmurthy/26046-listener.

Authored-by: Mukul Murthy 
Signed-off-by: Jose Torres 
---
 .../spark/sql/streaming/StreamingQueryManager.scala   | 10 ++
 .../spark/sql/streaming/StreamingQueryListenerSuite.scala | 15 ---
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala
index abee5f6..9765956 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala
@@ -21,6 +21,7 @@ import java.util.UUID
 import java.util.concurrent.TimeUnit
 import javax.annotation.concurrent.GuardedBy
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable
 
 import org.apache.hadoop.fs.Path
@@ -199,6 +200,15 @@ class StreamingQueryManager private[sql] (sparkSession: 
SparkSession) extends Lo
 listenerBus.removeListener(listener)
   }
 
+  /**
+   * List all [[StreamingQueryListener]]s attached to this 
[[StreamingQueryManager]].
+   *
+   * @since 3.0.0
+   */
+  def listListeners(): Array[StreamingQueryListener] = {
+listenerBus.listeners.asScala.toArray
+  }
+
   /** Post a listener event */
   private[sql] def postListenerEvent(event: StreamingQueryListener.Event): 
Unit = {
 listenerBus.post(event)
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala
index 43b..d964048 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala
@@ -47,7 +47,7 @@ class StreamingQueryListenerSuite extends StreamTest with 
BeforeAndAfter {
   after {
 spark.streams.active.foreach(_.stop())
 assert(spark.streams.active.isEmpty)
-assert(addedListeners().isEmpty)
+assert(spark.streams.listListeners().isEmpty)
 // Make sure we don't leak any events to the next test
 spark.sparkContext.listenerBus.waitUntilEmpty(1)
   }
@@ -223,7 +223,7 @@ class StreamingQueryListenerSuite extends StreamTest with 
BeforeAndAfter {
   assert(isListenerActive(listener1) === false)
   assert(isListenerActive(listener2))
 } finally {
-  addedListeners().foreach(spark.streams.removeListener)
+  spark.streams.listListeners().foreach(spark.streams.removeListener)
 }
   }
 
@@ -362,10 +362,10 @@ class StreamingQueryListenerSuite extends StreamTest with 
BeforeAndAfter {
 assert(session1.streams.ne(session2.streams))
 
 withListenerAdded(collector1, session1) {
-  assert(addedListeners(session1).nonEmpty)
+  assert(session1.streams.listListeners().nonEmpty)
 
   withListenerAdded(collector2, session2) {
-assert(addedListeners(session2).nonEmpty)
+assert(session2.streams.listListeners().nonEmpty)
 
 // query on session1 should send events only to collector1
 runQuery(session1)
@@ -440,13 +440,6 @@ class StreamingQueryListenerSuite extends StreamTest with 
BeforeAndAfter {
 }
   }
 
-  private def addedListeners(session: SparkSession = spark): 
Array[StreamingQueryListener] = {
-val listenerBusMethod =
-  PrivateMethod[StreamingQueryListenerBus]('listenerBus)
-val listenerBus = session.streams invokePrivate listenerBusMethod()
-listenerBus.listeners.toArray.map(_.asInstanceOf[StreamingQueryListener])
-  }
-
   /** Collects events from the StreamingQueryListener for testing */
   class EventCollector extends StreamingQueryListener {
 // to catch 

[spark] branch master updated: [SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that sometimes fail

2019-09-05 Thread irashid
This is an automated email from the ASF dual-hosted git repository.

irashid pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 151b954  [SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that 
sometimes fail
151b954 is described below

commit 151b954e52c66f78a72530ab69f38100101f6cb7
Author: Wing Yew Poon 
AuthorDate: Thu Sep 5 15:55:22 2019 -0500

[SPARK-28770][CORE][TEST] Fix ReplayListenerSuite tests that sometimes fail

### What changes were proposed in this pull request?

`ReplayListenerSuite` depends on a listener class to listen for replayed 
events. This class was implemented by extending `EventLoggingListener`. 
`EventLoggingListener` does not log executor metrics update events, but uses 
them to update internal state; on a stage completion event, it then logs stage 
executor metrics events using this internal state. As executor metrics update 
events do not get written to the event log, they do not get replayed. The 
internal state of the replay listener [...]

We reimplement the replay listener to simply buffer each and every event it 
receives. This makes it a simpler yet better tool for verifying the events that 
get sent through the ReplayListenerBus.
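
A minimal sketch of such a buffering listener (illustrative only; the suite's actual class may differ in detail):

```scala
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkFirehoseListener
import org.apache.spark.scheduler.SparkListenerEvent

// Buffers every event posted to it, in order, so a test can compare the
// replayed stream against the originally logged events.
class EventBufferingListener extends SparkFirehoseListener {
  val loggedEvents = new ArrayBuffer[SparkListenerEvent]

  override def onEvent(event: SparkListenerEvent): Unit = {
    loggedEvents += event
  }
}
```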

### Why are the changes needed?

As explained above. Tests sometimes fail due to events being received by 
the `EventLoggingListener` that do not get logged (and thus do not get 
replayed) but influence other events that get logged.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Existing unit tests.

Closes #25673 from wypoon/SPARK-28770.

Authored-by: Wing Yew Poon 
Signed-off-by: Imran Rashid 
---
 .../spark/scheduler/ReplayListenerSuite.scala  | 36 ++
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
index e796137..d65b5cb 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
@@ -21,12 +21,14 @@ import java.io._
 import java.net.URI
 import java.util.concurrent.atomic.AtomicInteger
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.hadoop.fs.Path
 import org.json4s.JsonAST.JValue
 import org.json4s.jackson.JsonMethods._
 import org.scalatest.BeforeAndAfter
 
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, 
SparkFunSuite}
+import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.io.{CompressionCodec, LZ4CompressionCodec}
 import org.apache.spark.util.{JsonProtocol, JsonProtocolSuite, Utils}
@@ -62,7 +64,7 @@ class ReplayListenerSuite extends SparkFunSuite with 
BeforeAndAfter with LocalSp
 
 val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath)
 val logData = fileSystem.open(logFilePath)
-val eventMonster = new EventMonster(conf)
+val eventMonster = new EventBufferingListener
 try {
   val replayer = new ReplayListenerBus()
   replayer.addListener(eventMonster)
@@ -108,7 +110,7 @@ class ReplayListenerSuite extends SparkFunSuite with 
BeforeAndAfter with LocalSp
 val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath)
 val replayer = new ReplayListenerBus()
 
-val eventMonster = new EventMonster(conf)
+val eventMonster = new EventBufferingListener
 replayer.addListener(eventMonster)
 
 // Verify the replay returns the events given the input maybe truncated.
@@ -145,7 +147,7 @@ class ReplayListenerSuite extends SparkFunSuite with 
BeforeAndAfter with LocalSp
 
 val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath)
 val logData = fileSystem.open(logFilePath)
-val eventMonster = new EventMonster(conf)
+val eventMonster = new EventBufferingListener
 try {
   val replayer = new ReplayListenerBus()
   replayer.addListener(eventMonster)
@@ -207,7 +209,7 @@ class ReplayListenerSuite extends SparkFunSuite with 
BeforeAndAfter with LocalSp
 
 // Replay events
 val logData = EventLoggingListener.openEventLog(eventLog.getPath(), 
fileSystem)
-val eventMonster = new EventMonster(conf)
+val eventMonster = new EventBufferingListener
 try {
   val replayer = new ReplayListenerBus()
   replayer.addListener(eventMonster)
@@ -219,11 +221,11 @@ class ReplayListenerSuite extends SparkFunSuite with 
BeforeAndAfter with LocalSp
 // Verify the same events are replayed in the same order
 assert(sc.eventLogger.isDefined)
 val originalEvents = sc.eventLogger.get.loggedEvents
+  .map(JsonProtocol.sparkEventFromJson(_))
 val replayedEvents = eventMonster.loggedEvents
+  

svn commit: r35567 - /release/spark/spark-2.4.3/

2019-09-05 Thread dongjoon
Author: dongjoon
Date: Thu Sep  5 15:56:04 2019
New Revision: 35567

Log:
Remove previous 2.4.3 release from mirror network

Removed:
release/spark/spark-2.4.3/


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] maropu commented on issue #218: Add Gengliang Wang to committer list

2019-09-05 Thread GitBox
maropu commented on issue #218: Add Gengliang Wang to committer list
URL: https://github.com/apache/spark-website/pull/218#issuecomment-528382488
 
 
   oh, congrats!!


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (c81fd0c -> 0647906)

2019-09-05 Thread yumwang
This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from c81fd0c  [SPARK-28974][SQL] centralize the Data Source V2 table 
capability checks
 add 0647906  [SPARK-28910][SQL] Prevent schema verification when 
connecting to in memory derby

No new revisions were added by this update.

Summary of changes:
 .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-2.4 updated: [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that partition column can be numeric, date or timestamp type

2019-09-05 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
 new a1471f9  [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc 
that partition column can be numeric, date or timestamp type
a1471f9 is described below

commit a1471f95a4b2ff2ff5c70403458d796429ee857a
Author: Sean Owen 
AuthorDate: Thu Sep 5 18:32:45 2019 +0900

[SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc that 
partition column can be numeric, date or timestamp type

### What changes were proposed in this pull request?

`DataFrameReader.jdbc()` accepts a partition column of numeric, date, or 
timestamp type, according to the implementation in `JDBCRelation.scala`. Update 
the scaladoc accordingly, to match the documentation in 
`sql-data-sources-jdbc.md`.
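
For example, a JDBC read can partition on a timestamp column rather than an integral one (a hedged sketch; the URL, table, and column names are placeholders, and a suitable JDBC driver is assumed to be on the classpath):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("jdbc-demo").getOrCreate()

// Partition the read on a timestamp column instead of an integral one.
val orders = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://db-host:5432/sales")  // placeholder URL
  .option("dbtable", "orders")                            // placeholder table
  .option("partitionColumn", "order_ts")                  // timestamp column
  .option("lowerBound", "2019-01-01 00:00:00")
  .option("upperBound", "2019-09-01 00:00:00")
  .option("numPartitions", "8")
  .load()
```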

### Why are the changes needed?

The scaladoc is incorrect.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

N/A

Closes #25687 from srowen/SPARK-28977.

Authored-by: Sean Owen 
Signed-off-by: HyukjinKwon 
---
 R/pkg/R/SQLContext.R   | 3 ++-
 python/pyspark/sql/readwriter.py   | 3 ++-
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index c819a7d..c281aa0 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -655,7 +655,8 @@ loadDF <- function(x = NULL, ...) {
 #'
 #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}
 #' @param tableName the name of the table in the external database
-#' @param partitionColumn the name of a column of integral type that will be 
used for partitioning
+#' @param partitionColumn the name of a column of numeric, date, or timestamp 
type
+#'that will be used for partitioning.
 #' @param lowerBound the minimum value of \code{partitionColumn} used to 
decide partition stride
 #' @param upperBound the maximum value of \code{partitionColumn} used to 
decide partition stride
 #' @param numPartitions the number of partitions, This, along with 
\code{lowerBound} (inclusive),
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index ea7cc80..4396699 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -526,7 +526,8 @@ class DataFrameReader(OptionUtils):
 
 :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
 :param table: the name of the table
-:param column: the name of an integer column that will be used for 
partitioning;
+:param column: the name of a column of numeric, date, or timestamp type
+   that will be used for partitioning;
if this parameter is specified, then ``numPartitions``, 
``lowerBound``
(inclusive), and ``upperBound`` (exclusive) will form 
partition strides
for generated WHERE clause expressions used to split 
the column
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 85cd3f0..c71f871 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -248,7 +248,8 @@ class DataFrameReader private[sql](sparkSession: 
SparkSession) extends Logging {
*
* @param url JDBC database url of the form `jdbc:subprotocol:subname`.
* @param table Name of the table in the external database.
-   * @param columnName the name of a column of integral type that will be used 
for partitioning.
+   * @param columnName the name of a column of numeric, date, or timestamp type
+   *   that will be used for partitioning.
* @param lowerBound the minimum value of `columnName` used to decide 
partition stride.
* @param upperBound the maximum value of `columnName` used to decide 
partition stride.
* @param numPartitions the number of partitions. This, along with 
`lowerBound` (inclusive),


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (103d50b -> c81fd0c)

2019-09-05 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 103d50b  [SPARK-28272][SQL][PYTHON][TESTS] Convert and port 
'pgSQL/aggregates_part3.sql' into UDF test base
 add c81fd0c  [SPARK-28974][SQL] centralize the Data Source V2 table 
capability checks

No new revisions were added by this update.

Summary of changes:
 .../datasources/v2/DataSourceV2Strategy.scala  |   4 +
 ...pportCheck.scala => TableCapabilityCheck.scala} |  52 +++--
 .../datasources/v2/V2WriteSupportCheck.scala   |  61 --
 .../sql/internal/BaseSessionStateBuilder.scala |   5 +-
 .../v2/V2StreamingScanSupportCheckSuite.scala  | 129 -
 .../spark/sql/sources/v2/InsertIntoTests.scala |   2 +-
 ...Suite.scala => TableCapabilityCheckSuite.scala} | 124 +---
 .../spark/sql/hive/HiveSessionStateBuilder.scala   |   5 +-
 8 files changed, 157 insertions(+), 225 deletions(-)
 rename 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/{V2StreamingScanSupportCheck.scala
 => TableCapabilityCheck.scala} (51%)
 delete mode 100644 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2WriteSupportCheck.scala
 delete mode 100644 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2StreamingScanSupportCheckSuite.scala
 rename 
sql/core/src/test/scala/org/apache/spark/sql/sources/v2/{V2WriteSupportCheckSuite.scala
 => TableCapabilityCheckSuite.scala} (51%)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (be04c97 -> 103d50b)

2019-09-05 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from be04c97  [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 
'pgSQL/aggregates_part4.sql' into UDF test base
 add 103d50b  [SPARK-28272][SQL][PYTHON][TESTS] Convert and port 
'pgSQL/aggregates_part3.sql' into UDF test base

No new revisions were added by this update.

Summary of changes:
 .../aggregates_part3.sql => udf/pgSQL/udf-aggregates_part3.sql}   | 8 +---
 .../pgSQL/udf-aggregates_part3.sql.out}   | 8 
 2 files changed, 9 insertions(+), 7 deletions(-)
 copy sql/core/src/test/resources/sql-tests/inputs/{pgSQL/aggregates_part3.sql 
=> udf/pgSQL/udf-aggregates_part3.sql} (98%)
 copy 
sql/core/src/test/resources/sql-tests/results/{pgSQL/aggregates_part3.sql.out 
=> udf/pgSQL/udf-aggregates_part3.sql.out} (74%)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base

2019-09-05 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new be04c97  [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 
'pgSQL/aggregates_part4.sql' into UDF test base
be04c97 is described below

commit be04c972623df0c44d92ba55a9efadf59a27089e
Author: HyukjinKwon 
AuthorDate: Thu Sep 5 18:34:44 2019 +0900

[SPARK-28971][SQL][PYTHON][TESTS] Convert and port 
'pgSQL/aggregates_part4.sql' into UDF test base

### What changes were proposed in this pull request?

This PR proposes to port `pgSQL/aggregates_part4.sql` into the UDF test base.

Diff comparing to 'pgSQL/aggregates_part3.sql'


```diff
```




### Why are the changes needed?

To improve test coverage in UDFs.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Manually tested via:

```bash
 build/sbt "sql/test-only *SQLQueryTestSuite -- -z udf/pgSQL/udf-aggregates_part4.sql"
```

as guided in https://issues.apache.org/jira/browse/SPARK-27921

Closes #25677 from HyukjinKwon/SPARK-28971.

Authored-by: HyukjinKwon 
Signed-off-by: HyukjinKwon 
---
 .../inputs/udf/pgSQL/udf-aggregates_part4.sql  | 421 +
 .../results/udf/pgSQL/udf-aggregates_part4.sql.out |   5 +
 2 files changed, 426 insertions(+)

diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql
 
b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql
new file mode 100644
index 000..7c3
--- /dev/null
+++ 
b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql
@@ -0,0 +1,421 @@
+--
+-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+--
+--
+-- AGGREGATES [Part 4]
+-- 
https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L607-L997
+
+-- This test file was converted from pgSQL/aggregates_part4.sql.
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- ordered-set aggregates
+
+-- select p, percentile_cont(p) within group (order by x::float8)
+-- from generate_series(1,5) x,
+--  (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) 
v(p)
+-- group by p order by p;
+
+-- select p, percentile_cont(p order by p) within group (order by x)  -- error
+-- from generate_series(1,5) x,
+--  (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) 
v(p)
+-- group by p order by p;
+
+-- select p, sum() within group (order by x::float8)  -- error
+-- from generate_series(1,5) x,
+--  (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) 
v(p)
+-- group by p order by p;
+
+-- select p, percentile_cont(p,p)  -- error
+-- from generate_series(1,5) x,
+--  (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) 
v(p)
+-- group by p order by p;
+
+-- select percentile_cont(0.5) within group (order by b) from aggtest;
+-- select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
+-- select percentile_cont(0.5) within group (order by thousand) from tenk1;
+-- select percentile_disc(0.5) within group (order by thousand) from tenk1;
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- select rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+-- select cume_dist(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+-- select percent_rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4),(5)) v(x);
+-- select dense_rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- select percentile_disc(array[0,0.1,0.25,0.5,0.75,0.9,1]) within group 
(order by thousand)
+-- from tenk1;
+-- select percentile_cont(array[0,0.25,0.5,0.75,1]) within group (order by 
thousand)
+-- from tenk1;
+-- select percentile_disc(array[[null,1,0.5],[0.75,0.25,null]]) within group 
(order by thousand)
+-- from tenk1;
+-- select percentile_cont(array[0,1,0.25,0.75,0.5,1,0.3,0.32,0.35,0.38,0.4]) 
within group (order by x)
+-- from generate_series(1,6) x;
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- [SPARK-28382] Array Functions: unnest
+-- select ten, mode() within group (order by string4) from tenk1 group by ten;
+
+-- select percentile_disc(array[0.25,0.5,0.75]) within group (order by x)
+-- from 
unnest('{fred,jim,fred,jack,jill,fred,jill,jim,jim,sheila,jim,sheila}'::text[]) 
u(x);
+
+-- [SPARK-28669] System Information Functions
+-- check collation propagates up in suitable cases:
+-- select pg_collation_for(percentile_disc(1) within group (order by x collate 
"POSIX"))
+--   from (values ('fred'),('jim')) v(x);
+
+-- 

[spark] branch master updated (f8bc91f -> 36559b6)

2019-09-05 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from f8bc91f  [SPARK-28782][SQL] Generator support in aggregate expressions
 add 36559b6  [SPARK-28977][DOCS][SQL] Fix DataFrameReader.json docs to doc 
that partition column can be numeric, date or timestamp type

No new revisions were added by this update.

Summary of changes:
 R/pkg/R/SQLContext.R   | 3 ++-
 python/pyspark/sql/readwriter.py   | 3 ++-
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (dde3931 -> f8bc91f)

2019-09-05 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from dde3931  [SPARK-28878][SQL] Remove extra project for DSv2 reads with 
columnar batches
 add f8bc91f  [SPARK-28782][SQL] Generator support in aggregate expressions

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/analysis/Analyzer.scala | 52 ++
 .../apache/spark/sql/GeneratorFunctionSuite.scala  | 34 ++
 2 files changed, 86 insertions(+)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (b9edd44 -> dde3931)

2019-09-05 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from b9edd44  [SPARK-28964] Add the provider information to the table 
properties in saveAsTable
 add dde3931  [SPARK-28878][SQL] Remove extra project for DSv2 reads with 
columnar batches

No new revisions were added by this update.

Summary of changes:
 .../sql/execution/datasources/v2/DataSourceV2Strategy.scala | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (84a4d3a -> b9edd44)

2019-09-05 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.


from 84a4d3a  [SPARK-28976][CORE] Use KeyLock to simplify 
MapOutputTracker.getStatuses
 add b9edd44  [SPARK-28964] Add the provider information to the table 
properties in saveAsTable

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/spark/sql/DataFrameWriter.scala   | 10 --
 .../v2/DataSourceV2DataFrameSessionCatalogSuite.scala   | 13 -
 2 files changed, 20 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses

2019-09-05 Thread zsxwing
This is an automated email from the ASF dual-hosted git repository.

zsxwing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 84a4d3a  [SPARK-28976][CORE] Use KeyLock to simplify 
MapOutputTracker.getStatuses
84a4d3a is described below

commit 84a4d3a17ccbf7e0cb75dffbbdc20a26715f7323
Author: Shixiong Zhu 
AuthorDate: Wed Sep 4 23:20:27 2019 -0700

[SPARK-28976][CORE] Use KeyLock to simplify MapOutputTracker.getStatuses

### What changes were proposed in this pull request?

Use `KeyLock` added in #25612 to simplify `MapOutputTracker.getStatuses`. 
It also brings some improvements after the refactoring:
- `InterruptedException` is no longer swallowed.
- When a shuffle block is fetched, unrelated sleeping threads are no longer 
woken up.

### Why are the changes needed?

`MapOutputTracker.getStatuses` is currently hard to maintain because it uses a 
special hand-rolled locking mechanism that we need to pay attention to whenever 
the method is updated. Since `KeyLock` hides the locking complexity behind a 
dedicated lock class, refactoring to use it makes the method easier to 
understand and maintain.
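
The idea behind a per-key lock can be sketched as follows (a conceptual illustration only, not Spark's actual `KeyLock` implementation):

```scala
import java.util.concurrent.ConcurrentHashMap

// Conceptual per-key lock: at most one thread runs the body for a given key at
// a time, while threads working on different keys do not block each other.
class SimpleKeyLock[K] {
  private val locks = new ConcurrentHashMap[K, AnyRef]()

  def withLock[T](key: K)(body: => T): T = {
    // Ensure a lock object exists for this key (never removed in this sketch;
    // a production version would also evict entries for keys no longer in use).
    locks.putIfAbsent(key, new AnyRef)
    locks.get(key).synchronized { body }
  }
}

// Usage mirroring the refactored getStatuses: only one thread per shuffle id
// performs the expensive fetch; threads for other shuffle ids are unaffected.
val fetchingLock = new SimpleKeyLock[Int]
def fetchStatuses(shuffleId: Int): String = fetchingLock.withLock(shuffleId) {
  s"statuses for shuffle $shuffleId" // placeholder for the real fetch
}
```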

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Existing tests.

Closes #25680 from zsxwing/getStatuses.

Authored-by: Shixiong Zhu 
Signed-off-by: Shixiong Zhu 
---
 .../scala/org/apache/spark/MapOutputTracker.scala  | 50 +-
 1 file changed, 10 insertions(+), 40 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala 
b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
index 5c820f5..d878fc5 100644
--- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
+++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -678,8 +678,11 @@ private[spark] class MapOutputTrackerWorker(conf: 
SparkConf) extends MapOutputTr
   val mapStatuses: Map[Int, Array[MapStatus]] =
 new ConcurrentHashMap[Int, Array[MapStatus]]().asScala
 
-  /** Remembers which map output locations are currently being fetched on an 
executor. */
-  private val fetching = new HashSet[Int]
+  /**
+   * A [[KeyLock]] whose key is a shuffle id to ensure there is only one 
thread fetching
+   * the same shuffle block.
+   */
+  private val fetchingLock = new KeyLock[Int]
 
   // Get blocks sizes by executor Id. Note that zero-sized blocks are excluded 
in the result.
   override def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, 
endPartition: Int)
@@ -707,51 +710,18 @@ private[spark] class MapOutputTrackerWorker(conf: 
SparkConf) extends MapOutputTr
 if (statuses == null) {
   logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching 
them")
   val startTimeNs = System.nanoTime()
-  var fetchedStatuses: Array[MapStatus] = null
-  fetching.synchronized {
-// Someone else is fetching it; wait for them to be done
-while (fetching.contains(shuffleId)) {
-  try {
-fetching.wait()
-  } catch {
-case e: InterruptedException =>
-  }
-}
-
-// Either while we waited the fetch happened successfully, or
-// someone fetched it in between the get and the fetching.synchronized.
-fetchedStatuses = mapStatuses.get(shuffleId).orNull
+  fetchingLock.withLock(shuffleId) {
+var fetchedStatuses = mapStatuses.get(shuffleId).orNull
 if (fetchedStatuses == null) {
-  // We have to do the fetch, get others to wait for us.
-  fetching += shuffleId
-}
-  }
-
-  if (fetchedStatuses == null) {
-// We won the race to fetch the statuses; do so
-logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint)
-// This try-finally prevents hangs due to timeouts:
-try {
+  logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint)
   val fetchedBytes = 
askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId))
   fetchedStatuses = 
MapOutputTracker.deserializeMapStatuses(fetchedBytes)
   logInfo("Got the output locations")
   mapStatuses.put(shuffleId, fetchedStatuses)
-} finally {
-  fetching.synchronized {
-fetching -= shuffleId
-fetching.notifyAll()
-  }
 }
-  }
-  logDebug(s"Fetching map output statuses for shuffle $shuffleId took " +
-s"${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} 
ms")
-
-  if (fetchedStatuses != null) {
+logDebug(s"Fetching map output statuses for shuffle $shuffleId took " +
+  s"${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} 
ms")
 fetchedStatuses
-  } else {
-logError("Missing all output locations for