Repository: spark Updated Branches: refs/heads/branch-2.0 1696bcfad -> 3253ae7f7
[SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value(for branch 2.0) ## What changes were proposed in this pull request? When multiple records have the minimum value, the answer of `StatFunctions.multipleApproxQuantiles` is wrong. ## How was this patch tested? add a test case Author: wangzhenhua <wangzhen...@huawei.com> Closes #15732 from wzhfy/percentile2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3253ae7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3253ae7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3253ae7f Branch: refs/heads/branch-2.0 Commit: 3253ae7f722a996cf0af21608e1a27d5d2a12004 Parents: 1696bcf Author: wangzhenhua <wangzhen...@huawei.com> Authored: Wed Nov 2 11:49:30 2016 -0700 Committer: Reynold Xin <r...@databricks.com> Committed: Wed Nov 2 11:49:30 2016 -0700 ---------------------------------------------------------------------- .../spark/sql/execution/stat/StatFunctions.scala | 4 +++- .../org/apache/spark/sql/DataFrameStatSuite.scala | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 7e2ebe8..acc42a0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -337,7 +337,9 @@ object StatFunctions extends Logging { res.prepend(head) // If necessary, add the minimum element: val currHead = currentSamples.head - if (currHead.value < head.value) { + // don't add the minimum element if `currentSamples` has only one element (both `currHead` and + // `head` point to the same element) + if (currHead.value <= head.value && currentSamples.length > 1) { res.prepend(currentSamples.head) } res.toArray http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 73026c7..571e2ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -152,6 +152,19 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } } + test("approximate quantile, multiple records with the minimum value in a partition") { + val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5) + val df = spark.sparkContext.makeRDD(data, 4).toDF("col") + val epsilons = List(0.1, 0.05, 0.001) + val quantile = 0.5 + val expected = 1 + for (epsilon <- epsilons) { + val Array(answer) = df.stat.approxQuantile("col", Array(quantile), epsilon) + val error = 2 * data.length * epsilon + assert(math.abs(answer - expected) < error) + } + } + test("crosstab") { val rng = new Random() val data = Seq.tabulate(25)(i => (rng.nextInt(5), rng.nextInt(10))) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org