Repository: spark Updated Branches: refs/heads/branch-1.6 528e37352 -> f0cc511ec
[SPARK-13444][MLLIB] QuantileDiscretizer chooses bad splits on large DataFrames ## What changes were proposed in this pull request? Change line 113 of QuantileDiscretizer.scala to `val requiredSamples = math.max(numBins * numBins, 10000.0)` so that `requiredSamples` is a `Double`. This will fix the division in line 114 which currently results in zero if `requiredSamples < dataset.count` ## How was the this patch tested? Manual tests. I was having a problems using QuantileDiscretizer with my a dataset and after making this change QuantileDiscretizer behaves as expected. Author: Oliver Pierson <o...@gatech.edu> Author: Oliver Pierson <opier...@umd.edu> Closes #11319 from oliverpierson/SPARK-13444. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f0cc511e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f0cc511e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f0cc511e Branch: refs/heads/branch-1.6 Commit: f0cc511ecaff8b5b36bdcce2d014ebbd8b8a37e5 Parents: 528e373 Author: Oliver Pierson <o...@gatech.edu> Authored: Thu Feb 25 13:24:46 2016 +0000 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Mar 4 12:52:16 2016 -0800 ---------------------------------------------------------------------- .../spark/ml/feature/QuantileDiscretizer.scala | 11 +++++++++-- .../ml/feature/QuantileDiscretizerSuite.scala | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/f0cc511e/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index c9467b2..e9baf2d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -102,6 +102,13 @@ final class QuantileDiscretizer(override val uid: String) @Since("1.6.0") object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] with Logging { + + /** + * Minimum number of samples required for finding splits, regardless of number of bins. If + * the dataset has fewer rows than this value, the entire dataset will be used. + */ + private[spark] val minSamplesRequired: Int = 10000 + /** * Sampling from the given dataset to collect quantile statistics. */ @@ -109,8 +116,8 @@ object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] wi val totalSamples = dataset.count() require(totalSamples > 0, "QuantileDiscretizer requires non-empty input dataset but was given an empty input.") - val requiredSamples = math.max(numBins * numBins, 10000) - val fraction = math.min(requiredSamples / dataset.count(), 1.0) + val requiredSamples = math.max(numBins * numBins, minSamplesRequired) + val fraction = math.min(requiredSamples.toDouble / dataset.count(), 1.0) dataset.sample(withReplacement = false, fraction, new XORShiftRandom(seed).nextInt()).collect() } http://git-wip-us.apache.org/repos/asf/spark/blob/f0cc511e/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index 0d1b940..16c5529 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -71,6 +71,26 @@ class QuantileDiscretizerSuite } } + test("Test splits on dataset larger than minSamplesRequired") { + val sqlCtx = SQLContext.getOrCreate(sc) + import sqlCtx.implicits._ + + val datasetSize = QuantileDiscretizer.minSamplesRequired + 1 + val numBuckets = 5 + val df = sc.parallelize((1.0 to datasetSize by 1.0).map(Tuple1.apply)).toDF("input") + val discretizer = new QuantileDiscretizer() + .setInputCol("input") + .setOutputCol("result") + .setNumBuckets(numBuckets) + .setSeed(1) + + val result = discretizer.fit(df).transform(df) + val observedNumBuckets = result.select("result").distinct.count + + assert(observedNumBuckets === numBuckets, + "Observed number of buckets does not equal expected number of buckets.") + } + test("read/write") { val t = new QuantileDiscretizer() .setInputCol("myInputCol") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org