Repository: spark Updated Branches: refs/heads/branch-1.6 5f7440b25 -> d59a08f7c
Revert "[SPARK-13444][MLLIB] QuantileDiscretizer chooses bad splits on large DataFrames" This reverts commit cb869a143d338985c3d99ef388dd78b1e3d90a73. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d59a08f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d59a08f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d59a08f7 Branch: refs/heads/branch-1.6 Commit: d59a08f7c1c455d86e7ee3d6522a3e9c55f9ee02 Parents: 5f7440b Author: Xiangrui Meng <m...@databricks.com> Authored: Thu Feb 25 12:28:03 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Feb 25 12:28:03 2016 -0800 ---------------------------------------------------------------------- .../spark/ml/feature/QuantileDiscretizer.scala | 11 ++--------- .../ml/feature/QuantileDiscretizerSuite.scala | 20 -------------------- 2 files changed, 2 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/d59a08f7/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index cd5085a..7bf67c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -97,13 +97,6 @@ final class QuantileDiscretizer(override val uid: String) @Since("1.6.0") object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] with Logging { - - /** - * Minimum number of samples required for finding splits, regardless of number of bins. If - * the dataset has fewer rows than this value, the entire dataset will be used. - */ - private[spark] val minSamplesRequired: Int = 10000 - /** * Sampling from the given dataset to collect quantile statistics. */ @@ -111,8 +104,8 @@ object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] wi val totalSamples = dataset.count() require(totalSamples > 0, "QuantileDiscretizer requires non-empty input dataset but was given an empty input.") - val requiredSamples = math.max(numBins * numBins, minSamplesRequired) - val fraction = math.min(requiredSamples.toDouble / dataset.count(), 1.0) + val requiredSamples = math.max(numBins * numBins, 10000) + val fraction = math.min(requiredSamples / dataset.count(), 1.0) dataset.sample(withReplacement = false, fraction, new XORShiftRandom().nextInt()).collect() } http://git-wip-us.apache.org/repos/asf/spark/blob/d59a08f7/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index 32bfa43..3a4f6d2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -71,26 +71,6 @@ class QuantileDiscretizerSuite } } - test("Test splits on dataset larger than minSamplesRequired") { - val sqlCtx = SQLContext.getOrCreate(sc) - import sqlCtx.implicits._ - - val datasetSize = QuantileDiscretizer.minSamplesRequired + 1 - val numBuckets = 5 - val df = sc.parallelize((1.0 to datasetSize by 1.0).map(Tuple1.apply)).toDF("input") - val discretizer = new QuantileDiscretizer() - .setInputCol("input") - .setOutputCol("result") - .setNumBuckets(numBuckets) - .setSeed(1) - - val result = discretizer.fit(df).transform(df) - val observedNumBuckets = result.select("result").distinct.count - - assert(observedNumBuckets === numBuckets, - "Observed number of buckets does not equal expected number of buckets.") - } - test("read/write") { val t = new QuantileDiscretizer() .setInputCol("myInputCol") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org