Github user huaxingao commented on a diff in the pull request: https://github.com/apache/spark/pull/19715#discussion_r150450280 --- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala --- @@ -146,4 +146,172 @@ class QuantileDiscretizerSuite val model = discretizer.fit(df) assert(model.hasParent) } + + test("Multiple Columns: Test observed number of buckets and their sizes match expected values") { + val spark = this.spark + import spark.implicits._ + + val datasetSize = 100000 + val numBuckets = 5 + val data1 = Array.range(1, 100001, 1).map(_.toDouble) + val data2 = Array.range(1, 200000, 2).map(_.toDouble) + val data = (0 until 100000).map { idx => + (data1(idx), data2(idx)) + } --- End diff -- Yes. Will change to data1.zip(data2)
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org