Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/17819#discussion_r143724079 --- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala --- @@ -187,6 +188,196 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa } } } + + test("multiple columns: Bucket continuous features, without -inf,inf") { + // Check a set of valid feature values. + val splits = Array(Array(-0.5, 0.0, 0.5), Array(-0.1, 0.3, 0.5)) + val validData1 = Array(-0.5, -0.3, 0.0, 0.2) + val validData2 = Array(0.5, 0.3, 0.0, -0.1) + val expectedBuckets1 = Array(0.0, 0.0, 1.0, 1.0) + val expectedBuckets2 = Array(1.0, 1.0, 0.0, 0.0) + + val data = (0 until validData1.length).map { idx => + (validData1(idx), validData2(idx), expectedBuckets1(idx), expectedBuckets2(idx)) + } + val dataFrame: DataFrame = data.toSeq.toDF("feature1", "feature2", "expected1", "expected2") + + val bucketizer1: Bucketizer = new Bucketizer() + .setInputCols(Array("feature1", "feature2")) + .setOutputCols(Array("result1", "result2")) + .setSplitsArray(splits) + + assert(bucketizer1.isBucketizeMultipleColumns()) + + bucketizer1.transform(dataFrame).select("result1", "expected1", "result2", "expected2") + .collect().foreach { + case Row(r1: Double, e1: Double, r2: Double, e2: Double) => + assert(r1 === e1, + s"The feature value is not correct after bucketing. Expected $e1 but found $r1") + assert(r2 === e2, + s"The feature value is not correct after bucketing. Expected $e2 but found $r2") + } + + // Check for exceptions when using a set of invalid feature values. + val invalidData1: Array[Double] = Array(-0.9) ++ validData1 --- End diff -- Is this type annotation required?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org