Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/21195#discussion_r185984500 --- Diff: mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala --- @@ -182,6 +184,40 @@ class BisectingKMeansSuite model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) } + + test("BisectingKMeans with Array input") { + val featuresColNameD = "array_double_features" + val featuresColNameF = "array_float_features" + val doubleUDF = udf { (features: Vector) => + val featureArray = Array.fill[Double](features.size)(0.0) + features.foreachActive((idx, value) => featureArray(idx) = value.toFloat) + featureArray + } + val floatUDF = udf { (features: Vector) => + val featureArray = Array.fill[Float](features.size)(0.0f) + features.foreachActive((idx, value) => featureArray(idx) = value.toFloat) + featureArray + } + val newdatasetD = dataset.withColumn(featuresColNameD, doubleUDF(col("features"))) + .drop("features") + val newdatasetF = dataset.withColumn(featuresColNameF, floatUDF(col("features"))) + .drop("features") + assert(newdatasetD.schema(featuresColNameD).dataType.equals(new ArrayType(DoubleType, false))) + assert(newdatasetF.schema(featuresColNameF).dataType.equals(new ArrayType(FloatType, false))) + + val bkmD = new BisectingKMeans() + .setK(k).setMaxIter(1).setFeaturesCol(featuresColNameD).setSeed(1) + val bkmF = new BisectingKMeans() + .setK(k).setMaxIter(1).setFeaturesCol(featuresColNameF).setSeed(1) + val modelD = bkmD.fit(newdatasetD) + val modelF = bkmF.fit(newdatasetF) + val transformedD = modelD.transform(newdatasetD) + val transformedF = modelF.transform(newdatasetF) + val predictDifference = transformedD.select("prediction") + .except(transformedF.select("prediction")) + assert(predictDifference.count() == 0) --- End diff -- This only verifies it handles `Array[Double]` and `Array[Float]` the same way. But it doesn't guarantee that the result is correct. 
We can define a method that takes a dataset, applies one iteration, and returns the cost. ~~~scala def trainAndComputeCost(dataset: DataFrame): Double = { val model = new BisectingKMeans() .setK(k).setMaxIter(1).setSeed(1) .fit(dataset) model.computeCost(dataset) } val trueCost = trainAndComputeCost(dataset) val floatArrayCost = trainAndComputeCost(newDatasetF) assert(floatArrayCost === trueCost) val doubleArrayCost = trainAndComputeCost(newDatasetD) assert(doubleArrayCost === trueCost) ~~~ We can map the original dataset to single precision to have an exact match. Or we can test equality with a threshold. See https://github.com/apache/spark/blob/master/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org