Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/21195#discussion_r185983646 --- Diff: mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala --- @@ -182,6 +184,40 @@ class BisectingKMeansSuite model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) } + + test("BisectingKMeans with Array input") { + val featuresColNameD = "array_double_features" + val featuresColNameF = "array_float_features" + val doubleUDF = udf { (features: Vector) => + val featureArray = Array.fill[Double](features.size)(0.0) + features.foreachActive((idx, value) => featureArray(idx) = value.toFloat) + featureArray + } + val floatUDF = udf { (features: Vector) => + val featureArray = Array.fill[Float](features.size)(0.0f) + features.foreachActive((idx, value) => featureArray(idx) = value.toFloat) + featureArray + } + val newdatasetD = dataset.withColumn(featuresColNameD, doubleUDF(col("features"))) + .drop("features") --- End diff -- * Unnecessary to drop `features`. Or you can simply replace the features column: ~~~scala val newdatasetD = dataset.withColumn(FEATURES, doubleUDF(col(FEATURES))) ~~~
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org