Github user srowen commented on a diff in the pull request: https://github.com/apache/spark/pull/19340#discussion_r142001976 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala --- @@ -546,10 +574,88 @@ object KMeans { .run(data) } + private[spark] def validateInitMode(initMode: String): Boolean = { + initMode match { + case KMeans.RANDOM => true + case KMeans.K_MEANS_PARALLEL => true + case _ => false + } + } + private[spark] def validateDistanceMeasure(distanceMeasure: String): Boolean = { + distanceMeasure match { + case DistanceSuite.EUCLIDEAN => true + case DistanceSuite.COSINE => true + case _ => false + } + } +} + +/** + * A vector with its norm for fast distance computation. + * + * @see [[org.apache.spark.mllib.clustering.KMeans#fastSquaredDistance]] + */ +private[clustering] +class VectorWithNorm(val vector: Vector, val norm: Double) extends Serializable { + + def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0)) + + def this(array: Array[Double]) = this(Vectors.dense(array)) + + /** Converts the vector to a dense vector. */ + def toDense: VectorWithNorm = new VectorWithNorm(Vectors.dense(vector.toArray), norm) +} + + +private[spark] abstract class DistanceSuite extends Serializable { + + /** + * Returns the index of the closest center to the given point, as well as the squared distance. + */ + def findClosest( --- End diff -- It seems like this should have a default implementation then that does the obvious thing
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org