Github user holdenk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21942#discussion_r216021077

    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala ---
    @@ -160,15 +160,88 @@ class StandardScalerModel private[ml] (

       @Since("2.0.0")
       override def transform(dataset: Dataset[_]): DataFrame = {
         transformSchema(dataset.schema, logging = true)
    -    val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean))
    -
    -    // TODO: Make the transformer natively in ml framework to avoid extra conversion.
    -    val transformer: Vector => Vector = v => scaler.transform(OldVectors.fromML(v)).asML
    +    val transformer: Vector => Vector = v => transform(v)

         val scale = udf(transformer)
         dataset.withColumn($(outputCol), scale(col($(inputCol))))
       }

    +  /**
    +   * Since `shift` will be only used in `withMean` branch, we have it as
    +   * `lazy val` so it will be evaluated in that branch. Note that we don't
    +   * want to create this array multiple times in `transform` function.
    +   */
    +  private lazy val shift: Array[Double] = mean.toArray
    --- End diff --

    How does this interplay with serialization? Would it make sense to evaluate this before we serialize the UDF so it isn't done on each worker?
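For background on the question above: in Scala 2, a lazy val's computed value and its initialization flag are ordinary (non-transient) fields, so whether `shift` ships with the serialized UDF or is re-evaluated on each worker depends on whether it was forced on the driver first. Below is a minimal, Spark-free sketch of that behavior; `ScalerLike`, `roundTrip`, and `LazyValSerDemo` are hypothetical names standing in for the model and for closure serialization, not code from the PR.

    import java.io._

    // Hypothetical stand-in for the model: a lazy val derived from model
    // state, captured by a serialized closure (as the UDF above captures
    // `this` through `v => transform(v)`).
    class ScalerLike(val mean: Array[Double]) extends Serializable {
      lazy val shift: Array[Double] = {
        println("evaluating shift")  // marks where evaluation actually happens
        mean.clone()
      }
    }

    object LazyValSerDemo {
      // Stand-in for shipping a closure to an executor: Java serialization
      // round trip through a byte array.
      def roundTrip[T <: AnyRef](t: T): T = {
        val bytes = new ByteArrayOutputStream()
        val out = new ObjectOutputStream(bytes)
        out.writeObject(t)
        out.close()
        new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
          .readObject().asInstanceOf[T]
      }

      def main(args: Array[String]): Unit = {
        // Not forced before serialization: each deserialized copy (i.e. each
        // worker) evaluates `shift` on first access.
        val unforced = roundTrip(new ScalerLike(Array(1.0, 2.0)))
        unforced.shift             // prints "evaluating shift" after the round trip

        // Forced on the "driver" side first: the evaluated value and its
        // initialization flag are serialized, so the copy never re-evaluates.
        val forced = new ScalerLike(Array(1.0, 2.0))
        forced.shift               // prints once, before serialization
        roundTrip(forced).shift    // no print: the value travelled with the object
      }
    }

Under those assumptions, touching `shift` on the driver before the UDF is serialized would make the evaluated array travel with the closure instead of being rebuilt per worker; in this PR it is a single `mean.toArray` copy, so the per-worker cost is small either way.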