Github user WeichenXu123 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17123#discussion_r162703633

    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala ---
    @@ -105,20 +106,21 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
         transformSchema(dataset.schema)
         val (filteredDataset, keepInvalid) = {
           if (getHandleInvalid == Bucketizer.SKIP_INVALID) {
    -        // "skip" NaN option is set, will filter out NaN values in the dataset
    +        // "skip" NaN/NULL option is set, will filter out NaN/NULL values in the dataset
             (dataset.na.drop().toDF(), false)
           } else {
             (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID)
           }
         }

    -    val bucketizer: UserDefinedFunction = udf { (feature: Double) =>
    --- End diff --

    As @cloud-fan suggested, `Option[Double]` is better. :-)
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org