Github user viirya commented on the issue: https://github.com/apache/spark/pull/19229

I ran the test codes to benchmark RDD-version and DataFrame version with this `ImputerModel` change:

import org.apache.spark.ml.feature._
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types._
import spark.implicits._
import scala.util.Random

/**
 * Builds a DataFrame of pseudo-random doubles with columns named `c0` .. `c(m-1)`,
 * marks it for caching and runs `count()` on it before returning, so that data
 * generation happens here rather than inside the caller's timed section.
 *
 * Relies on the `sc` and `spark` values provided by the Spark shell.
 *
 * @param n    number of rows to generate
 * @param m    number of `DoubleType` columns per row
 * @param seed base seed; each partition derives its own generator from it
 * @return the cached DataFrame; the caller is responsible for unpersisting it
 */
def genData(n: Int = 10000, m: Int = 100, seed: Long = 123L): DataFrame = {
  // Create the generator inside each partition. A single driver-side Random captured
  // by the closure is copied to every task in the same state, so every partition
  // would replay the same sequence of values.
  val rows = sc.parallelize(1 to n).mapPartitionsWithIndex { (partition, ids) =>
    val random = new Random(seed + partition)
    ids.map(_ => Row(Array.fill(m)(random.nextDouble()): _*))
  }
  val struct = new StructType(
    Array.range(0, m).map(i => StructField(s"c$i", DoubleType, nullable = true)))
  val df = spark.createDataFrame(rows, struct)
  df.cache()
  df.count()
  df
}

// Number of repetitions averaged for every (strategy, k) combination.
val runs = 10

for (strategy <- Seq("mean", "median"); k <- Seq(1, 10, 100)) {
  val imputer = new Imputer()
    .setStrategy(strategy)
    .setInputCols(Array.range(0, k).map(i => s"c$i"))
    .setOutputCols(Array.range(0, k).map(i => s"o$i"))

  // Seconds spent in fit + transform, summed over all runs. Data generation and
  // cache clean-up are deliberately kept outside the timed sections.
  val totalSeconds = (0 until runs).map { _ =>
    val df = genData()
    val start = System.nanoTime()
    val model = imputer.fit(df)
    val end = System.nanoTime()
    // Release the cached input; otherwise every iteration leaves another cached
    // DataFrame behind and later runs are measured under growing memory pressure.
    df.unpersist(blocking = true)

    val df2 = genData()
    val start2 = System.nanoTime()
    model.transform(df2).count()
    val end2 = System.nanoTime()
    df2.unpersist(blocking = true)

    ((end - start) + (end2 - start2)) / 1e9
  }.sum

  // Prints (strategy, number of imputed columns, average seconds per run).
  println((strategy, k, totalSeconds / runs))
}
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org