Github user viirya commented on the issue:

    https://github.com/apache/spark/pull/19229
  
    I ran the following test code to benchmark the RDD version and the DataFrame version with 
this `ImputerModel` change:
    
        import org.apache.spark.ml.feature._
        import org.apache.spark.sql.{DataFrame, Row}
        import org.apache.spark.sql.types._
        import spark.implicits._
        import scala.util.Random
    
        /**
         * Builds a cached DataFrame of 10000 rows by 100 nullable Double columns named
         * c0 .. c99, filled with pseudo-random values drawn from `Random.nextDouble`.
         *
         * The generator is created inside each partition and seeded with
         * `seed + partition index`. A single driver-side `Random` captured by the closure
         * would be serialized into every task with the same state, so every partition
         * would replay the same sequence and produce duplicate rows.
         *
         * @return the DataFrame, already materialized in the cache by a `count()`
         */
        def genData(): DataFrame = {
          val seed = 123L
          val n = 10000
          val m = 100
          val rows = sc.parallelize(1 to n).mapPartitionsWithIndex { (partition, ids) =>
            val random = new Random(seed + partition)
            ids.map(_ => Row(Array.fill(m)(random.nextDouble()): _*))
          }
          val struct = new StructType(
            Array.tabulate(m)(i => StructField(s"c$i", DoubleType, nullable = true)))
          val df = spark.createDataFrame(rows, struct)
          // Force the cache to be populated here so callers that time work on the
          // returned DataFrame do not also pay for generating the data.
          df.cache()
          df.count()
          df
        }
    
        // Number of timed repetitions averaged for each (strategy, k) configuration.
        // Kept in one place so the loop bound and the divisor cannot drift apart.
        val runs = 10

        // For every imputation strategy and every number k of imputed columns, print
        // (strategy, k, average seconds), where the time of one repetition is the
        // wall-clock duration of fit() plus the duration of transform().count.
        for (strategy <- Seq("mean", "median"); k <- Seq(1, 10, 100)) {
          val imputer = new Imputer()
            .setStrategy(strategy)
            .setInputCols(Array.tabulate(k)(i => s"c$i"))
            .setOutputCols(Array.tabulate(k)(i => s"o$i"))

          val durations = (0 until runs).map { _ =>
            val df = genData()

            val start = System.nanoTime()
            val model = imputer.fit(df)
            val end = System.nanoTime()
            // Release the cached input outside the timed region; otherwise every
            // repetition leaves another cached DataFrame behind.
            df.unpersist()

            val df2 = genData()

            val start2 = System.nanoTime()
            model.transform(df2).count
            val end2 = System.nanoTime()
            df2.unpersist()

            ((end - start) + (end2 - start2)) / 1e9
          }
          println((strategy, k, durations.sum / runs))
        }



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to