[ https://issues.apache.org/jira/browse/SPARK-7276?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Reynold Xin updated SPARK-7276: ------------------------------- Description: The code snippet demonstrates the problem. {code} val sparkConf = new SparkConf().setAppName("Spark Test").setMaster(System.getProperty("spark.master", "local[4]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val custs = Seq( Row(1, "Bob", 21, 80.5), Row(2, "Bobby", 21, 80.5), Row(3, "Jean", 21, 80.5), Row(4, "Fatime", 21, 80.5) ) var fields = List( StructField("id", IntegerType, true), StructField("a", IntegerType, true), StructField("b", StringType, true), StructField("target", DoubleType, false)) val schema = StructType(fields) var rdd = sc.parallelize(custs) var df = sqlContext.createDataFrame(rdd, schema) for (i <- 1 to 200) { val now = System.currentTimeMillis df = df.withColumn("a_new_col_" + i, df("a") + i) println(s"$i -> " + (System.currentTimeMillis - now)) } df.show() {code} was: The code snippet demonstrates the problem. val sparkConf = new SparkConf().setAppName("Spark Test").setMaster(System.getProperty("spark.master", "local[4]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val custs = Seq( Row(1, "Bob", 21, 80.5), Row(2, "Bobby", 21, 80.5), Row(3, "Jean", 21, 80.5), Row(4, "Fatime", 21, 80.5) ) var fields = List( StructField("id", IntegerType, true), StructField("a", IntegerType, true), StructField("b", StringType, true), StructField("target", DoubleType, false)) val schema = StructType(fields) var rdd = sc.parallelize(custs) var df = sqlContext.createDataFrame(rdd, schema) for (i <- 1 to 200) { val now = System.currentTimeMillis df = df.withColumn("a_new_col_" + i, df("a") + i) println(s"$i -> " + (System.currentTimeMillis - now)) } df.show() > withColumn is very slow on dataframe with large number of columns > ----------------------------------------------------------------- > > Key: SPARK-7276 > URL: https://issues.apache.org/jira/browse/SPARK-7276 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.3.1 > Reporter: Alexandre CLEMENT > > The code snippet demonstrates the problem. > {code} > val sparkConf = new SparkConf().setAppName("Spark > Test").setMaster(System.getProperty("spark.master", "local[4]")) > val sc = new SparkContext(sparkConf) > val sqlContext = new SQLContext(sc) > import sqlContext.implicits._ > val custs = Seq( > Row(1, "Bob", 21, 80.5), > Row(2, "Bobby", 21, 80.5), > Row(3, "Jean", 21, 80.5), > Row(4, "Fatime", 21, 80.5) > ) > var fields = List( > StructField("id", IntegerType, true), > StructField("a", IntegerType, true), > StructField("b", StringType, true), > StructField("target", DoubleType, false)) > val schema = StructType(fields) > var rdd = sc.parallelize(custs) > var df = sqlContext.createDataFrame(rdd, schema) > for (i <- 1 to 200) { > val now = System.currentTimeMillis > df = df.withColumn("a_new_col_" + i, df("a") + i) > println(s"$i -> " + (System.currentTimeMillis - now)) > } > df.show() > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org