[ https://issues.apache.org/jira/browse/SPARK-11596?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15019158#comment-15019158 ]
Cristian commented on SPARK-11596: ---------------------------------- You are right that this does not reproduce without caching. It does get slower because it needs to recompute the chain every time, but without caching it actually finishes all 100 iterations in a reasonable amount of time. With caching it virtually stops after about 20 iterations and spends all its time in toString. I suspect this is because the query plans are different and there is some query plan triggered by caching that has a very expensive toString? > SQL execution very slow for nested query plans because of > DataFrame.withNewExecutionId > -------------------------------------------------------------------------------------- > > Key: SPARK-11596 > URL: https://issues.apache.org/jira/browse/SPARK-11596 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.5.1 > Reporter: Cristian > Attachments: screenshot-1.png > > > For nested query plans like a recursive unionAll, withNewExecutionId is > extremely slow, likely because of repeated string concatenation in > QueryPlan.simpleString > Test case: > {code} > (1 to 100).foldLeft[Option[DataFrame]] (None) { (curr, idx) => > println(s"PROCESSING >>>>>>>>>>> $idx") > val df = sqlContext.sparkContext.parallelize((0 to > 10).zipWithIndex).toDF("A", "B") > val union = curr.map(_.unionAll(df)).getOrElse(df) > union.cache() > println(">>" + union.count) > //union.show() > Some(union) > } > {code} > Stack trace: > {quote} > scala.collection.TraversableOnce$class.addString(TraversableOnce.scala:320) > scala.collection.AbstractIterator.addString(Iterator.scala:1157) > scala.collection.TraversableOnce$class.mkString(TraversableOnce.scala:286) > scala.collection.AbstractIterator.mkString(Iterator.scala:1157) > scala.collection.TraversableOnce$class.mkString(TraversableOnce.scala:288) > scala.collection.AbstractIterator.mkString(Iterator.scala:1157) > org.apache.spark.sql.catalyst.trees.TreeNode.argString(TreeNode.scala:364) > 
org.apache.spark.sql.catalyst.trees.TreeNode.simpleString(TreeNode.scala:367) > org.apache.spark.sql.catalyst.plans.QueryPlan.simpleString(QueryPlan.scala:168) > org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:401) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > scala.collection.immutable.List.foreach(List.scala:318) > org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > scala.collection.immutable.List.foreach(List.scala:318) > org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:403) > scala.collection.immutable.List.foreach(List.scala:318) > org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:403) > org.apache.spark.sql.catalyst.trees.TreeNode.treeString(TreeNode.scala:372) > org.apache.spark.sql.catalyst.trees.TreeNode.toString(TreeNode.scala:369) > org.apache.spark.sql.SQLContext$QueryExecution.stringOrError(SQLContext.scala:936) > org.apache.spark.sql.SQLContext$QueryExecution.toString(SQLContext.scala:949) > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:52) > org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:1903) > org.apache.spark.sql.DataFrame.collect(DataFrame.scala:1384) > org.apache.spark.sql.DataFrame.count(DataFrame.scala:1402) > {quote} -- This message was sent by Atlassian JIRA (v6.3.4#6332) 
--------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org