[ https://issues.apache.org/jira/browse/SPARK-24481?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16504965#comment-16504965 ]
Andrew Conegliano commented on SPARK-24481:
-------------------------------------------

Thanks Marco. I forgot to mention: this error doesn't happen in 2.0.2 or 2.2.0. And on 2.3.0, even though the error is raised, the code still runs, because Spark falls back by disabling whole-stage codegen for the failing plan. The main problem is that in a Spark Streaming context the error is logged for every message, so the logs fill the disk very quickly.
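For anyone needing a stopgap in the meantime: since the fallback path already runs the query correctly, it may be possible to silence the per-message compile error by turning whole-stage codegen off up front. A minimal sketch, assuming the loss of whole-stage codegen performance is acceptable for the affected job (a workaround suggestion, not the fix for this ticket):

{code:java}
// Disable whole-stage codegen for this session, so the 64 KB method-size
// limit is never hit for this plan. The query then runs on the slower
// non-fused path, but the compile error no longer floods the logs on
// every streaming message.
spark.conf.set("spark.sql.codegen.wholeStage", "false")
{code}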
> GeneratedIteratorForCodegenStage1 grows beyond 64 KB
> ----------------------------------------------------
>
>                 Key: SPARK-24481
>                 URL: https://issues.apache.org/jira/browse/SPARK-24481
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.3.0
>        Environment: EMR 5.13.0 and Databricks Cloud 4.0
>            Reporter: Andrew Conegliano
>            Priority: Major
>        Attachments: log4j-active(1).log
>
> Similar to other "grows beyond 64 KB" errors. Happens with a large case statement:
> {code:java}
> import org.apache.spark.sql.functions._
> import scala.collection.mutable
> import org.apache.spark.sql.Column
>
> // Single JSON record used to build the test DataFrame
> var rdd = sc.parallelize(Array("""{
>   "event":
>   {
>     "timestamp": 1521086591110,
>     "event_name": "yu",
>     "page":
>     {
>       "page_url": "https://",
>       "page_name": "es"
>     },
>     "properties":
>     {
>       "id": "87",
>       "action": "action",
>       "navigate_action": "navigate_action"
>     }
>   }
> }
> """))
>
> var df = spark.read.json(rdd)
> df = df
>   .select("event.properties.id", "event.timestamp", "event.page.page_url",
>     "event.properties.action", "event.page.page_name", "event.event_name",
>     "event.properties.navigate_action")
>   .toDF("id", "event_time", "url", "action", "page_name", "event_name", "navigation_action")
>
> // Build a 300-branch CASE expression; this is what pushes the generated
> // method past the 64 KB bytecode limit
> var a = "case "
> for (i <- 1 to 300) {
>   a = a + s"when action like '$i%' THEN '$i' "
> }
> a = a + " else null end as task_id"
> val expression = expr(a)
>
> df = df.filter("id is not null and id <> '' and event_time is not null")
>
> val transformationExpressions: mutable.HashMap[String, Column] = mutable.HashMap(
>   "action" -> expr("coalesce(action, navigation_action) as action"),
>   "task_id" -> expression
> )
>
> for ((col, expr) <- transformationExpressions)
>   df = df.withColumn(col, expr)
>
> df = df.filter("(action is not null and action <> '') or (page_name is not null and page_name <> '')")
> df.show
> {code}
> Exception:
> {code:java}
> 18/06/07 01:06:34 ERROR CodeGenerator: failed to compile: org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass": Code of method "project_doConsume$(Lorg/apache/spark/sql/catalyst/expressions/GeneratedClass$GeneratedIteratorForCodegenStage1;Lorg/apache/spark/sql/catalyst/InternalRow;)V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1" grows beyond 64 KB
> org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass": Code of method "project_doConsume$(Lorg/apache/spark/sql/catalyst/expressions/GeneratedClass$GeneratedIteratorForCodegenStage1;Lorg/apache/spark/sql/catalyst/InternalRow;)V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1" grows beyond 64 KB
> at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:361)
> at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:234)
> at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:446)
> at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:313)
> at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:235)
> at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:204)
> at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:80)
> at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:1444)
> at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1523)
> at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1520)
> at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3522)
> at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2315)
> at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2278)
> at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2193)
> at com.google.common.cache.LocalCache.get(LocalCache.java:3932)
> at com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3936)
> at com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4806)
> at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.compile(CodeGenerator.scala:1392)
> at org.apache.spark.sql.execution.WholeStageCodegenExec.liftedTree1$1(WholeStageCodegenExec.scala:579)
> at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:578)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:135)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$3.apply(SparkPlan.scala:167)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:164)
> at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
> at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:61)
> at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:70)
> at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:45)
> at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2759)
> at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3331)
> at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2488)
> at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2488)
> at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3315)
> at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:88)
> at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:124)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3314)
> at org.apache.spark.sql.Dataset.head(Dataset.scala:2488)
> at org.apache.spark.sql.Dataset.take(Dataset.scala:2702)
> at org.apache.spark.sql.Dataset.showString(Dataset.scala:258)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:727)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:686)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:695)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-687647945500165:1)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-687647945500165:51)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-687647945500165:53)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-687647945500165:55)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw$$iw.<init>(command-687647945500165:57)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw$$iw.<init>(command-687647945500165:59)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw$$iw.<init>(command-687647945500165:61)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$$iw.<init>(command-687647945500165:63)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read.<init>(command-687647945500165:65)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$.<init>(command-687647945500165:69)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$read$.<clinit>(command-687647945500165)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$eval$.$print$lzycompute(<notebook>:7)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$eval$.$print(<notebook>:6)
> at line7b2cd01e0857498cbfa87d4dfaadb85d46.$eval.$print(<notebook>)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
> at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
> at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
> at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
> at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
> at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
> at com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:186)
> at com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply$mcV$sp(ScalaDriverLocal.scala:189)
> at com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:189)
> at com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:189)
> at com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:500)
> at com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:456)
> at com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:189)
> at com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:249)
> at com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:229)
> at com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)
> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
> at com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)
> at com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:43)
> at com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:221)
> at com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:43)
> at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:229)
> at com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)
> at com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)
> at scala.util.Try$.apply(Try.scala:192)
> at com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:596)
> at com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)
> at com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:554)
> at com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)
> at com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:348)
> at com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:215)
> at java.lang.Thread.run(Thread.java:748)
> {code}
>
> Log file is attached
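Separately from the bug itself, one way to restructure the 300-branch CASE in the reproduction above is to move the mapping out of generated code and into data: a broadcast join against a small mapping DataFrame keeps each generated method small. The following is an editorial sketch, not part of the original report; it is not exactly equivalent to the chained LIKEs (which take the first branch that matches in order), and the regex is an assumption about what the `'$i%'` patterns are meant to capture:

{code:java}
import org.apache.spark.sql.functions._
import spark.implicits._

// Hypothetical alternative to the 300-branch CASE: materialize the
// prefix -> task_id pairs as a tiny DataFrame and broadcast-join it,
// so Catalyst never has to emit one enormous projection method.
val taskIds = (1 to 300).map(i => (i.toString, i.toString)).toDF("prefix", "task_id")

val withTaskId = df
  // assumed prefix extraction; regexp_extract yields "" when there is no match,
  // and the left join then leaves task_id null, mirroring the CASE's else branch
  .withColumn("prefix", regexp_extract($"action", "^([0-9]+)", 1))
  .join(broadcast(taskIds), Seq("prefix"), "left_outer")
  .drop("prefix")
{code}

A side benefit of this shape is that the mapping can grow without regenerating larger code, since it lives in a table rather than in the expression tree.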