[ 
https://issues.apache.org/jira/browse/SPARK-24481?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Andrew Conegliano updated SPARK-24481:
--------------------------------------
    Description: 
Similar to other "grows beyond 64 KB" errors.  Happens with large case 
statement:
{code:java}
// Databricks notebook source
import org.apache.spark.sql.functions._
import scala.collection.mutable
import org.apache.spark.sql.Column

var rdd = sc.parallelize(Array("""{
"event":
{
"timestamp": 1521086591110,
"event_name": "yu",
"page":
{
"page_url": "https://",
"page_name": "es"
},
"properties":
{
"id": "87",
"action": "action",
"navigate_action": "navigate_action"
}
}
}
"""))

var df = spark.read.json(rdd)
df = 
df.select("event.properties.id","event.timestamp","event.page.page_url","event.properties.action","event.page.page_name","event.event_name","event.properties.navigate_action")
.toDF("id","event_time","url","action","page_name","event_name","navigation_action")

var a = "case "
for(i <- 1 to 300){
  a = a + s"when action like '$i%' THEN '$i' "
}
a = a + " else null end as task_id"

val expression = expr(a)

df = df.filter("id is not null and id <> '' and event_time is not null")

val transformationExpressions: mutable.HashMap[String, Column] = 
mutable.HashMap(
"action" -> expr("coalesce(action, navigation_action) as action"),
"task_id" -> expression
)

for((col, expr) <- transformationExpressions)
df = df.withColumn(col, expr)

df = df.filter("(action is not null and action <> '') or (page_name is not null 
and page_name <> '')")

df.show
{code}
Log file is attached

  was:
Similar to other "grows beyond 64 KB" errors.  Happens with large case 
statement:
{code:java}
// Databricks notebook source
import org.apache.spark.sql.functions._
import scala.collection.mutable
import org.apache.spark.sql.Column

var rdd = sc.parallelize(Array("""{
"event":
{
"timestamp": 1521086591110,
"event_name": "yu",
"page":
{
"page_url": "https://",
"page_name": "es"
},
"properties":
{
"id": "87",
"action": "action",
"navigate_action": "navigate_action"
}
}
}
"""))

var df = spark.read.json(rdd)
df = 
df.select("event.properties.id","event.timestamp","event.page.page_url","event.properties.action","event.page.page_name","event.event_name","event.properties.navigate_action")
.toDF("id","event_time","url","action","page_name","event_name","navigation_action")

var a = "case "
for(i <- 1 to 300)
a = a + s"when action like '$i%' THEN '$i' "
a = a + " else null end as task_id"

val expression = expr(a)

df = df.filter("id is not null and id <> '' and event_time is not null")

val transformationExpressions: mutable.HashMap[String, Column] = 
mutable.HashMap(
"action" -> expr("coalesce(action, navigation_action) as action"),
"task_id" -> expression
)

for((col, expr) <- transformationExpressions)
df = df.withColumn(col, expr)

df = df.filter("(action is not null and action <> '') or (page_name is not null 
and page_name <> '')")

df.show
{code}
Log file is attached


> GeneratedIteratorForCodegenStage1 grows beyond 64 KB
> ----------------------------------------------------
>
>                 Key: SPARK-24481
>                 URL: https://issues.apache.org/jira/browse/SPARK-24481
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.3.0
>         Environment: Emr 5.13.0
>            Reporter: Andrew Conegliano
>            Priority: Major
>         Attachments: log4j-active(1).log
>
>
> Similar to other "grows beyond 64 KB" errors.  Happens with large case 
> statement:
> {code:java}
> // Databricks notebook source
> import org.apache.spark.sql.functions._
> import scala.collection.mutable
> import org.apache.spark.sql.Column
> var rdd = sc.parallelize(Array("""{
> "event":
> {
> "timestamp": 1521086591110,
> "event_name": "yu",
> "page":
> {
> "page_url": "https://",
> "page_name": "es"
> },
> "properties":
> {
> "id": "87",
> "action": "action",
> "navigate_action": "navigate_action"
> }
> }
> }
> """))
> var df = spark.read.json(rdd)
> df = 
> df.select("event.properties.id","event.timestamp","event.page.page_url","event.properties.action","event.page.page_name","event.event_name","event.properties.navigate_action")
> .toDF("id","event_time","url","action","page_name","event_name","navigation_action")
> var a = "case "
> for(i <- 1 to 300){
>   a = a + s"when action like '$i%' THEN '$i' "
> }
> a = a + " else null end as task_id"
> val expression = expr(a)
> df = df.filter("id is not null and id <> '' and event_time is not null")
> val transformationExpressions: mutable.HashMap[String, Column] = 
> mutable.HashMap(
> "action" -> expr("coalesce(action, navigation_action) as action"),
> "task_id" -> expression
> )
> for((col, expr) <- transformationExpressions)
> df = df.withColumn(col, expr)
> df = df.filter("(action is not null and action <> '') or (page_name is not 
> null and page_name <> '')")
> df.show
> {code}
> Log file is attached



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org

Reply via email to