[ 
https://issues.apache.org/jira/browse/SPARK-15258?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15297896#comment-15297896
 ] 

Kazuaki Ishizaki commented on SPARK-15258:
------------------------------------------

{{commit d642b273544bb77ef7f584326aa2d214649ac61b}} as of May 23 can reproduce 
this issue.

The following smaller program generates code for 
{{agg_doAggregateWithoutKey()}} that exceeds 64KB during whole-stage codegen.

{code}
package org.apache.spark.sql

import org.apache.spark._
import org.apache.spark.sql._

object NestedCases {
  // Minimal reproducer for SPARK-15258: each qNN CASE expression below refers to
  // the column produced by the previous one (s10 -> s20 -> s30 -> ...), so each
  // chained selectExpr inlines the whole preceding expression tree and the
  // generated whole-stage-codegen method grows past the JVM's 64KB method limit.
  //
  // NOTE(review): this snippet appears to have been line-wrapped by the mail
  // client (e.g. the break after "THEN " inside the triple-quoted strings, and
  // the "new " / "SparkConf()" split in main). The breaks inside """...""" only
  // add newlines to the SQL text, which CASE WHEN tolerates; do not "fix" the
  // wrapping without checking against the attached NestedCases.scala.

  // First CASE level: derives s10 from the base columns b0/s0.
  val q10 =
    """CASE
      | WHEN b0 = true THEN 'EEE'
      | WHEN b0 = false AND s0 LIKE '%peptidoglycan%' THEN 'X1'
      | ELSE 'UNKNOWN'
      |END AS s10"""
      .stripMargin

  // Second CASE level: conditions reference s10, so the q10 expression is
  // folded into this one by the optimizer/codegen.
  val q20 =
    """CASE
      |
      | WHEN s10 = 'EEE' AND (s0 LIKE '%chitin%' OR s0 LIKE '%chitosan%') THEN 
'DDD2'
      | WHEN s10 = 'EEE' AND s1 = 'autotroph' AND s0 LIKE '%cellulose%' THEN 
'DDD1'
      | WHEN s10 = 'EEE' AND b1 = true AND s1 = 'heterotroph' THEN 'DDD0'
      | ELSE 'UNKNOWN'
      |END AS s20"""
      .stripMargin

  // Third CASE level: references s20.
  val q30 =
    """CASE
      | WHEN s20 = 'DDD0' AND b2 = true THEN 'CCC'
      | WHEN s20 = 'DDD1' AND s2 LIKE '%flowers%' THEN 'X2'
      | WHEN s20 = 'DDD2' AND s0 LIKE '%chitosan%' THEN 'X3'
      | ELSE 'UNKNOWN'
      |END AS s30"""
      .stripMargin

  // Fourth CASE level: references s30.
  val q40 =
    """CASE
      | WHEN s30 = 'CCC' AND s3 = 'fur' THEN 'BBB'
      | WHEN s30 = 'CCC' AND s3 = 'feathers' THEN 'X4'
      | WHEN s30 = 'CCC' AND s3 = 'scales' THEN 'X5'
      | WHEN s30 = 'CCC' AND s3 = 'permeable skin' THEN 'X6'
      | ELSE 'UNKNOWN'
      |END AS s40"""
      .stripMargin

  // Fifth and final CASE level: references s40 and produces s50, which the
  // filter in main forces Spark to evaluate.
  val q50 =
    """CASE
      | WHEN s40 = 'BBB' AND s2 LIKE '%upper and lower incisors%' THEN 'AAA'
      | WHEN s40 = 'BBB' AND s2 LIKE '%webbed wings%' THEN 'X7'
      | WHEN s40 = 'BBB' AND s2 LIKE '%opposable thumbs%' THEN 'X8'
      | ELSE 'UNKNOWN'
      |END AS `s50`"""
      .stripMargin

  // A single input row is enough to trigger the codegen-size failure.
  // NOTE(review): the Lifeform case class is not defined in this snippet —
  // presumably it lives in the attached NestedCases.scala; verify its field
  // order (three Booleans, two Strings, two nullable columns) against the
  // b0/b1/b2/s0..s3 columns the CASE expressions reference.
  val rows = Seq(Lifeform(false, false, false, "", "", null, null))

  def main(args: Array[String]) : Unit = {
    // Local single-threaded Spark context; SQLContext API dates this to the
    // 1.x line (matches the issue's "Affects Versions: 1.6.1").
    val sqlContext =
      new SQLContext(new SparkContext(new 
SparkConf().setMaster("local").setAppName("Nested Cases")))

    sqlContext.createDataFrame(rows)
      .cache() // Need this to stop the optimizer from pre-evaluating everything
      // and trimming the case statements.
      // Alternatively loading rows from a file seems to have same effect
      .selectExpr("*", q10).selectExpr("*", q20).selectExpr("*", q30)
      .selectExpr("*", q40).selectExpr("*", q50)
      .filter("s50 = 'AAA'") // force spark to evaluate case statements
      .count()
  }
}
{code}



> Nested/Chained case statements generate codegen over 64k exception
> ------------------------------------------------------------------
>
>                 Key: SPARK-15258
>                 URL: https://issues.apache.org/jira/browse/SPARK-15258
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 1.6.1
>            Reporter: Jonathan Gray
>         Attachments: NestedCases.scala
>
>
> Nested/chained case-when expressions generate codegen that goes beyond the 
> 64k method size limit, causing an exception.
> A test attached demonstrates this behaviour.
> I'd like to try and fix this but don't really know the best place to start.  
> Ideally, I'd like to avoid the codegen fallback as with large volumes this 
> hurts performance.
> This is similar(ish) to SPARK-13242 but I'd like to see if there are any 
> alternatives to the codegen fallback approach.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to