[ 
https://issues.apache.org/jira/browse/SPARK-35500?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Yahui Liu updated SPARK-35500:
------------------------------
    Description: 
Reproduce steps:
 # create a new table with array type: create table test_code_gen(a array<int>);
 # Add 
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator = 
DEBUG to log4j.properties;
 # Enter spark-shell, fire a query: spark.sql("select * from 
test_code_gen").collect
 # Every time Dataset.collect is called, a SpecificSafeProjection class is
generated, but the code for the class cannot be reused because the id suffix
of two variables in the generated class changes on every call:
MapObjects_loopValue and MapObjects_loopIsNull. So even though the previously
generated class has been cached, the new code does not match the cache key and
has to be compiled again, which costs some time.
{code:java}
object MapObjects {
  private val curId = new java.util.concurrent.atomic.AtomicInteger()
 val id = curId.getAndIncrement()
 val loopValue = s"MapObjects_loopValue$id"
 val loopIsNull = if (elementNullable) {
   s"MapObjects_loopIsNull$id"
 } else {
   "false"
 }
{code}
First time run: 
 # 
 # The compilation time increases with the number of columns;
for a wide table, this cost can be more than 2s.
 # 

 

         First time run:

          class SpecificSafeProjection extends 
org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {

                    private int MapObjects_loopValue{color:#FF0000}1{color};
                    private boolean 
MapObjects_loopIsNull{color:#FF0000}1{color};
                    private UTF8String 
MapObjects_loopValue{color:#FF0000}2{color};
                    private boolean 
MapObjects_loopIsNull{color:#FF0000}2{color};

          }

         Second time run:



          class SpecificSafeProjection extends 
org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {

                    private int MapObjects_loopValue{color:#FF0000}3{color};
                    private boolean 
MapObjects_loopIsNull{color:#FF0000}3{color};
                    private UTF8String 
MapObjects_loopValue{color:#FF0000}4{color};
                    private boolean 
MapObjects_loopIsNull{color:#FF0000}4{color};

          }

  was:
Reproduce steps:
 # create a new table with array type: create table test_code_gen(a array<int>);
 # Add 
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator = 
DEBUG to log4j.properties;
 # Enter spark-shell, fire a query: spark.sql("select * from 
test_code_gen").collect
 # Everytime, Dataset.collect is called, SpecificSafeProjection class is 
generated, but the code for the class cannot be reused because everytime the id 
for two variables in the generated class is changed: MapObjects_loopValue and 
MapObjects_loopIsNull. So even the class generated before has been cached, new 
code cannot match the cache key so that new code need to be compiled again 
which cost some time.  

object MapObjects {
 private val curId = new java.util.concurrent.atomic.AtomicInteger()
 # The time cost for compile is increasing with the growth of column number, 
for wide table, this cost can more than 2s. 


> GenerateSafeProjection.generate will generate SpecificSafeProjection class, 
> but if column is array type or map type, the code cannot be reused which 
> impact the query performance
> ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: SPARK-35500
>                 URL: https://issues.apache.org/jira/browse/SPARK-35500
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.1.0
>            Reporter: Yahui Liu
>            Priority: Minor
>              Labels: codegen
>
> Reproduce steps:
>  # create a new table with array type: create table test_code_gen(a 
> array<int>);
>  # Add 
> log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator 
> = DEBUG to log4j.properties;
>  # Enter spark-shell, fire a query: spark.sql("select * from 
> test_code_gen").collect
>  # Every time Dataset.collect is called, a SpecificSafeProjection class is
> generated, but the code for the class cannot be reused because the id suffix
> of two variables in the generated class changes on every call:
> MapObjects_loopValue and MapObjects_loopIsNull. So even though the
> previously generated class has been cached, the new code does not match the
> cache key and has to be compiled again, which costs some time.
> {code:java}
> object MapObjects {
>   private val curId = new java.util.concurrent.atomic.AtomicInteger()
>  val id = curId.getAndIncrement()
>  val loopValue = s"MapObjects_loopValue$id"
>  val loopIsNull = if (elementNullable) {
>    s"MapObjects_loopIsNull$id"
>  } else {
>    "false"
>  }
> {code}
> First time run: 
>  # 
>  # The compilation time increases with the number of columns;
> for a wide table, this cost can be more than 2s.
>  # 
>  
>          First time run:
>           class SpecificSafeProjection extends 
> org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
>                     private int MapObjects_loopValue{color:#FF0000}1{color};
>                     private boolean 
> MapObjects_loopIsNull{color:#FF0000}1{color};
>                     private UTF8String 
> MapObjects_loopValue{color:#FF0000}2{color};
>                     private boolean 
> MapObjects_loopIsNull{color:#FF0000}2{color};
>           }
>          Second time run:
>           class SpecificSafeProjection extends 
> org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
>                     private int MapObjects_loopValue{color:#FF0000}3{color};
>                     private boolean 
> MapObjects_loopIsNull{color:#FF0000}3{color};
>                     private UTF8String 
> MapObjects_loopValue{color:#FF0000}4{color};
>                     private boolean 
> MapObjects_loopIsNull{color:#FF0000}4{color};
>           }



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to