[jira] [Updated] (SPARK-14616) TreeNodeException running Q44 and 58 on Parquet tables
[ https://issues.apache.org/jira/browse/SPARK-14616?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

JESSE CHEN updated SPARK-14616:
-------------------------------
    Description:

{code:title=tpcds q44}
select asceding.rnk, i1.i_product_name best_performing, i2.i_product_name worst_performing
from (select *
      from (select item_sk, rank() over (order by rank_col asc) rnk
            from (select ss_item_sk item_sk, avg(ss_net_profit) rank_col
                  from store_sales ss1
                  where ss_store_sk = 4
                  group by ss_item_sk
                  having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col
                                                     from store_sales
                                                     where ss_store_sk = 4
                                                       and ss_addr_sk is null
                                                     group by ss_store_sk)) V1) V11
      where rnk < 11) asceding,
     (select *
      from (select item_sk, rank() over (order by rank_col desc) rnk
            from (select ss_item_sk item_sk, avg(ss_net_profit) rank_col
                  from store_sales ss1
                  where ss_store_sk = 4
                  group by ss_item_sk
                  having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit) rank_col
                                                     from store_sales
                                                     where ss_store_sk = 4
                                                       and ss_addr_sk is null
                                                     group by ss_store_sk)) V2) V21
      where rnk < 11) descending,
     item i1, item i2
where asceding.rnk = descending.rnk
  and i1.i_item_sk = asceding.item_sk
  and i2.i_item_sk = descending.item_sk
order by asceding.rnk
limit 100;
{code}

{noformat}
bin/spark-sql --driver-memory 10g --verbose --master yarn-client --packages com.databricks:spark-csv_2.10:1.3.0 --executor-memory 4g --num-executors 80 --executor-cores 2 --database hadoopds1g -f q44.sql
{noformat}

{noformat}
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange SinglePartition, None
+- WholeStageCodegen
   :  +- Project [item_sk#0,rank_col#1]
   :     +- Filter havingCondition#219: boolean
   :        +- TungstenAggregate(key=[ss_item_sk#12], functions=[(avg(ss_net_profit#32),mode=Final,isDistinct=false)], output=[havingCondition#219,item_sk#0,rank_col#1])
   :           +- INPUT
   +- Exchange hashpartitioning(ss_item_sk#12,200), None
      +- WholeStageCodegen
         :  +- TungstenAggregate(key=[ss_item_sk#12], functions=[(avg(ss_net_profit#32),mode=Partial,isDistinct=false)], output=[ss_item_sk#12,sum#612,count#613L])
         :     +- Project [ss_item_sk#12,ss_net_profit#32]
         :        +- Filter (ss_store_sk#17 = 4)
         :           +- INPUT
         +- Scan ParquetRelation: hadoopds1g.store_sales[ss_item_sk#12,ss_net_profit#32,ss_store_sk#17] InputPaths: hdfs://bigaperf116.svl.ibm.com:8020/apps/hive/warehouse/hadoopds1g.db/store_sales, PushedFilters: [EqualTo(ss_store_sk,4)]
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:47)
	at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:105)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:60)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.Window.doExecute(Window.scala:288)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.InputAdapter.upstream(WholeStageCodegen.scala:176)
	at org.apache.spark.sql.execution.Filter.upstream(basicOperators.scala:73)
	at org.apache.spark.sql.execution.Project.upstream(basicOperators.scala:35)
	at org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:279)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$exec
{noformat}
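The plan above fails at the top-level Exchange, where the query combines an avg() aggregate whose HAVING clause carries a scalar subquery, a rank() window over that aggregate, and a Parquet scan. A distilled sketch of just that pattern (untested; assumes the same hadoopds1g.store_sales Parquet table and TPC-DS schema as the report) would be:

{code:title=q44_distilled_sketch.py}
# Untested sketch: distills the q44 shape the failing plan is executing --
# a scalar-subquery HAVING over an aggregate, feeding a rank() window.
# Assumes a Hive metastore with the hadoopds1g.store_sales Parquet table.
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
sqlc = HiveContext(sc)
sqlc.sql('use hadoopds1g')

df = sqlc.sql("""
    select item_sk, rank() over (order by rank_col) rnk
    from (select ss_item_sk item_sk, avg(ss_net_profit) rank_col
          from store_sales
          where ss_store_sk = 4
          group by ss_item_sk
          having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit)
                                             from store_sales
                                             where ss_store_sk = 4
                                               and ss_addr_sk is null
                                             group by ss_store_sk)) v
""")
df.show()  # expected to hit the same TreeNodeException at the Exchange
{code}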
[jira] [Updated] (SPARK-14616) TreeNodeException running Q44 and 58 on Parquet tables
[ https://issues.apache.org/jira/browse/SPARK-14616?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

JESSE CHEN updated SPARK-14616:
-------------------------------
    Environment: (was: spark 1.5.1 (official binary distribution) running on hadoop yarn 2.6 with parquet 1.5.0 (both from cdh5.4.8))

> TreeNodeException running Q44 and 58 on Parquet tables
> ------------------------------------------------------
>
>                 Key: SPARK-14616
>                 URL: https://issues.apache.org/jira/browse/SPARK-14616
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.0.0
>            Reporter: JESSE CHEN
>
> {code:title=/tmp/bug.py}
> from pyspark import SparkContext
> from pyspark.sql import SQLContext, Row
> sc = SparkContext()
> sqlc = SQLContext(sc)
> R = Row('id', 'foo')
> r = sqlc.createDataFrame(sc.parallelize([R('abc', 'foo')]))
> q = sqlc.createDataFrame(sc.parallelize([R('x' * 24, 'bar')]))  # placeholder: any id of exactly 24 chars (see conditions below)
> q.write.parquet('/tmp/1.parq')
> q = sqlc.read.parquet('/tmp/1.parq')
> j = r.join(q, r.id == q.id)
> print j.count()
> {code}
> {noformat}
> [user@sandbox test]$ spark-submit --executor-memory=32g /tmp/bug.py
> [user@sandbox test]$ hadoop fs -rmr /tmp/1.parq
> {noformat}
> {noformat}
> 15/11/04 04:28:38 INFO codegen.GenerateUnsafeProjection: Code generated in 119.90324 ms
> Traceback (most recent call last):
>   File "/tmp/bug.py", line 13, in <module>
>     print j.count()
>   File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 268, in count
>   File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
>   File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 36, in deco
>   File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o148.count.
> : org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
> TungstenAggregate(key=[], functions=[(count(1),mode=Final,isDistinct=false)], output=[count#10L])
>  TungstenExchange SinglePartition
>   TungstenAggregate(key=[], functions=[(count(1),mode=Partial,isDistinct=false)], output=[currentCount#13L])
>    TungstenProject
>     BroadcastHashJoin [id#0], [id#8], BuildRight
>      TungstenProject [id#0]
>       Scan PhysicalRDD[id#0,foo#1]
>      ConvertToUnsafe
>       Scan ParquetRelation[hdfs:///tmp/1.parq][id#8]
> 	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
> 	at org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:69)
> 	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140)
> 	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> 	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138)
> 	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:174)
> 	at org.apache.spark.sql.DataFrame$$anonfun$collect$1.apply(DataFrame.scala:1385)
> 	at org.apache.spark.sql.DataFrame$$anonfun$collect$1.apply(DataFrame.scala:1385)
> 	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
> 	at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:1903)
> 	at org.apache.spark.sql.DataFrame.collect(DataFrame.scala:1384)
> 	at org.apache.spark.sql.DataFrame.count(DataFrame.scala:1402)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 	at java.lang.reflect.Method.invoke(Method.java:497)
> 	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
> 	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
> 	at py4j.Gateway.invoke(Gateway.java:259)
> 	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
> 	at py4j.commands.CallCommand.execute(CallCommand.java:79)
> 	at py4j.GatewayConnection.run(GatewayConnection.java:207)
> 	at java.lang.Thread.run(Thread.java:745)
> {noformat}
> Note this happens only under the following conditions:
> # executor memory >= 32 GB (doesn't fail with up to 31 GB)
> # the id in the q dataframe has exactly 24 chars (doesn't fail with fewer or more than 24 chars)
> # q is read from parquet
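The three conditions above can be checked mechanically. Below is a hypothetical probe, not part of the original report, that sweeps the id length around the 24-character boundary; submit it the same way as bug.py, with --executor-memory 32g:

{code:title=probe_id_length.py}
# Hypothetical probe (names and paths are illustrative): per the report,
# only the 24-char id read back from parquet should fail the join count.
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext()
sqlc = SQLContext(sc)
R = Row('id', 'foo')
r = sqlc.createDataFrame(sc.parallelize([R('abc', 'foo')]))

for n in (23, 24, 25):
    q = sqlc.createDataFrame(sc.parallelize([R('x' * n, 'bar')]))
    path = '/tmp/%d.parq' % n
    q.write.parquet(path)
    q = sqlc.read.parquet(path)      # must round-trip through parquet
    try:
        print '%d chars -> count = %d' % (n, r.join(q, r.id == q.id).count())
    except Exception as e:
        print '%d chars -> %s' % (n, type(e).__name__)
{code}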
[jira] [Updated] (SPARK-14616) TreeNodeException running Q44 and 58 on Parquet tables
[ https://issues.apache.org/jira/browse/SPARK-14616?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

JESSE CHEN updated SPARK-14616:
-------------------------------
    Affects Version/s: (was: 1.5.1)
                       2.0.0