[ https://issues.apache.org/jira/browse/SPARK-11087?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14993771#comment-14993771 ]
patcharee edited comment on SPARK-11087 at 11/6/15 2:56 PM: ------------------------------------------------------------ Hi, I found a scenario where the predicate does not work again. [~zzhan] Can you please have a look? First create a hive table >> hive> create table people(name string, address string, phone string) partitioned by(age int) stored as orc; Then use spark shell local mode to insert data and then query >> 120 import org.apache.spark.sql.Row 121 import org.apache.spark.{SparkConf, SparkContext} 122 import org.apache.spark.sql.types._ 123 import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType} 124 sqlContext.setConf("hive.exec.dynamic.partition.mode","nonstrict") 125 val records = (1 to 10).map( i => Row(s"name_$i", s"address_$i", s"phone_$i", i )) 126 val schemaString = "name address phone age" 127 val schema = StructType(schemaString.split(" ").map(fieldName => if (fieldName.equals("age")) StructField(fieldName, IntegerType, true) else StructField(fieldName, StringType, true))) 128 val x = sc.parallelize(records) 129 val rDF = sqlContext.createDataFrame(x, schema) 130 rDF.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("age").saveAsTable("people") 131 sqlContext.setConf("spark.sql.orc.filterPushdown", "true") 132 val people = sqlContext.read.format("orc").load("/user/hive/warehouse/people") 133 people.registerTempTable("people") 134 sqlContext.sql("SELECT * FROM people WHERE age = 3 and name = 'name_3'").count Below is a part of the log message from the last command >> 15/11/06 15:40:36 INFO HadoopRDD: Input split: hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000:0+453 15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate 15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate 15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null 15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max 
key = null 15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with {include: [true, true, false, false], offset: 0, length: 9223372036854775807} 15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with {include: [true, true, false, false], offset: 0, length: 9223372036854775807} 15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read. Using file schema. 15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read. Using file schema. 15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126] 15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126] 15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size: 15 type: array-backed] 15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size: 15 type: array-backed] 15/11/06 15:40:36 INFO GeneratePredicate: Code generated in 5.063287 ms was (Author: patcharee): Hi, I found a scenario where the predicate does not work again. [~zzhan] Can you please have a look? 
First create a hive table >> hive> create table people(name string, address string, phone string) partitioned by(age int) stored as orc; Then use spark shell local mode to insert data and then query >> 120 import org.apache.spark.sql.Row 121 import org.apache.spark.{SparkConf, SparkContext} 122 import org.apache.spark.sql.types._ 123 import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType} 124 sqlContext.setConf("hive.exec.dynamic.partition.mode","nonstrict") 125 val records = (1 to 10).map( i => Row(s"name_$i", s"address_$i", s"phone_$i", i )) 126 val schemaString = "name address phone age" 127 val schema = StructType(schemaString.split(" ").map(fieldName => if (fieldName.equals("age")) StructField(fieldName, IntegerType, true) else StructField(fieldName, StringType, true))) 128 val x = sc.parallelize(records) 129 val rDF = sqlContext.createDataFrame(x, schema) 130 rDF.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("age").saveAsTable("people") 131 sqlContext.setConf("spark.sql.orc.filterPushdown", "true") 132 val people = sqlContext.read.format("orc").load("/user/hive/warehouse/people") 133 people.registerTempTable("people") 134 sqlContext.sql("SELECT * FROM people WHERE age = 3 and name = 'name_3'").count 15/11/06 15:40:36 INFO HadoopRDD: Input split: hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000:0+453 15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate 15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate 15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null 15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null 15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with {include: [true, true, false, false], offset: 0, length: 9223372036854775807} 15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from 
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with {include: [true, true, false, false], offset: 0, length: 9223372036854775807} 15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read. Using file schema. 15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read. Using file schema. 15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126] 15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126] 15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size: 15 type: array-backed] 15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size: 15 type: array-backed] 15/11/06 15:40:36 INFO GeneratePredicate: Code generated in 5.063287 ms > spark.sql.orc.filterPushdown does not work, No ORC pushdown predicate > --------------------------------------------------------------------- > > Key: SPARK-11087 > URL: https://issues.apache.org/jira/browse/SPARK-11087 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.5.1 > Environment: orc file version 0.12 with HIVE_8732 > hive version 1.2.1.2.3.0.0-2557 > Reporter: patcharee > Priority: Minor > > I have an external hive table stored as partitioned orc file (see the table > schema below). I tried to query from the table with where clause> > hiveContext.setConf("spark.sql.orc.filterPushdown", "true") > hiveContext.sql("select u, v from 4D where zone = 2 and x = 320 and y = > 117")). > But from the log file with debug logging level on, the ORC pushdown predicate > was not generated. 
> Unfortunately my table was not sorted when I inserted the data, but I > expected the ORC pushdown predicate should be generated (because of the where > clause) though > Table schema > ================================ > hive> describe formatted 4D; > OK > # col_name data_type comment > > date int > hh int > x int > y int > height float > u float > v float > w float > ph float > phb float > t float > p float > pb float > qvapor float > qgraup float > qnice float > qnrain float > tke_pbl float > el_pbl float > qcloud float > > # Partition Information > # col_name data_type comment > > zone int > z int > year int > month int > > # Detailed Table Information > Database: default > Owner: patcharee > CreateTime: Thu Jul 09 16:46:54 CEST 2015 > LastAccessTime: UNKNOWN > Protect Mode: None > Retention: 0 > Location: hdfs://helmhdfs/apps/hive/warehouse/wrf_tables/4D > > Table Type: EXTERNAL_TABLE > Table Parameters: > EXTERNAL TRUE > comment this table is imported from rwf_data/*/wrf/* > last_modified_by patcharee > last_modified_time 1439806692 > orc.compress ZLIB > transient_lastDdlTime 1439806692 > > # Storage Information > SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde > InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat > OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat > > Compressed: No > Num Buckets: -1 > Bucket Columns: [] > Sort Columns: [] > Storage Desc Params: > serialization.format 1 > Time taken: 0.388 seconds, Fetched: 58 row(s) > ================================ > Data was inserted into this table by another spark job> > df.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("zone","z","year","month").saveAsTable("4D") -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: 
issues-help@spark.apache.org