[ https://issues.apache.org/jira/browse/SPARK-25175?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16593185#comment-16593185 ]
Chenxiao Mao edited comment on SPARK-25175 at 8/27/18 6:45 AM: --------------------------------------------------------------- Thorough investigation about ORC tables {code:java} val data = spark.range(5).selectExpr("id as a", "id * 2 as B", "id * 3 as c", "id * 4 as C") spark.conf.set("spark.sql.caseSensitive", true) data.write.format("orc").mode("overwrite").save("/user/hive/warehouse/orc_data") $> hive --orcfiledump /user/hive/warehouse/orc_data/part-00001-9716d241-9ad9-4d56-8de3-7bc482067614-c000.snappy.orc Structure for /user/hive/warehouse/orc_data/part-00001-9716d241-9ad9-4d56-8de3-7bc482067614-c000.snappy.orc Type: struct<a:bigint,B:bigint,c:bigint,C:bigint> CREATE TABLE orc_data_source_lower (a LONG, b LONG, c LONG) USING orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_data_source_upper (A LONG, B LONG, C LONG) USING orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_hive_serde_lower (a LONG, b LONG, c LONG) STORED AS orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_hive_serde_upper (A LONG, B LONG, C LONG) STORED AS orc LOCATION '/user/hive/warehouse/orc_data' DESC EXTENDED orc_data_source_lower; DESC EXTENDED orc_data_source_upper; DESC EXTENDED orc_hive_serde_lower; DESC EXTENDED orc_hive_serde_upper; spark.conf.set("spark.sql.hive.convertMetastoreOrc", false) {code} ||no.||caseSensitive||table columns||select column||orc column (select via data source table, hive impl)||orc column (select via data source table, native impl)||orc column (select via hive serde table)|| |1|true|a, b, c|a|a |a|IndexOutOfBoundsException | |2| | |b|B |null|IndexOutOfBoundsException | |3| | |c|c |c|IndexOutOfBoundsException | |4| | |A|AnalysisException|AnalysisException|AnalysisException| |5| | |B|AnalysisException|AnalysisException|AnalysisException| |6| | |C|AnalysisException|AnalysisException|AnalysisException| |7| |A, B, C|a|AnalysisException |AnalysisException|AnalysisException| |8| | |b|AnalysisException 
|AnalysisException|AnalysisException | |9| | |c|AnalysisException |AnalysisException|AnalysisException | |10| | |A|a |null|IndexOutOfBoundsException | |11| | |B|B |B|IndexOutOfBoundsException | |12| | |C|c |C|IndexOutOfBoundsException | |13|false|a, b, c|a|a |a|IndexOutOfBoundsException | |14| | |b|B |B|IndexOutOfBoundsException | |15| | |c|c |c|IndexOutOfBoundsException | |16| | |A|a |a|IndexOutOfBoundsException | |17| | |B|B |B|IndexOutOfBoundsException | |18| | |C|c |c|IndexOutOfBoundsException | |19| |A, B, C|a|a |a|IndexOutOfBoundsException | |20| | |b|B |B|IndexOutOfBoundsException | |21| | |c|c |c|IndexOutOfBoundsException | |22| | |A|a |a|IndexOutOfBoundsException | |23| | |B|B |B|IndexOutOfBoundsException | |24| | |C|c |c|IndexOutOfBoundsException | was (Author: seancxmao): Thorough investigation about ORC tables {code} val data = spark.range(5).selectExpr("id as a", "id * 2 as B", "id * 3 as c", "id * 4 as C") spark.conf.set("spark.sql.caseSensitive", true) data.write.format("orc").mode("overwrite").save("/user/hive/warehouse/orc_data") $> hive --orcfiledump /user/hive/warehouse/orc_data/part-00001-9716d241-9ad9-4d56-8de3-7bc482067614-c000.snappy.orc Structure for /user/hive/warehouse/orc_data/part-00001-9716d241-9ad9-4d56-8de3-7bc482067614-c000.snappy.orc Type: struct<a:bigint,B:bigint,c:bigint,C:bigint> CREATE TABLE orc_data_source_lower (a LONG, b LONG, c LONG) USING orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_data_source_upper (A LONG, B LONG, C LONG) USING orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_hive_serde_lower (a LONG, b LONG, c LONG) STORED AS orc LOCATION '/user/hive/warehouse/orc_data' CREATE TABLE orc_hive_serde_upper (A LONG, B LONG, C LONG) STORED AS orc LOCATION '/user/hive/warehouse/orc_data' DESC EXTENDED orc_data_source_lower; DESC EXTENDED orc_data_source_upper; DESC EXTENDED orc_hive_serde_lower; DESC EXTENDED orc_hive_serde_upper; spark.conf.set("spark.sql.hive.convertMetastoreOrc", false) {code} 
||no.||caseSensitive||table columns||select column||orc column (select via data source table)||orc column (select via hive serde table)|| |1|true|a, b, c|a|a |IndexOutOfBoundsException | |2| | |b|B |IndexOutOfBoundsException | |3| | |c|c |IndexOutOfBoundsException | |4| | |A|AnalysisException|AnalysisException| |5| | |B|AnalysisException|AnalysisException| |6| | |C|AnalysisException|AnalysisException| |7| |A, B, C|a|AnalysisException |AnalysisException| |8| | |b|AnalysisException |AnalysisException | |9| | |c|AnalysisException |AnalysisException | |10| | |A|a |IndexOutOfBoundsException | |11| | |B|B |IndexOutOfBoundsException | |12| | |C|c |IndexOutOfBoundsException | |13|false|a, b, c|a|a |IndexOutOfBoundsException | |14| | |b|B |IndexOutOfBoundsException | |15| | |c|c |IndexOutOfBoundsException | |16| | |A|a |IndexOutOfBoundsException | |17| | |B|B |IndexOutOfBoundsException | |18| | |C|c |IndexOutOfBoundsException | |19| |A, B, C|a|a |IndexOutOfBoundsException | |20| | |b|B |IndexOutOfBoundsException | |21| | |c|c |IndexOutOfBoundsException | |22| | |A|a |IndexOutOfBoundsException | |23| | |B|B |IndexOutOfBoundsException | |24| | |C|c |IndexOutOfBoundsException | > Case-insensitive field resolution when reading from ORC > ------------------------------------------------------- > > Key: SPARK-25175 > URL: https://issues.apache.org/jira/browse/SPARK-25175 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.3.1 > Reporter: Chenxiao Mao > Priority: Major > > SPARK-25132 adds support for case-insensitive field resolution when reading > from Parquet files. We found ORC files have similar issues. Since Spark has 2 > OrcFileFormat, we should add support for both. > * Since SPARK-2883, Spark supports ORC inside sql/hive module with Hive > dependency. This hive OrcFileFormat doesn't support case-insensitive field > resolution at all. > * SPARK-20682 adds a new ORC data source inside sql/core. 
This native > OrcFileFormat supports case-insensitive field resolution; however, it cannot > handle duplicate fields. -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org