[ https://issues.apache.org/jira/browse/HIVE-14044?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
David Nies updated HIVE-14044: ------------------------------ Description: When {{\n}} characters are contained in Avro files that are used as data bases for an external table, the result of {{SELECT}} queries may be corrupt. I encountered this error when querying hive both from {{beeline}} and from JDBC. h3. Steps to reproduce (used files are attached to ticket) # Create an {{.avro}} file that contains newline characters in a value of a map: {code} avro-tools fromjson --schema-file test.schema test.json > test.avro {code} # Copy {{.avro}} file to HDFS {code} hdfs dfs -copyFromLocal test.avro /some/location/ {code} # Create an external table in beeline containing this {{.avro}}: {code} beeline> CREATE EXTERNAL TABLE broken_newline_map ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/user/dnies/hive-test/broken-newline/db' TBLPROPERTIES ('avro.schema.literal'=' { "type" : "record", "name" : "myEntry", "namespace" : "myNamespace", "fields" : [ { "name" : "foo", "type" : "long" }, { "name" : "bar", "type" : { "type" : "map", "values" : "string" } } ] } '); {code} # Now, selecting may return corrupt results: {code} jdbc:hive2://my-server:10000/> select * from broken_newline_map; +-------------------------+---------------------------------------------------+--+ | broken_newline_map.foo | broken_newline_map.bar | +-------------------------+---------------------------------------------------+--+ | 1 | {"key2":"value2","key1":"value1\nafter newline"} | | 2 | {"key2":"new value2","key1":"new value"} | +-------------------------+---------------------------------------------------+--+ 2 rows selected (1.661 seconds) jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar) from broken_newline_map; +-------+------------------+-----------------------------+--+ | foo | _c1 | _c2 | +-------+------------------+-----------------------------+--+ | 1 | ["key2","key1"] | ["value2","value1"] | | NULL | NULL | NULL | | 2 | ["key2","key1"] | ["new value2","new value"] | +-------+------------------+-----------------------------+--+ 3 rows selected (28.05 seconds) {code} Obviously, the last result set contains corrupt entries (line 2) and incorrect entries (line 1). I also encountered this when doing this query with JDBC. was: When {{\n}} characters are contained in Avro files that are used as data bases for an external table, the result of {{SELECT}} queries may be corrupt. I encountered this error when querying hive both from {{beeline}} and from JDBC. h3. Steps to reproduce (used files are attached to ticket) # Create an {{.avro}} file that contains newline characters in a value of a map: {code} avro-tools fromjson --schema-file test.schema test.json > test.avro {code} # Copy {{.avro}} file to HDFS {code} hdfs dfs -copyFromLocal test.avro /some/location/ {code} # Create an external table in beeline containing this {{.avro}}: {code} beeline> CREATE EXTERNAL TABLE broken_newline_map ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/user/dnies/hive-test/broken-newline/db' TBLPROPERTIES ('avro.schema.literal'=' { "type" : "record", "name" : "myEntry", "namespace" : "myNamespace", "fields" : [ { "name" : "foo", "type" : "long" }, { "name" : "bar", "type" : { "type" : "map", "values" : "string" } } ] } '); {code} # Now, selecting may return corrupt results: {code} jdbc:hive2://my-server:10000/> select * from broken_newline_map; +-------------------------+---------------------------------------------------+--+ | broken_newline_map.foo | broken_newline_map.bar | +-------------------------+---------------------------------------------------+--+ | 1 | {"key2":"value2","key1":"value1\nafter newline"} | | 2 | {"key2":"new value2","key1":"new value"} | +-------------------------+---------------------------------------------------+--+ 2 rows selected (1.661 seconds) jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar) from broken_newline_map; +-------+------------------+-----------------------------+--+ | foo | _c1 | _c2 | +-------+------------------+-----------------------------+--+ | 1 | ["key2","key1"] | ["value2","value1"] | | NULL | NULL | NULL | | 2 | ["key2","key1"] | ["new value2","new value"] | +-------+------------------+-----------------------------+--+ 3 rows selected (28.05 seconds) {code} Obviously, the last result set contains corrupt entries (line 2). I also encountered this when doing this query with JDBC. > Newlines in Avro maps cause external table to return corrupt values > ------------------------------------------------------------------- > > Key: HIVE-14044 > URL: https://issues.apache.org/jira/browse/HIVE-14044 > Project: Hive > Issue Type: Bug > Environment: Hive version: 1.1.0-cdh5.5.1 (bundled with cloudera > 5.5.1) > Reporter: David Nies > Attachments: test.json, test.schema > > > When {{\n}} characters are contained in Avro files that are used as data > bases for an external table, the result of {{SELECT}} queries may be corrupt. > I encountered this error when querying hive both from {{beeline}} and from > JDBC. > h3. Steps to reproduce (used files are attached to ticket) > # Create an {{.avro}} file that contains newline characters in a value of a > map: > {code} > avro-tools fromjson --schema-file test.schema test.json > test.avro > {code} > # Copy {{.avro}} file to HDFS > {code} > hdfs dfs -copyFromLocal test.avro /some/location/ > {code} > # Create an external table in beeline containing this {{.avro}}: > {code} > beeline> CREATE EXTERNAL TABLE broken_newline_map > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' > STORED AS > INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' > LOCATION '/user/dnies/hive-test/broken-newline/db' > TBLPROPERTIES ('avro.schema.literal'=' > { > "type" : "record", > "name" : "myEntry", > "namespace" : "myNamespace", > "fields" : [ { > "name" : "foo", > "type" : "long" > }, { > "name" : "bar", > "type" : { > "type" : "map", > "values" : "string" > } > } ] > } > '); > {code} > # Now, selecting may return corrupt results: > {code} > jdbc:hive2://my-server:10000/> select * from broken_newline_map; > +-------------------------+---------------------------------------------------+--+ > | broken_newline_map.foo | broken_newline_map.bar > | > +-------------------------+---------------------------------------------------+--+ > | 1 | {"key2":"value2","key1":"value1\nafter newline"} > | > | 2 | {"key2":"new value2","key1":"new value"} > | > +-------------------------+---------------------------------------------------+--+ > 2 rows selected (1.661 seconds) > jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar) > from broken_newline_map; > +-------+------------------+-----------------------------+--+ > | foo | _c1 | _c2 | > +-------+------------------+-----------------------------+--+ > | 1 | ["key2","key1"] | ["value2","value1"] | > | NULL | NULL | NULL | > | 2 | ["key2","key1"] | ["new value2","new value"] | > +-------+------------------+-----------------------------+--+ > 3 rows selected (28.05 seconds) > {code} > Obviously, the last result set contains corrupt entries (line 2) and > incorrect entries (line 1). I also encountered this when doing this query > with JDBC. -- This message was sent by Atlassian JIRA (v6.3.4#6332)