[ https://issues.apache.org/jira/browse/HIVE-20825?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17291347#comment-17291347 ]
Ashish Doneriya commented on HIVE-20825: ---------------------------------------- [~tomzeng] Working fine in Hive 3.1.2 > Hive ACID Merge generates invalid ORC files (bucket files 0 or 3 bytes in > length) causing the "Not a valid ORC file" error > -------------------------------------------------------------------------------------------------------------------------- > > Key: HIVE-20825 > URL: https://issues.apache.org/jira/browse/HIVE-20825 > Project: Hive > Issue Type: Bug > Components: Hive, ORC, Transactions > Affects Versions: 2.3.1, 2.3.2, 2.3.4 > Environment: Hive 2.3.x on Amazon EMR 5.8.0 to 5.18.0. Open source > build of Hive 2.3.4 > Reporter: Tom Zeng > Priority: Major > Attachments: hive-merge-invalid-orc-repro.hql, > hive-merge-invalid-orc-repro.log > > > When using Hive ACID Merge (supported with the ORC format) to update/insert > data, bucket files with 0 byte or 3 bytes (file content contains three > characters: ORC) are generated during MERGE INTO operations which finish with > no errors. Subsequent queries on the base table will get "Not a valid ORC > file" error. > > The following script can be used to reproduce the issue(note that with small > amount of data like this increasing the number of buckets could result in > query working, but with large data set it will fail no matter what bucket > size): > set hive.auto.convert.join=false; > set hive.enforce.bucketing=true; > set hive.exec.dynamic.partition.mode = nonstrict; > set hive.support.concurrency=true; > set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; > drop table if exists mergedelta_txt_1; > drop table if exists mergedelta_txt_2; > CREATE TABLE mergedelta_txt_1 ( > id_str varchar(12), time_key int, value bigint) > PARTITIONED BY (date_key int) > ROW FORMAT DELIMITED > STORED AS TEXTFILE; > CREATE TABLE mergedelta_txt_2 ( > id_str varchar(12), time_key int, value bigint) > PARTITIONED BY (date_key int) > ROW FORMAT DELIMITED > STORED AS TEXTFILE; > INSERT INTO TABLE mergedelta_txt_1 > partition(date_key=20170103) > VALUES > ("AB94LIENR0",46700,12345676836978), > ("AB94LIENR1",46825,12345676836978), > ("AB94LIENS0",46709,12345676836978), > ("AB94LIENS1",46834,12345676836978), > ("AB94LIENT0",46709,12345676836978), > ("AB94LIENT1",46834,12345676836978), > ("AB94LIENU0",46718,12345676836978), > ("AB94LIENU1",46844,12345676836978), > ("AB94LIENV0",46719,12345676836978), > ("AB94LIENV1",46844,12345676836978), > ("AB94LIENW0",46728,12345676836978), > ("AB94LIENW1",46854,12345676836978), > ("AB94LIENX0",46728,12345676836978), > ("AB94LIENX1",46854,12345676836978), > ("AB94LIENY0",46737,12345676836978), > ("AB94LIENY1",46863,12345676836978), > ("AB94LIENZ0",46738,12345676836978), > ("AB94LIENZ1",46863,12345676836978), > ("AB94LIERA0",47176,12345676836982), > ("AB94LIERA1",47302,12345676836982); > INSERT INTO TABLE mergedelta_txt_2 > partition(date_key=20170103) > VALUES > ("AB94LIENT1",46834,12345676836978), > ("AB94LIENU0",46718,12345676836978), > ("AB94LIENU1",46844,12345676836978), > ("AB94LIENV0",46719,12345676836978), > ("AB94LIENV1",46844,12345676836978), > ("AB94LIENW0",46728,12345676836978), > ("AB94LIENW1",46854,12345676836978), > ("AB94LIENX0",46728,12345676836978), > ("AB94LIENX1",46854,12345676836978), > ("AB94LIENY0",46737,12345676836978), > ("AB94LIENY1",46863,12345676836978), > ("AB94LIENZ0",46738,12345676836978), > ("AB94LIENZ1",46863,12345676836978), > ("AB94LIERA0",47176,12345676836982), > ("AB94LIERA1",47302,12345676836982), > ("AB94LIERA2",47418,12345676836982), > ("AB94LIERB0",47176,12345676836982), > ("AB94LIERB1",47302,12345676836982), > ("AB94LIERB2",47418,12345676836982), > ("AB94LIERC0",47185,12345676836982); > DROP TABLE IF EXISTS mergebase_1; > CREATE TABLE mergebase_1 ( > id_str varchar(12) , time_key int , value bigint) > PARTITIONED BY (date_key int) > CLUSTERED BY (id_str,time_key) INTO 4 BUCKETS > STORED AS ORC > TBLPROPERTIES ( > 'orc.compress'='SNAPPY', > 'pk_columns'='id_str,date_key,time_key', > 'NO_AUTO_COMPACTION'='true', > 'transactional'='true'); > MERGE INTO mergebase_1 AS base > USING (SELECT * > FROM ( > SELECT id_str ,time_key ,value, date_key, rank() OVER (PARTITION BY > id_str,date_key,time_key ORDER BY id_str,date_key,time_key) AS rk > FROM mergedelta_txt_1 > DISTRIBUTE BY date_key > ) rankedtbl > WHERE rankedtbl.rk=1 > ) AS delta > ON delta.id_str=base.id_str AND delta.date_key=base.date_key AND > delta.time_key=base.time_key > WHEN MATCHED THEN UPDATE SET value=delta.value > WHEN NOT MATCHED THEN INSERT VALUES ( delta.id_str , delta.time_key , > delta.value, delta.date_key); > MERGE INTO mergebase_1 AS base > USING (SELECT * > FROM ( > SELECT id_str ,time_key ,value, date_key, rank() OVER (PARTITION BY > id_str,date_key,time_key ORDER BY id_str,date_key,time_key) AS rk > FROM mergedelta_txt_2 > DISTRIBUTE BY date_key > ) rankedtbl > WHERE rankedtbl.rk=1 > ) AS delta > ON delta.id_str=base.id_str AND delta.date_key=base.date_key AND > delta.time_key=base.time_key > WHEN MATCHED THEN UPDATE SET value=delta.value > WHEN NOT MATCHED THEN INSERT VALUES ( delta.id_str , delta.time_key , > delta.value, delta.date_key); > select count(*) from mergebase_1; -- This message was sent by Atlassian Jira (v8.3.4#803005)