This is an automated email from the ASF dual-hosted git repository.

klcopp pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 387f0da HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora) 387f0da is described below commit 387f0da9155a0e7b47ec39aeb9002c2b4cd75656 Author: Karen Coppage <karenlcopp...@gmail.com> AuthorDate: Tue Aug 25 10:03:57 2020 +0200 HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora) Closes #1388 --- .../ql/io/parquet/ParquetRecordReaderBase.java | 3 + .../hadoop/hive/ql/stats/BasicStatsNoJobTask.java | 2 +- .../hadoop/hive/ql/TestTxnCommandsForMmTable.java | 81 ++++++++++++++++++++-- 3 files changed, 80 insertions(+), 6 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java index 577051d..c52bc9d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java @@ -75,6 +75,9 @@ public class ParquetRecordReaderBase { final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf ) throws IOException { + if (oldSplit.getLength() == 0) { + return null; + } ParquetInputSplit split; if (oldSplit instanceof FileSplit) { final Path finalPath = ((FileSplit) oldSplit).getPath(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java index 53b3065..c6533cf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java @@ -187,7 +187,7 @@ public class BasicStatsNoJobTask implements IStatsProcessor { Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file); if (!file.isDirectory()) { InputFormat<?, ?> inputFormat = 
ReflectionUtil.newInstance(partish.getInputFormatClass(), jc); - InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partish.getLocation() }); + InputSplit dummySplit = new FileSplit(file.getPath(), 0, -1, new String[] { partish.getLocation() }); if (file.getLen() == 0) { numFiles += 1; } else { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java index 535bf11..4d25f88 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql; import java.io.File; +import java.util.Collections; import java.util.List; import org.apache.hadoop.fs.FileStatus; @@ -481,10 +482,85 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests { verifyDirAndResult(0, true); } + @Test + public void testImpalaTruncatedMmTableVectorized() throws Exception { + testImpalaTruncatedMmTable(true); + } + + @Test + public void testImpalaTruncatedMmTableNonVectorized() throws Exception { + testImpalaTruncatedMmTable(false); + } + + /** + * Impala truncates insert-only tables by writing a base directory (like insert overwrite) containing a completely + * empty file. Make sure that Hive reads these bases correctly. + * + * @throws Exception + */ + private void testImpalaTruncatedMmTable(boolean vectorized) throws Exception { + if (!vectorized) { + d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false); + } + FileSystem fs = FileSystem.get(hiveConf); + FileStatus[] status; + Path tblLocation = new Path(TEST_WAREHOUSE_DIR + "/" + + (TableExtended.MMTBL).toString().toLowerCase()); + + // 1. 
Insert two rows to an MM table + runStatementOnDriver("drop table " + TableExtended.MMTBL); + runStatementOnDriver("create table " + TableExtended.MMTBL + "(a int,b int) stored as parquet " + + "TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only')"); + runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(1,2)"); + runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(3,4)"); + status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER); + // There should be 2 delta dirs in the location + Assert.assertEquals(2, status.length); + for (int i = 0; i < status.length; i++) { + Assert.assertTrue(status[i].getPath().getName().matches("delta_.*")); + } + + // 2. Simulate Impala truncating the table: write a base dir (base_0000003) containing a file with no data. We + // have to delete this file (it's not completely empty, it contains metadata) and create completely empty file + runStatementOnDriver("insert overwrite table " + TableExtended.MMTBL + " select * from " + + TableExtended.MMTBL + " where 1=2"); + status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER); + // There should be 2 delta dirs, plus 1 base dir in the location + Assert.assertEquals(3, status.length); + verifyDir(2, true); + Path basePath = new Path(tblLocation, "base_0000003"); + Assert.assertTrue("Deleting file under base failed", fs.delete(new Path(basePath, "000000_0"))); + fs.create(new Path(basePath, "empty")); + + // 3. Verify query result. Selecting from a truncated table should return nothing. + List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b"); + Assert.assertEquals(Collections.emptyList(), rs); + + // 4. Perform a major compaction. Cleaner should remove the 2 delta dirs. 
+ runStatementOnDriver("alter table "+ TableExtended.MMTBL + " compact 'MAJOR'"); + runWorker(hiveConf); + runCleaner(hiveConf); + verifyDir(0, true); + rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b"); + Assert.assertEquals(Collections.emptyList(), rs); + if (!vectorized) { + d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); + } + } + private void verifyDirAndResult(int expectedDeltas) throws Exception { verifyDirAndResult(expectedDeltas, false); } private void verifyDirAndResult(int expectedDeltas, boolean expectBaseDir) throws Exception { + verifyDir(expectedDeltas, expectBaseDir); + + // Verify query result + int [][] resultData = new int[][] {{1,2}, {3,4}}; + List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b"); + Assert.assertEquals(stringifyValues(resultData), rs); + } + + private void verifyDir(int expectedDeltas, boolean expectBaseDir) throws Exception { FileSystem fs = FileSystem.get(hiveConf); // Verify the content of subdirs FileStatus[] status = fs.listStatus(new Path(TEST_WAREHOUSE_DIR + "/" + @@ -508,10 +584,5 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests { } else { Assert.assertEquals("0 base directories expected", 0, sawBaseTimes); } - - // Verify query result - int [][] resultData = new int[][] {{1,2}, {3,4}}; - List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b"); - Assert.assertEquals(stringifyValues(resultData), rs); } }