[ https://issues.apache.org/jira/browse/HIVE-14448?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15414106#comment-15414106 ]
Saket Saurabh commented on HIVE-14448: -------------------------------------- While I was investigating test failures for some other scenarios, I realized that schema evolution also breaks when ETL strategy is chosen, which I believe might be related to this JIRA. I think it would be good to add another test that evolves the schema, as [~prasanth_j] suggested. Even with the current patch, the following schema evolution test still fails with IndexOutOfBoundsException: {code:title=ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java|borderStyle=solid} @Test public void testAcidWithSchemaEvolution() throws Exception { hiveConf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, "ETL"); String tblName = "acidTblWithSchemaEvol"; runStatementOnDriver("drop table if exists " + tblName); runStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')"); runStatementOnDriver("INSERT INTO " + tblName + " VALUES (1, 'foo'), (2, 'bar')"); // Major compact to create a base that has ACID schema. runStatementOnDriver("ALTER TABLE " + tblName + " COMPACT 'MAJOR'"); runWorker(hiveConf); // Alter table for perform schema evolution. runStatementOnDriver("ALTER TABLE " + tblName + " ADD COLUMNS(c int)"); // Validate there is an added NULL for column c. List<String> rs = runStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a"); String[] expectedResult = { "1\tfoo\tNULL", "2\tbar\tNULL" }; Assert.assertEquals(Arrays.asList(expectedResult), rs); } {code} Here is the back-trace for the failed test: {code} exec.Task: Job Submission failed with exception 'java.lang.RuntimeException(ORC split generation failed with exception: java.lang.IndexOutOfBoundsException: Index: 9, Size: 9)' java.lang.RuntimeException: ORC split generation failed with exception: java.lang.IndexOutOfBoundsException: Index: 9, Size: 9 at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1576) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1662) at org.apache.hadoop.hive.ql.io.HiveInputFormat.addSplitsForGroup(HiveInputFormat.java:370) at org.apache.hadoop.hive.ql.io.HiveInputFormat.getSplits(HiveInputFormat.java:488) at org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:329) at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:321) at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:197) at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1297) at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1294) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) at org.apache.hadoop.mapreduce.Job.submit(Job.java:1294) at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:562) at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:557) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:557) at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:548) at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:417) at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:141) at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:197) at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100) at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1983) at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1674) at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1410) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1134) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1122) at org.apache.hadoop.hive.ql.TestTxnCommands2.runStatementOnDriver(TestTxnCommands2.java:1315) at org.apache.hadoop.hive.ql.TestTxnCommands2.testAcidWithSchemaEvolution(TestTxnCommands2.java:177) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55) at org.junit.rules.RunRules.evaluate(RunRules.java:20) at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) at org.junit.runners.ParentRunner.run(ParentRunner.java:309) at org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:254) at org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:149) at org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:124) at org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:200) at org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:153) at org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:103) Caused by: java.util.concurrent.ExecutionException: java.lang.IndexOutOfBoundsException: Index: 9, Size: 9 at java.util.concurrent.FutureTask.report(FutureTask.java:122) at java.util.concurrent.FutureTask.get(FutureTask.java:192) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1566) ... 57 more Caused by: java.lang.IndexOutOfBoundsException: Index: 9, Size: 9 at java.util.ArrayList.rangeCheck(ArrayList.java:653) at java.util.ArrayList.get(ArrayList.java:429) at java.util.Collections$UnmodifiableList.get(Collections.java:1309) at org.apache.orc.impl.ReaderImpl.getRawDataSizeOfColumn(ReaderImpl.java:618) at org.apache.orc.impl.ReaderImpl.getRawDataSizeFromColIndices(ReaderImpl.java:611) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.computeProjectionSize(OrcInputFormat.java:1449) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.populateAndCacheStripeDetails(OrcInputFormat.java:1431) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.callInternal(OrcInputFormat.java:1268) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.access$2600(OrcInputFormat.java:1068) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1248) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1245) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.call(OrcInputFormat.java:1245) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.runGetSplitsSync(OrcInputFormat.java:883) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.access$1300(OrcInputFormat.java:700) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:857) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:854) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:854) at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:700) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) {code} > Queries with predicate fail when ETL split strategy is chosen for ACID tables > ----------------------------------------------------------------------------- > > Key: HIVE-14448 > URL: https://issues.apache.org/jira/browse/HIVE-14448 > Project: Hive > Issue Type: Bug > Components: Transactions > Affects Versions: 2.2.0 > Reporter: Saket Saurabh > Assignee: Sergey Shelukhin > Attachments: HIVE-14448.patch > > > When ETL split strategy is applied to ACID tables with predicate pushdown > (SARG enabled), split generation fails for ACID. This bug will be usually > exposed when working with data at scale, because in most otherwise cases only > BI split strategy is chosen. My guess is that this is happening because the > correct readerSchema is not being picked up when we try to extract SARG > column names. > Quickest way to reproduce is to add the following unit test to > ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java > {code:title=ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java|borderStyle=solid} > @Test > public void testETLSplitStrategyForACID() throws Exception { > hiveConf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, "ETL"); > hiveConf.setBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER, true); > runStatementOnDriver("insert into " + Table.ACIDTBL + " values(1,2)"); > runStatementOnDriver("alter table " + Table.ACIDTBL + " compact 'MAJOR'"); > runWorker(hiveConf); > List<String> rs = runStatementOnDriver("select * from " + Table.ACIDTBL > + " where a = 1"); > int[][] resultData = new int[][] {{1,2}}; > Assert.assertEquals(stringifyValues(resultData), rs); > } > {code} > Back-trace for this failed test is as follows: > {code} > exec.Task: Job Submission failed with exception > 'java.lang.RuntimeException(ORC split generation failed with exception: > java.lang.NegativeArraySizeException)' > java.lang.RuntimeException: ORC split generation failed with exception: > java.lang.NegativeArraySizeException > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1570) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1656) > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.addSplitsForGroup(HiveInputFormat.java:370) > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.getSplits(HiveInputFormat.java:488) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:329) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:321) > at > org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:197) > at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1297) > at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1294) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at org.apache.hadoop.mapreduce.Job.submit(Job.java:1294) > at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:562) > at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:557) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:557) > at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:548) > at > org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:417) > at > org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:141) > at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:197) > at > org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100) > at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1962) > at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1653) > at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1389) > at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1131) > at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1119) > at > org.apache.hadoop.hive.ql.TestTxnCommands2.runStatementOnDriver(TestTxnCommands2.java:1292) > at > org.apache.hadoop.hive.ql.TestTxnCommands2.testETLSplitStrategyForACID(TestTxnCommands2.java:280) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at > org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) > at > org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) > at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55) > at org.junit.rules.RunRules.evaluate(RunRules.java:20) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) > at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) > at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) > at org.junit.runners.ParentRunner.run(ParentRunner.java:309) > at > org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:254) > at > org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:149) > at > org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:124) > at > org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:200) > at > org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:153) > at > org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:103) > Caused by: java.util.concurrent.ExecutionException: > java.lang.NegativeArraySizeException > at java.util.concurrent.FutureTask.report(FutureTask.java:122) > at java.util.concurrent.FutureTask.get(FutureTask.java:192) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1560) > ... 57 more > Caused by: java.lang.NegativeArraySizeException > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSargColumnNames(OrcInputFormat.java:378) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.extractNeededColNames(OrcInputFormat.java:444) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.extractNeededColNames(OrcInputFormat.java:439) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.access$2800(OrcInputFormat.java:146) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.callInternal(OrcInputFormat.java:1274) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.access$2600(OrcInputFormat.java:1068) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1248) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1245) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.call(OrcInputFormat.java:1245) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.runGetSplitsSync(OrcInputFormat.java:883) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.access$1300(OrcInputFormat.java:700) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:857) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:854) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:854) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:700) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)