[ https://issues.apache.org/jira/browse/HIVE-14448?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15419697#comment-15419697 ]
Matt McCline commented on HIVE-14448: ------------------------------------- [~sershe] Thank you very much for your code reviewing. > Queries with predicate fail when ETL split strategy is chosen for ACID tables > ----------------------------------------------------------------------------- > > Key: HIVE-14448 > URL: https://issues.apache.org/jira/browse/HIVE-14448 > Project: Hive > Issue Type: Bug > Components: Transactions > Affects Versions: 2.2.0 > Reporter: Saket Saurabh > Assignee: Matt McCline > Priority: Critical > Fix For: 2.2.0, 2.1.1 > > Attachments: HIVE-14448.01.patch, HIVE-14448.02.patch, > HIVE-14448.03.patch, HIVE-14448.04.patch, HIVE-14448.patch > > > When ETL split strategy is applied to ACID tables with predicate pushdown > (SARG enabled), split generation fails for ACID. This bug will be usually > exposed when working with data at scale, because in most otherwise cases only > BI split strategy is chosen. My guess is that this is happening because the > correct readerSchema is not being picked up when we try to extract SARG > column names. > Quickest way to reproduce is to add the following unit test to > ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java > {code:title=ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java|borderStyle=solid} > @Test > public void testETLSplitStrategyForACID() throws Exception { > hiveConf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, "ETL"); > hiveConf.setBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER, true); > runStatementOnDriver("insert into " + Table.ACIDTBL + " values(1,2)"); > runStatementOnDriver("alter table " + Table.ACIDTBL + " compact 'MAJOR'"); > runWorker(hiveConf); > List<String> rs = runStatementOnDriver("select * from " + Table.ACIDTBL > + " where a = 1"); > int[][] resultData = new int[][] {{1,2}}; > Assert.assertEquals(stringifyValues(resultData), rs); > } > {code} > Back-trace for this failed test is as follows: > {code} > exec.Task: Job Submission failed with exception > 'java.lang.RuntimeException(ORC split generation failed with exception: > java.lang.NegativeArraySizeException)' > java.lang.RuntimeException: ORC split generation failed with exception: > java.lang.NegativeArraySizeException > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1570) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1656) > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.addSplitsForGroup(HiveInputFormat.java:370) > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.getSplits(HiveInputFormat.java:488) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:329) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:321) > at > org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:197) > at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1297) > at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1294) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at org.apache.hadoop.mapreduce.Job.submit(Job.java:1294) > at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:562) > at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:557) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:557) > at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:548) > at > org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:417) > at > org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:141) > at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:197) > at > org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100) > at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1962) > at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1653) > at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1389) > at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1131) > at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1119) > at > org.apache.hadoop.hive.ql.TestTxnCommands2.runStatementOnDriver(TestTxnCommands2.java:1292) > at > org.apache.hadoop.hive.ql.TestTxnCommands2.testETLSplitStrategyForACID(TestTxnCommands2.java:280) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at > org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) > at > org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) > at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55) > at org.junit.rules.RunRules.evaluate(RunRules.java:20) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50) > at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236) > at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229) > at org.junit.runners.ParentRunner.run(ParentRunner.java:309) > at > org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:254) > at > org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:149) > at > org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:124) > at > org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:200) > at > org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:153) > at > org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:103) > Caused by: java.util.concurrent.ExecutionException: > java.lang.NegativeArraySizeException > at java.util.concurrent.FutureTask.report(FutureTask.java:122) > at java.util.concurrent.FutureTask.get(FutureTask.java:192) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1560) > ... 57 more > Caused by: java.lang.NegativeArraySizeException > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSargColumnNames(OrcInputFormat.java:378) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.extractNeededColNames(OrcInputFormat.java:444) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.extractNeededColNames(OrcInputFormat.java:439) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.access$2800(OrcInputFormat.java:146) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.callInternal(OrcInputFormat.java:1274) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.access$2600(OrcInputFormat.java:1068) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1248) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator$1.run(OrcInputFormat.java:1245) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$SplitGenerator.call(OrcInputFormat.java:1245) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.runGetSplitsSync(OrcInputFormat.java:883) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.access$1300(OrcInputFormat.java:700) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:857) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy$1.run(OrcInputFormat.java:854) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:854) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$ETLSplitStrategy.call(OrcInputFormat.java:700) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)