Omer Tal created PIG-5219:
-----------------------------

             Summary: IndexOutOfBoundsException when loading multiple directories with different schemas using OrcStorage
                 Key: PIG-5219
                 URL: https://issues.apache.org/jira/browse/PIG-5219
             Project: Pig
          Issue Type: Bug
    Affects Versions: 0.16.0
         Environment: Pig Version: 0.16.0
OS: EMR 5.3.1
            Reporter: Omer Tal


Scenario:
# The data set covers two hours of the same day: in hour 00 the ORC files have 4 columns {a,b,c,d}, and during hour 02 the schema changes to 5 columns {a,b,c,d,e}.
# Loading ORC files with the same schema (hour 00):
{code}
x = load 's3://orc_files/dt=2017-03-21/hour=00' using OrcStorage();
dump x;
{code}
Result:
{code}
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
{code}
# Loading ORC files with different schemas in the same directory (hour 02):
{code}
x = load 's3://orc_files/dt=2017-03-21/hour=02' using OrcStorage();
dump x;
{code}
Result:
{code}
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
{code}
# Loading the whole day (both hours 00 and 02), which fails with an IndexOutOfBoundsException (a sketch of the likely mismatch follows the stack trace):
{code}
x = load 's3://orc_files/dt=2017-03-21' using OrcStorage();
dump x;
{code}
Result:
{code}
37332 [PigTezLauncher-0] INFO  org.apache.pig.backend.hadoop.executionengine.tez.TezJob  - DAG Status: status=FAILED, progress=TotalTasks: 1 Succeeded: 0 Running: 0 Failed: 1 Killed: 0 FailedTaskAttempts: 4, diagnostics=Vertex failed, vertexName=scope-2, vertexId=vertex_1491991474861_0006_1_00, diagnostics=[Task failed, taskId=task_1491991474861_0006_1_00_000000, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_1491991474861_0006_1_00_000000_0:java.lang.IndexOutOfBoundsException: Index: 4, Size: 4
        at java.util.ArrayList.rangeCheck(ArrayList.java:653)
        at java.util.ArrayList.get(ArrayList.java:429)
        at org.apache.pig.impl.util.hive.HiveUtils.convertHiveToPig(HiveUtils.java:97)
        at org.apache.pig.builtin.OrcStorage.getNext(OrcStorage.java:381)
        at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader.nextKeyValue(PigRecordReader.java:204)
        at org.apache.tez.mapreduce.lib.MRReaderMapReduce.next(MRReaderMapReduce.java:119)
        at org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POSimpleTezLoad.getNextTuple(POSimpleTezLoad.java:140)
        at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:305)
        at org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POStoreTez.getNextTuple(POStoreTez.java:123)
        at org.apache.pig.backend.hadoop.executionengine.tez.runtime.PigProcessor.runPipeline(PigProcessor.java:376)
        at org.apache.pig.backend.hadoop.executionengine.tez.runtime.PigProcessor.run(PigProcessor.java:241)
        at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:370)
        at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
        at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
        at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
        at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
        at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
{code}
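
The stack trace points at HiveUtils.convertHiveToPig (HiveUtils.java:97) asking for field index 4 of a row that only holds 4 values, which suggests the 5-column schema taken from the wider files is being applied to rows coming from the 4-column files. Below is a minimal, self-contained sketch of that mismatch; the field names, the plain Java lists, and the null fallback are illustrative assumptions, not the actual Pig code.
{code}
import java.util.Arrays;
import java.util.List;

// Hypothetical standalone sketch of the schema/row mismatch behind PIG-5219.
public class SchemaMismatchSketch {
    public static void main(String[] args) {
        // Schema inferred from the wider (hour 02) files: 5 fields {a,b,c,d,e}.
        List<String> schemaFields = Arrays.asList("a", "b", "c", "d", "e");
        // A row read from one of the 4-column (hour 00) files: only 4 values.
        List<Object> rowValues = Arrays.<Object>asList(1, 2, 3, 4);

        for (int i = 0; i < schemaFields.size(); i++) {
            // An unguarded rowValues.get(4) at this point throws
            // java.lang.IndexOutOfBoundsException: Index: 4, Size: 4,
            // matching the error in the stack trace above.
            Object value = (i < rowValues.size()) ? rowValues.get(i) : null;
            System.out.println(schemaFields.get(i) + " = " + value);
        }
    }
}
{code}
Padding the missing trailing fields with null (or at least bounds-checking before the get) would presumably let the whole-day load degrade as gracefully as the hour 02 load above, but that is only a guess at a possible fix.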


