Omer Tal created PIG-5219:
-----------------------------
Summary: IndexOutOfBoundsException when loading multiple
directories with different schemas using OrcStorage
Key: PIG-5219
URL: https://issues.apache.org/jira/browse/PIG-5219
Project: Pig
Issue Type: Bug
Affects Versions: 0.16.0
Environment: Pig Version: 0.16.0
OS: EMR 5.3.1
Reporter: Omer Tal
Scenario:
# The data set covers two hours of the same day. In hour 00 the ORC files have 4
columns {a,b,c,d}; during hour 02 the schema changes to 5 columns {a,b,c,d,e},
so hour 02 contains files with both schemas.
# Loading ORC files with the same schema (hour 00):
{code}
x = load 's3://orc_files/dt=2017-03-21/hour=00' using OrcStorage();
dump x;
{code}
Result:
{code}
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
{code}
# Loading ORC files with different schemas in the same directory:
{code}
x = load 's3://orc_files/dt=2017-03-21/hour=02' using OrcStorage();
dump x;
{code}
Result:
{code}
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4,5)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
(1,2,3,4)
{code}
# Loading the whole day (both hour 00 and 02):
{code}
x = load 's3://orc_files/dt=2017-03-21' using OrcStorage();
dump x;
{code}
Result:
{code}
37332 [PigTezLauncher-0] INFO
org.apache.pig.backend.hadoop.executionengine.tez.TezJob - DAG Status:
status=FAILED, progress=TotalTasks: 1 Succeeded: 0 Running: 0 Failed: 1 Killed:
0 FailedTaskAttempts: 4, diagnostics=Vertex failed, vertexName=scope-2,
vertexId=vertex_1491991474861_0006_1_00, diagnostics=[Task failed,
taskId=task_1491991474861_0006_1_00_000000, diagnostics=[TaskAttempt 0 failed,
info=[Error: Error while running task ( failure ) :
attempt_1491991474861_0006_1_00_000000_0:java.lang.IndexOutOfBoundsException:
Index: 4, Size: 4
at java.util.ArrayList.rangeCheck(ArrayList.java:653)
at java.util.ArrayList.get(ArrayList.java:429)
at
org.apache.pig.impl.util.hive.HiveUtils.convertHiveToPig(HiveUtils.java:97)
at org.apache.pig.builtin.OrcStorage.getNext(OrcStorage.java:381)
at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader.nextKeyValue(PigRecordReader.java:204)
at
org.apache.tez.mapreduce.lib.MRReaderMapReduce.next(MRReaderMapReduce.java:119)
at
org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POSimpleTezLoad.getNextTuple(POSimpleTezLoad.java:140)
at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:305)
at
org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POStoreTez.getNextTuple(POStoreTez.java:123)
at
org.apache.pig.backend.hadoop.executionengine.tez.runtime.PigProcessor.runPipeline(PigProcessor.java:376)
at
org.apache.pig.backend.hadoop.executionengine.tez.runtime.PigProcessor.run(PigProcessor.java:241)
at
org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:370)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
{code}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)