[ https://issues.apache.org/jira/browse/TEZ-1947?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Rajesh Balamohan updated TEZ-1947: ---------------------------------- Description: It would be beneficial to do certain config checks (wherever possible) upfront rather than having the DAG fail later downstream. For example, in the following run the DAG failed after 400+ seconds because of a config issue. {code} Status: Running (Executing on YARN cluster with App id application_1421164610335_0060) -------------------------------------------------------------------------------- VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED -------------------------------------------------------------------------------- Map 1 ...... KILLED 251 170 0 81 0 81 Reducer 2 FAILED 1009 0 0 1009 23 1008 -------------------------------------------------------------------------------- VERTICES: 00/02 [===>>-----------------------] 13% ELAPSED TIME: 449.01 s -------------------------------------------------------------------------------- Status: Failed Vertex failed, vertexName=Reducer 2, vertexId=vertex_1421164610335_0060_1_01, diagnostics=[Task failed, taskId=task_1421164610335_0060_1_01_000004, diagnostics=[TaskAttempt 0 failed, info=[Error: Failure while running task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, mergeThreshold: 148668720 at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) at org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ], TaskAttempt 1 failed, info=[Error: Failure while running task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, mergeThreshold: 148668720 at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) at org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) {code} was: It would be beneficial to do certain config checks upfront rather having fail later in the downstream. For e.g, in the following example the DAG failed after 400+ seconds for some config issue. {code} Status: Running (Executing on YARN cluster with App id application_1421164610335_0060) -------------------------------------------------------------------------------- VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED -------------------------------------------------------------------------------- Map 1 ...... 
KILLED 251 170 0 81 0 81 Reducer 2 FAILED 1009 0 0 1009 23 1008 -------------------------------------------------------------------------------- VERTICES: 00/02 [===>>-----------------------] 13% ELAPSED TIME: 449.01 s -------------------------------------------------------------------------------- Status: Failed Vertex failed, vertexName=Reducer 2, vertexId=vertex_1421164610335_0060_1_01, diagnostics=[Task failed, taskId=task_1421164610335_0060_1_01_000004, diagnostics=[TaskAttempt 0 failed, info=[Error: Failure while running task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, mergeThreshold: 148668720 at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) at org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ], TaskAttempt 1 failed, info=[Error: Failure while running task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, mergeThreshold: 148668720 at org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) at 
org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) at org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) {code} > Failing fast when DAG configs have wrong values can save cluster resources > -------------------------------------------------------------------------- > > Key: TEZ-1947 > URL: https://issues.apache.org/jira/browse/TEZ-1947 > Project: Apache Tez > Issue Type: Bug > Reporter: Rajesh Balamohan > > It would be beneficial to do certain config checks (wherever possible) > upfront rather than having the DAG fail later downstream. For example, in the > following run the DAG failed after 400+ seconds because of a config issue. > {code} > Status: Running (Executing on YARN cluster with App id > application_1421164610335_0060) > -------------------------------------------------------------------------------- > VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED > KILLED > -------------------------------------------------------------------------------- > Map 1 ...... 
KILLED 251 170 0 81 0 > 81 > Reducer 2 FAILED 1009 0 0 1009 23 > 1008 > -------------------------------------------------------------------------------- > VERTICES: 00/02 [===>>-----------------------] 13% ELAPSED TIME: 449.01 s > -------------------------------------------------------------------------------- > Status: Failed > Vertex failed, vertexName=Reducer 2, vertexId=vertex_1421164610335_0060_1_01, > diagnostics=[Task failed, taskId=task_1421164610335_0060_1_01_000004, > diagnostics=[TaskAttempt 0 failed, info=[Error: Failure while running > task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit > should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, > mergeThreshold: 148668720 > at > org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) > at > org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) > at > org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) > at > org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) > at > org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > ], TaskAttempt 1 failed, info=[Error: Failure while running > task:java.lang.RuntimeException: Invlaid configuration: maxSingleShuffleLimit > should be less than mergeThresholdmaxSingleShuffleLimit: 238251152, > mergeThreshold: 148668720 > at > 
org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.<init>(MergeManager.java:260) > at > org.apache.tez.runtime.library.common.shuffle.orderedgrouped.Shuffle.<init>(Shuffle.java:206) > at > org.apache.tez.runtime.library.input.OrderedGroupedKVInput.start(OrderedGroupedKVInput.java:124) > at > org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:405) > at > org.apache.tez.runtime.LogicalIOProcessorRuntimeTask$StartInputCallable.call(LogicalIOProcessorRuntimeTask.java:393) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)