[ 
https://issues.apache.org/jira/browse/TEZ-1223?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14047652#comment-14047652
 ] 

Rajesh Balamohan commented on TEZ-1223:
---------------------------------------

Update:
=======
- Added getRemoteAddress with ip/port information on the server side, to check 
whether the ports to the same node change when transferring multiple map 
outputs. Analysis showed that this is not the case (for a given request with 
multiple mapIds, the ip/port stays the same until all map outputs are streamed 
through). 

> Shuffle errors at 10 TB scale
> -----------------------------
>
>                 Key: TEZ-1223
>                 URL: https://issues.apache.org/jira/browse/TEZ-1223
>             Project: Apache Tez
>          Issue Type: Bug
>            Reporter: Rajesh Balamohan
>            Assignee: Rajesh Balamohan
>              Labels: performance, scalability
>         Attachments: shuffle_data.tar.gz
>
>
> When running a job with the following DAG at 10 TB scale, different shuffle 
> exceptions occurred.  Creating this as an umbrella ticket for tracking these 
> errors.  Most of them are related to ShuffleHeader parsing.
> DAG:
> =====
> digraph rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1
> { graph [ 
> label="rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1", 
> fontsize=24, fontname=Helvetica]; node [fontsize=12, fontname=Helvetica]; 
> edge [fontsize=9, fontcolor=blue, fontname=Arial]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label 
> = "Map_5[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n 
> dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ 
> label = "Reducer_9[ReduceTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n 
> dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns"
>  [ label = "Map_11[store_returns]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns"
>  -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ 
> label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" 
> ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4"
>  [ label = "Reducer_4[out_Reducer_4]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label 
> = "Map_10[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n 
> dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ label 
> = "Map_8[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ 
> label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n 
> dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label 
> = "Map_1[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n 
> dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label 
> = "Map_6[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n 
> dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" [ 
> label = "Map_10[item]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label 
> = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" [ 
> label = "Map_6[d3]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label 
> = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales"
>  [ label = "Map_2[catalog_sales]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales"
>  -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ 
> label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" 
> ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ 
> label = "Map_2[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n 
> dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "Reducer_3[ReduceTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ 
> label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n 
> dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales"
>  [ label = "Map_8[store_sales]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales"
>  -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ 
> label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" 
> ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" 
> [ label = "Map_7[store]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label 
> = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" [ 
> label = "Map_5[d2]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label 
> = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ 
> label = "Reducer_4[ReduceTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4"
>  [ label = "Output [outputClass=MROutput,\n initializer=]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ label 
> = "Map_11[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ 
> label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n 
> dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" [ 
> label = "Map_1[d1]", shape = "box" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label 
> = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label 
> = "Map_7[MapTezProcessor]" ]; 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" -> 
> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ 
> label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n 
> dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; } 



--
This message was sent by Atlassian JIRA
(v6.2#6252)

Reply via email to