trevor-m opened a new issue #6691: URL: https://github.com/apache/incubator-tvm/issues/6691
I've started noticing a large performance regression affecting Keras MobileNetV2 caused by `INDEX_DEFAULT_I64=ON` (PR #6143). This is on an AWS m5.12xlarge instance.

INDEX_DEFAULT_I64 | Frames per second
------------ | -------------
ON | 66.56
OFF | 435.49

I profiled the ops and found the slowdown comes from the `fused_nn_contrib_conv2d_NCHWc_add_add_1` and `fused_nn_contrib_conv2d_NCHWc_add_add_2` ops, which together account for roughly 88% of the total time in the slow profile below.

## Profile with `INDEX_DEFAULT_I64=OFF` (fast)
```
Node Name Ops Time(us) Time(%) Shape Inputs Outputs --------- --- -------- ------- ----- ------ ------- fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 64.704 3.571 (1, 9, 56, 56, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 53.362 2.945 (1, 2, 112, 112, 16) 3 1 fused_nn_pad_3 fused_nn_pad_3 50.582 2.791 (1, 6, 113, 113, 16) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 47.874 2.642 (1, 6, 56, 56, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_6 fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.828 2.584 (1, 6, 112, 112, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 42.364 2.338 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 39.554 2.183 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 39.418 2.175 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_4 fused_nn_contrib_conv2d_NCHWc_add_add_4 38.871 2.145 (1, 2, 56, 56, 12) 4 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 37.926 2.093 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_5 fused_nn_contrib_conv2d_NCHWc_add_clip_5 37.407 2.064 (1, 9, 56, 56, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_51 fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.349 1.951 (1, 9, 56, 56, 16) 3 1 
fused_nn_contrib_conv2d_NCHWc_add_clip fused_nn_contrib_conv2d_NCHWc_add_clip 34.692 1.915 (1, 80, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_6 fused_nn_contrib_conv2d_NCHWc_add_6 34.052 1.879 (1, 1, 112, 112, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add fused_nn_contrib_conv2d_NCHWc_add 33.58 1.853 (1, 20, 7, 7, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.298 1.838 (1, 24, 14, 14, 16) 3 1 fused_nn_pad_2 fused_nn_pad_2 33.201 1.832 (1, 9, 57, 57, 16) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.057 1.824 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.027 1.823 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.787 1.809 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_5 fused_nn_contrib_conv2d_NCHWc_add_5 32.332 1.784 (1, 2, 56, 56, 12) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 32.156 1.775 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 31.68 1.748 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.832 1.701 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_7 fused_nn_contrib_conv2d_NCHWc_add_clip_7 30.521 1.684 (1, 2, 112, 112, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_11 fused_nn_contrib_conv2d_NCHWc_add_add_1 30.012 1.656 (1, 6, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_1 fused_nn_contrib_conv2d_NCHWc_add_add_1 29.914 1.651 (1, 6, 14, 14, 16) 4 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 28.642 1.581 (1, 9, 28, 28, 16) 3 1 fused_nn_global_avg_pool2d 
fused_nn_global_avg_pool2d 28.552 1.576 (1, 80, 1, 1, 16) 1 1 fused_layout_transform_40 fused_layout_transform_40 26.741 1.476 (1, 8, 56, 56, 12) 1 1 fused_layout_transform_41 fused_layout_transform_41 25.793 1.423 (1, 12, 56, 56, 12) 1 1 fused_nn_contrib_conv2d_NCHWc_add_add1 fused_nn_contrib_conv2d_NCHWc_add_add 25.759 1.422 (1, 10, 7, 7, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_2 fused_nn_contrib_conv2d_NCHWc_add_add_2 25.566 1.411 (1, 4, 14, 14, 16) 4 1 fused_nn_dense_add fused_nn_dense_add 25.52 1.408 (1, 1000) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add fused_nn_contrib_conv2d_NCHWc_add_add 25.391 1.401 (1, 10, 7, 7, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_21 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.345 1.399 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_2 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.262 1.394 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_22 fused_nn_contrib_conv2d_NCHWc_add_clip_2 24.895 1.374 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_3 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.679 1.362 (1, 2, 28, 28, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_31 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.553 1.355 (1, 2, 28, 28, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_2 fused_nn_contrib_conv2d_NCHWc_add_2 23.364 1.289 (1, 6, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_21 fused_nn_contrib_conv2d_NCHWc_add_add_2 23.264 1.284 (1, 4, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_22 fused_nn_contrib_conv2d_NCHWc_add_add_2 23.006 1.27 (1, 4, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_11 fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.724 1.254 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_32 fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.722 1.254 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_41 fused_nn_contrib_conv2d_NCHWc_add_clip_4 22.522 1.243 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_1 
fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.247 1.228 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_33 fused_nn_contrib_conv2d_NCHWc_add_clip_3 21.648 1.195 (1, 24, 14, 14, 16) 3 1 fused_nn_pad fused_nn_pad 21.439 1.183 (1, 36, 15, 15, 16) 1 1 fused_nn_contrib_conv2d_NCHWc_add_clip_12 fused_nn_contrib_conv2d_NCHWc_add_clip_1 21.437 1.183 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_4 fused_nn_contrib_conv2d_NCHWc_add_4 21.426 1.182 (1, 2, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_1 fused_nn_contrib_conv2d_NCHWc_add_1 21.227 1.171 (1, 10, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_31 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.739 1.145 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_3 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.719 1.143 (1, 24, 14, 14, 16) 3 1 fused_nn_softmax fused_nn_softmax 19.798 1.093 (1, 1000) 1 1 fused_nn_contrib_conv2d_NCHWc_add_clip_42 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.751 1.09 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_4 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.679 1.086 (1, 12, 28, 28, 16) 3 1 fused_nn_pad_1 fused_nn_pad_1 18.729 1.034 (1, 12, 29, 29, 16) 1 1 fused_nn_contrib_conv2d_NCHWc_add_3 fused_nn_contrib_conv2d_NCHWc_add_3 18.411 1.016 (1, 4, 14, 14, 16) 3 1 fused_nn_pad_layout_transform fused_nn_pad_layout_transform 18.159 1.002 (1, 1, 225, 225, 3) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.938 0.88 (1, 12, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 15.438 0.852 (1, 36, 7, 7, 16) 3 1 fused_layout_transform_transpose_nn_batch_flatten fused_layout_transform_transpose_nn_batch_flatten 1.563 0.086 (1, 1280) 1 1 Total_time - 1812.033 - - - - ``` ## Profile with `INDEX_DEFAULT_I64=ON` (slow) ``` Node Name Ops Time(us) Time(%) Shape Inputs Outputs --------- --- -------- ------- ----- ------ ------- 
fused_nn_contrib_conv2d_NCHWc_add_add_1 fused_nn_contrib_conv2d_NCHWc_add_add_1 3105.8 21.391 (1, 6, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_11 fused_nn_contrib_conv2d_NCHWc_add_add_1 3104.62 21.382 (1, 6, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_2 fused_nn_contrib_conv2d_NCHWc_add_add_2 2200.03 15.152 (1, 4, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_21 fused_nn_contrib_conv2d_NCHWc_add_add_2 2189.84 15.082 (1, 4, 14, 14, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_add_22 fused_nn_contrib_conv2d_NCHWc_add_add_2 2185.71 15.054 (1, 4, 14, 14, 16) 4 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 60.094 0.414 (1, 9, 56, 56, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 52.82 0.364 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 51.393 0.354 (1, 2, 112, 112, 16) 3 1 fused_nn_pad_3 fused_nn_pad_3 51.19 0.353 (1, 6, 113, 113, 16) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 49.058 0.338 (1, 6, 56, 56, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_6 fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.637 0.321 (1, 6, 112, 112, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 43.381 0.299 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 40.165 0.277 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.355 0.271 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.205 0.27 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_4 fused_nn_contrib_conv2d_NCHWc_add_add_4 38.595 0.266 (1, 2, 56, 56, 
12) 4 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 38.019 0.262 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 37.559 0.259 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_5 fused_nn_contrib_conv2d_NCHWc_add_clip_5 36.159 0.249 (1, 9, 56, 56, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_51 fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.269 0.243 (1, 9, 56, 56, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip fused_nn_contrib_conv2d_NCHWc_add_clip 34.755 0.239 (1, 80, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_2 fused_nn_contrib_conv2d_NCHWc_add_2 34.248 0.236 (1, 6, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_6 fused_nn_contrib_conv2d_NCHWc_add_6 33.65 0.232 (1, 1, 112, 112, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_7 fused_nn_contrib_conv2d_NCHWc_add_clip_7 33.163 0.228 (1, 2, 112, 112, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.593 0.224 (1, 24, 14, 14, 16) 3 1 fused_nn_pad_2 fused_nn_pad_2 32.542 0.224 (1, 9, 57, 57, 16) 1 1 fused_nn_contrib_conv2d_NCHWc_add fused_nn_contrib_conv2d_NCHWc_add 32.471 0.224 (1, 20, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_5 fused_nn_contrib_conv2d_NCHWc_add_5 31.587 0.218 (1, 2, 56, 56, 12) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.659 0.211 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.109 0.207 (1, 60, 7, 7, 16) 3 1 fused_nn_pad fused_nn_pad 29.258 0.202 (1, 36, 15, 15, 16) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 29.083 0.2 (1, 9, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_2 fused_nn_contrib_conv2d_NCHWc_add_clip_2 28.273 0.195 (1, 36, 14, 14, 16) 3 1 
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 28.052 0.193 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_22 fused_nn_contrib_conv2d_NCHWc_add_clip_2 27.855 0.192 (1, 36, 14, 14, 16) 3 1 fused_layout_transform_40 fused_layout_transform_40 27.811 0.192 (1, 8, 56, 56, 12) 1 1 fused_nn_global_avg_pool2d fused_nn_global_avg_pool2d 27.724 0.191 (1, 80, 1, 1, 16) 1 1 fused_layout_transform_41 fused_layout_transform_41 27.308 0.188 (1, 12, 56, 56, 12) 1 1 fused_nn_dense_add fused_nn_dense_add 26.655 0.184 (1, 1000) 3 1 fused_nn_contrib_conv2d_NCHWc_add_1 fused_nn_contrib_conv2d_NCHWc_add_1 26.406 0.182 (1, 10, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add fused_nn_contrib_conv2d_NCHWc_add_add 25.447 0.175 (1, 10, 7, 7, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_21 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.433 0.175 (1, 36, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add1 fused_nn_contrib_conv2d_NCHWc_add_add 25.276 0.174 (1, 10, 7, 7, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_11 fused_nn_contrib_conv2d_NCHWc_add_clip_1 24.78 0.171 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_31 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.132 0.166 (1, 2, 28, 28, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_12 fused_nn_contrib_conv2d_NCHWc_add_clip_1 23.359 0.161 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_add_3 fused_nn_contrib_conv2d_NCHWc_add_add_3 23.226 0.16 (1, 2, 28, 28, 16) 4 1 fused_nn_contrib_conv2d_NCHWc_add_clip_31 fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.999 0.158 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_1 fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.372 0.154 (1, 60, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_41 fused_nn_contrib_conv2d_NCHWc_add_clip_4 21.948 0.151 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_4 fused_nn_contrib_conv2d_NCHWc_add_4 21.359 0.147 (1, 2, 28, 28, 16) 3 1 
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 21.269 0.146 (1, 36, 7, 7, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_33 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.916 0.144 (1, 24, 14, 14, 16) 3 1 fused_nn_softmax fused_nn_softmax 20.415 0.141 (1, 1000) 1 1 fused_nn_contrib_conv2d_NCHWc_add_clip_3 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.37 0.14 (1, 24, 14, 14, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_4 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.395 0.134 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_clip_32 fused_nn_contrib_conv2d_NCHWc_add_clip_3 19.306 0.133 (1, 24, 14, 14, 16) 3 1 fused_nn_pad_1 fused_nn_pad_1 19.284 0.133 (1, 12, 29, 29, 16) 1 1 fused_nn_contrib_conv2d_NCHWc_add_clip_42 fused_nn_contrib_conv2d_NCHWc_add_clip_4 18.807 0.13 (1, 12, 28, 28, 16) 3 1 fused_nn_contrib_conv2d_NCHWc_add_3 fused_nn_contrib_conv2d_NCHWc_add_3 17.728 0.122 (1, 4, 14, 14, 16) 3 1 fused_nn_pad_layout_transform fused_nn_pad_layout_transform 15.683 0.108 (1, 1, 225, 225, 3) 1 1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.236 0.105 (1, 12, 14, 14, 16) 3 1 fused_layout_transform_transpose_nn_batch_flatten fused_layout_transform_transpose_nn_batch_flatten 1.607 0.011 (1, 1280) 1 1 Total_time - 14519.449 - - - - ``` Here is a script to reproduce: ``` import time import numpy as np import tvm from tvm import relay from tvm.contrib import graph_runtime import tensorflow as tf input_shape = (1, 3, 224, 224) model = tf.keras.applications.MobileNetV2() mod, params = relay.frontend.from_keras(model, shape={'input_1': input_shape}) dtype = 'float32' with relay.build_config(opt_level=3): graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512", params=params) i_data = np.random.uniform(0, 1, input_shape).astype(dtype) mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) mod.set_input(**params) # Time times = [] for i in range(100): 
start_time = time.time() mod.run(input_1=i_data) res = mod.get_output(0) times.append(time.time() - start_time) print('Mean latency:', 1000.0 * np.mean(times[10:])) print('Mean FPS:', 1.0 / np.mean(times[10:])) ``` Thanks! ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org