I'm comparing the performance of a model when using:
- `mxnet-cu100` with cuDNN
- TVM CUDA with `-libs=cudnn`
From my understanding the results should be basically the same, but instead TVM
is a lot slower. When compiling the model I can see the cuDNN log searching for
the best algorithm, so I think the setup is fine. After compiling the model I
use `module.module.time_evaluator` to measure inference time, and the result is:
- 1666 ms (vs. 338 ms with `mxnet-cu100`)
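For reference, my compile-and-measure flow looks roughly like this (a simplified sketch: the `resnet50_v1` placeholder and the 640x480 input shape stand in for my actual network):

```python
import numpy as np
import mxnet as mx
import tvm
from tvm import relay
from tvm.contrib import graph_runtime

# Placeholder model/input; my real network and shape go here.
block = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=True)
shape_dict = {"data": (1, 3, 640, 480)}
mod, params = relay.frontend.from_mxnet(block, shape_dict)

# "-libs=cudnn" asks TVM to offload supported ops to cuDNN.
target = "cuda -libs=cudnn"
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, target, params=params)

ctx = tvm.gpu(0)
module = graph_runtime.create(graph, lib, ctx)
module.set_input(**params)
module.set_input("data", np.random.uniform(size=shape_dict["data"]).astype("float32"))

# time_evaluator runs the whole graph repeatedly and reports averaged timings.
ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=3)
print("mean inference time: %.2f ms" % (np.mean(ftimer().results) * 1e3))
```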
I also tried the debug runtime (created as sketched after the table), with the following results:
```
Node Name                                           Ops                                                 Time(us)    Time(%) Shape               Inputs  Outputs
---------                                           ---                                                 --------    ------- -----               ------  -------
fused_nn_conv2d_transpose_multiply_add_nn_relu      fused_nn_conv2d_transpose_multiply_add_nn_relu      1144440.0   68.686  (1, 256, 640, 480)  4       1
fused_nn_conv2d_transpose_multiply_add_nn_relu_1    fused_nn_conv2d_transpose_multiply_add_nn_relu_1    255461.0    15.332  (1, 256, 320, 240)  4       1
fused_nn_conv2d_transpose_multiply_add_nn_relu_3    fused_nn_conv2d_transpose_multiply_add_nn_relu_3    126896.0    7.616   (1, 256, 80, 60)    4       1
fused_nn_conv2d_transpose_multiply_add_nn_relu_2    fused_nn_conv2d_transpose_multiply_add_nn_relu_2    58891.6     3.535   (1, 256, 160, 120)  4       1
fused_nn_conv2d_add_nn_relu                         fused_nn_conv2d_add_nn_relu                         27347.4     1.641   (1, 256, 640, 480)  3       1
fused_nn_conv2d_add                                 fused_nn_conv2d_add                                 4067.5      0.244   (1, 64, 320, 240)   3       1
fused_nn_conv2d_add_nn_relu_1                       fused_nn_conv2d_add_nn_relu_1                       3312.35     0.199   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_12                      fused_nn_conv2d_add_nn_relu_1                       3256.99     0.195   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_11                      fused_nn_conv2d_add_nn_relu_1                       3228.79     0.194   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_4                               fused_nn_conv2d_add_4                               2783.87     0.167   (1, 2048, 40, 30)   3       1
fused_nn_conv2d_add_add_nn_relu1                    fused_nn_conv2d_add_add_nn_relu                     1652.69     0.099   (1, 2048, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu2                    fused_nn_conv2d_add_add_nn_relu                     1651.54     0.099   (1, 2048, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu                     fused_nn_conv2d_add_add_nn_relu                     1650.41     0.099   (1, 2048, 40, 30)   4       1
fused_nn_conv2d_add_nn_relu_2                       fused_nn_conv2d_add_nn_relu_2                       1437.18     0.086   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_21                      fused_nn_conv2d_add_nn_relu_2                       1421.56     0.085   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_add_nn_relu_31                  fused_nn_conv2d_add_add_nn_relu_3                   1101.8      0.066   (1, 256, 160, 120)  4       1
fused_nn_conv2d_add_add_nn_relu_3                   fused_nn_conv2d_add_add_nn_relu_3                   1101.62     0.066   (1, 256, 160, 120)  4       1
fused_nn_conv2d_add_add_nn_relu_32                  fused_nn_conv2d_add_add_nn_relu_3                   1094.64     0.066   (1, 256, 160, 120)  4       1
fused_nn_conv2d_add_2                               fused_nn_conv2d_add_2                               932.318     0.056   (1, 512, 80, 60)    3       1
fused_nn_conv2d_add_1                               fused_nn_conv2d_add_1                               910.287     0.055   (1, 256, 160, 120)  3       1
fused_nn_conv2d_add_nn_relu_111                     fused_nn_conv2d_add_nn_relu_11                      902.002     0.054   (1, 128, 160, 120)  3       1
fused_nn_conv2d_add_nn_relu_6                       fused_nn_conv2d_add_nn_relu_6                       881.889     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_44                      fused_nn_conv2d_add_nn_relu_4                       877.233     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_41                      fused_nn_conv2d_add_nn_relu_4                       877.187     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_42                      fused_nn_conv2d_add_nn_relu_4                       875.494     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_43                      fused_nn_conv2d_add_nn_relu_4                       875.401     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_4                       fused_nn_conv2d_add_nn_relu_4                       874.796     0.053   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_10                      fused_nn_conv2d_add_nn_relu_10                      846.727     0.051   (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_3                               fused_nn_conv2d_add_3                               826.964     0.05    (1, 1024, 40, 30)   3       1
fused_nn_conv2d_add_nn_relu_7                       fused_nn_conv2d_add_nn_relu_7                       777.48      0.047   (1, 256, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_3                       fused_nn_conv2d_add_nn_relu_3                       747.369     0.045   (1, 512, 40, 30)    3       1
fused_nn_conv2d_add_add_nn_relu_2                   fused_nn_conv2d_add_add_nn_relu_2                   702.484     0.042   (1, 512, 80, 60)    4       1
fused_nn_conv2d_add_add_nn_relu_22                  fused_nn_conv2d_add_add_nn_relu_2                   701.242     0.042   (1, 512, 80, 60)    4       1
fused_nn_conv2d_add_add_nn_relu_21                  fused_nn_conv2d_add_add_nn_relu_2                   700.993     0.042   (1, 512, 80, 60)    4       1
fused_nn_conv2d_add_add_nn_relu_23                  fused_nn_conv2d_add_add_nn_relu_2                   697.114     0.042   (1, 512, 80, 60)    4       1
fused_nn_conv2d_add_nn_relu_123                     fused_nn_conv2d_add_nn_relu_12                      610.56      0.037   (1, 64, 160, 120)   3       1
fused_nn_conv2d_add_nn_relu_122                     fused_nn_conv2d_add_nn_relu_12                      596.89      0.036   (1, 64, 160, 120)   3       1
fused_nn_conv2d_add_nn_relu_121                     fused_nn_conv2d_add_nn_relu_12                      594.063     0.036   (1, 64, 160, 120)   3       1
fused_nn_conv2d_add_add_nn_relu_1                   fused_nn_conv2d_add_add_nn_relu_1                   529.202     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu_12                  fused_nn_conv2d_add_add_nn_relu_1                   529.186     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu_14                  fused_nn_conv2d_add_add_nn_relu_1                   528.849     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu_13                  fused_nn_conv2d_add_add_nn_relu_1                   528.555     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu_15                  fused_nn_conv2d_add_add_nn_relu_1                   528.112     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_add_nn_relu_11                  fused_nn_conv2d_add_add_nn_relu_1                   527.938     0.032   (1, 1024, 40, 30)   4       1
fused_nn_conv2d_add_nn_relu_131                     fused_nn_conv2d_add_nn_relu_13                      514.069     0.031   (1, 64, 160, 120)   3       1
fused_nn_conv2d_add_nn_relu_13                      fused_nn_conv2d_add_nn_relu_13                      514.053     0.031   (1, 64, 160, 120)   3       1
fused_nn_conv2d_add_nn_relu_8                       fused_nn_conv2d_add_nn_relu_8                       508.776     0.031   (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_82                      fused_nn_conv2d_add_nn_relu_8                       506.434     0.03    (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_81                      fused_nn_conv2d_add_nn_relu_8                       503.658     0.03    (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_92                      fused_nn_conv2d_add_nn_relu_9                       421.353     0.025   (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_9                       fused_nn_conv2d_add_nn_relu_9                       418.987     0.025   (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_91                      fused_nn_conv2d_add_nn_relu_9                       417.537     0.025   (1, 128, 80, 60)    3       1
fused_nn_conv2d_add_nn_relu_54                      fused_nn_conv2d_add_nn_relu_5                       416.805     0.025   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_51                      fused_nn_conv2d_add_nn_relu_5                       416.662     0.025   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_53                      fused_nn_conv2d_add_nn_relu_5                       415.586     0.025   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_5                       fused_nn_conv2d_add_nn_relu_5                       415.565     0.025   (1, 256, 40, 30)    3       1
fused_nn_conv2d_add_nn_relu_52                      fused_nn_conv2d_add_nn_relu_5                       415.012     0.025   (1, 256, 40, 30)    3       1
fused_nn_max_pool2d_nn_relu                         fused_nn_max_pool2d_nn_relu                         262.382     0.016   (1, 64, 160, 120)   1       1
fused_nn_conv2d_add_nn_relu_14                      fused_nn_conv2d_add_nn_relu_14                      260.942     0.016   (1, 64, 160, 120)   3       1
Total_time                                          -                                                   1666185.096 -       -                   -       -
23.942410945892334
```
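The breakdown above comes from the debug runtime, created roughly like this (same `graph`, `lib`, `ctx`, and input as in the build sketch above):

```python
from tvm.contrib.debugger import debug_runtime

# Drop-in replacement for graph_runtime that times every node and
# prints the per-op table shown above when run() is called.
m = debug_runtime.create(graph, lib, ctx)
m.set_input(**params)
m.set_input("data", np.random.uniform(size=shape_dict["data"]).astype("float32"))
m.run()
```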
If I remove the `conv2d_transpose` layers from the model, performance is as
follows:
- TVM CUDA autotuned: 40ms
- `mxnet-cu100`: 60ms
It's clear that TVM's `conv2d_transpose` implementation is the bottleneck, but
why can't I reproduce the same performance when specifying `-libs=cudnn`?
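For context, the "TVM CUDA autotuned" number above comes from a standard AutoTVM flow over the `conv2d`-only variant of the model, along these lines (a sketch continuing from the build snippet; tuner settings and the `tune.log` file name are illustrative):

```python
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner

# Extract one tuning task per unique conv2d workload in the model.
tasks = autotvm.task.extract_from_program(
    mod["main"], target="cuda", params=params,
    ops=(relay.op.get("nn.conv2d"),))

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=100))

for task in tasks:
    tuner = XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file("tune.log")])

# Rebuild with the best schedules found during tuning.
with autotvm.apply_history_best("tune.log"):
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod, "cuda", params=params)
```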