LeiWang1999 commented on PR #15462:
URL: https://github.com/apache/tvm/pull/15462#issuecomment-1663807057
Done, and we get roughly a 5x speedup.
```bash
Time cost is: 8.849430084228516 ms
Name                                                  Duration (us)  Percent  Device  Count  Argument Shapes
fused_relax_nn_conv2d_relax_add_relax_nn_relu_cudnn         1124.35    13.64   cuda0      1  float16[128, 224, 224, 3], float16[64, 7, 7, 3], float16[1, 1, 1, 64], float16[128, 112, 112, 64]
fused_relax_nn_conv2d_relax_add_cudnn                        680.29     8.25   cuda0      2  float16[128, 56, 56, 64], float16[64, 3, 3, 64], float16[1, 1, 1, 64], float16[128, 56, 56, 64]
fused_relax_nn_conv2d_relax_add_relax_nn_relu1_cudnn         665.60     8.07   cuda0      2  float16[128, 56, 56, 64], float16[64, 3, 3, 64], float16[1, 1, 1, 64], float16[128, 56, 56, 64]
add                                                          528.38     6.41   cuda0      2  float16[128, 56, 56, 64], float16[128, 56, 56, 64], float16[128, 56, 56, 64]
fused_relax_nn_conv2d_relax_add5_cudnn                       519.17     6.30   cuda0      2  float16[128, 7, 7, 512], float16[512, 3, 3, 512], float16[1, 1, 1, 512], float16[128, 7, 7, 512]
fused_relax_nn_conv2d_relax_add1_cudnn                       508.93     6.17   cuda0      2  float16[128, 28, 28, 128], float16[128, 3, 3, 128], float16[1, 1, 1, 128], float16[128, 28, 28, 128]
fused_relax_nn_conv2d_relax_add3_cudnn                       474.24     5.75   cuda0      2  float16[128, 14, 14, 256], float16[256, 3, 3, 256], float16[1, 1, 1, 256], float16[128, 14, 14, 256]
relu                                                         444.41     5.39   cuda0      2  float16[128, 56, 56, 64], float16[128, 56, 56, 64]
max_pool2d                                                   389.12     4.72   cuda0      1  float16[128, 112, 112, 64], float16[128, 56, 56, 64]
add1                                                         268.29     3.25   cuda0      2  float16[128, 28, 28, 128], float16[128, 28, 28, 128], float16[128, 28, 28, 128]
fused_relax_nn_conv2d_relax_add_relax_nn_relu7_cudnn         257.02     3.12   cuda0      1  float16[128, 7, 7, 512], float16[512, 3, 3, 512], float16[1, 1, 1, 512], float16[128, 7, 7, 512]
fused_relax_nn_conv2d_relax_add_relax_nn_relu3_cudnn         252.26     3.06   cuda0      1  float16[128, 28, 28, 128], float16[128, 3, 3, 128], float16[1, 1, 1, 128], float16[128, 28, 28, 128]
fused_relax_nn_conv2d_relax_add_relax_nn_relu5_cudnn         236.54     2.87   cuda0      1  float16[128, 14, 14, 256], float16[256, 3, 3, 256], float16[1, 1, 1, 256], float16[128, 14, 14, 256]
relu1                                                        225.28     2.73   cuda0      2  float16[128, 28, 28, 128], float16[128, 28, 28, 128]
transpose                                                    179.20     2.17   cuda0      1  float16[128, 3, 224, 224], float16[128, 224, 224, 3]
fused_relax_nn_conv2d_relax_add_relax_nn_relu2_cudnn         176.13     2.14   cuda0      1  float16[128, 56, 56, 64], float16[128, 3, 3, 64], float16[1, 1, 1, 128], float16[128, 28, 28, 128]
fused_relax_nn_conv2d_relax_add2_cudnn                       172.03     2.09   cuda0      1  float16[128, 56, 56, 64], float16[128, 1, 1, 64], float16[1, 1, 1, 128], float16[128, 28, 28, 128]
fused_relax_nn_conv2d_relax_add4_cudnn                       157.69     1.91   cuda0      1  float16[128, 28, 28, 128], float16[256, 1, 1, 128], float16[1, 1, 1, 256], float16[128, 14, 14, 256]
fused_relax_nn_conv2d_relax_add_relax_nn_relu4_cudnn         154.62     1.88   cuda0      1  float16[128, 28, 28, 128], float16[256, 3, 3, 128], float16[1, 1, 1, 256], float16[128, 14, 14, 256]
fused_relax_nn_conv2d_relax_add6_cudnn                       147.46     1.79   cuda0      1  float16[128, 14, 14, 256], float16[512, 1, 1, 256], float16[1, 1, 1, 512], float16[128, 7, 7, 512]
add2                                                         140.29     1.70   cuda0      2  float16[128, 14, 14, 256], float16[128, 14, 14, 256], float16[128, 14, 14, 256]
fused_relax_nn_conv2d_relax_add_relax_nn_relu6_cudnn         135.17     1.64   cuda0      1  float16[128, 14, 14, 256], float16[512, 3, 3, 256], float16[1, 1, 1, 512], float16[128, 7, 7, 512]
relu2                                                        116.96     1.42   cuda0      2  float16[128, 14, 14, 256], float16[128, 14, 14, 256]
matmul                                                        76.80     0.93   cuda0      1  float16[128, 512], float16[512, 1000], float16[128, 1000]
add3                                                          67.58     0.82   cuda0      2  float16[128, 7, 7, 512], float16[128, 7, 7, 512], float16[128, 7, 7, 512]
relu3                                                         52.41     0.64   cuda0      2  float16[128, 7, 7, 512], float16[128, 7, 7, 512]
adaptive_avg_pool2d                                           17.41     0.21   cuda0      2  float16[128, 7, 7, 512], float16[128, 1, 1, 512]
add4                                                           5.12     0.06   cuda0      1  float16[128, 1000], float16[1000], float16[128, 1000]
vm.builtin.check_tensor_info                                   2.05     0.02   cuda0      1  float16[128, 3, 224, 224]
vm.builtin.reshape                                             2.05     0.02   cuda0      1  float16[128, 512]
vm.builtin.match_shape                                         1.02     0.01   cuda0      1  float16[128, 3, 224, 224]
vm.builtin.reshape                                             1.02     0.01   cuda0      1  float16[128, 1, 1, 512]
vm.builtin.reshape                                             1.02     0.01   cuda0      1  float16[128, 512, 1, 1]
----------
Sum                                                         8179.91    99.23              46
Total                                                       6316.38             cpu0       1
Total                                                       8243.20            cuda0       1
Configuration
-------------
Number of threads: 32
Executor: VM
```
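For context, a report like this can be collected with the Relax VM's built-in profiler. Below is a minimal sketch, assuming `mod` is the Relax IRModule already partitioned for the cuDNN BYOC backend and that its entry function is named `main`; the actual script behind the numbers above may differ.

```python
# Minimal profiling sketch (assumes `mod` is a Relax IRModule that has already
# been partitioned/offloaded to cuDNN; the variable names are illustrative).
import numpy as np
import tvm
from tvm import relax

target = tvm.target.Target("cuda")
ex = relax.build(mod, target)                      # compile the Relax module
dev = tvm.cuda(0)
vm = relax.VirtualMachine(ex, dev, profile=True)   # profiling-enabled VM

data = tvm.nd.array(
    np.random.uniform(size=(128, 3, 224, 224)).astype("float16"), dev
)
report = vm.profile("main", data)                  # per-op Duration / Percent / Argument Shapes
print(report)
```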