niketanpansare commented on issue #856: [SYSTEMML-540] Improve the performance of GPU lstm backward operator by passing the state URL: https://github.com/apache/systemml/pull/856#issuecomment-472928851 Nvprof profile for a sample run to test whether we avoid the lstm forward kernel in the backward pass: - With new PR: ``` ==367516== Profiling application: /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.161-0.b14.el7_4.x86_64/jre/bin/java -cp /home/npansar/spark-2.4.0-bin-hadoop2.7/conf/:/home/npansar/spark-2.4.0-bin-hadoop2.7/jars/* -Xmx200g org.apache.spark.deploy.SparkSubmit --conf spark.driver.memory=200g --jars ==367516== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 67.76% 8.50410s 27500 309.24us 168.38us 1.0000ms maxwell_dgemm_64x64_nn 20.92% 2.62489s 300 8.7496ms 415.00us 13.133ms maxwell_dgemm_64x64_nt 3.36% 421.96ms 200 2.1098ms 2.0941ms 2.3021ms cudnn::detail::initstates(unsigned __int64, curandStateXORWOW*, int) 2.63% 330.08ms 10000 33.008us 12.352us 90.047us void LSTM_elementWise_bp1<double, double, double>(int, int, double*, double*, double*, double*, double*, double*, double*, double*, double*, int, int) 2.43% 304.92ms 10000 30.492us 15.936us 133.73us void LSTM_elementWise_fp<double, double, double>(int, int, int, int, double const *, double const *, double const *, double*, double*, double*, double const *, double*, bool, int) 0.49% 62.108ms 100 621.08us 581.72us 641.05us void elementWise_bp2<double, double, double, int=4, int=4, bool=1>(int, int, double*, double*) 0.45% 56.944ms 100 569.44us 562.27us 603.67us prepare_lstm_weight_d 0.41% 50.906ms 100 509.06us 505.53us 511.77us prepare_lstm_dweight_d 0.34% 42.931ms 200 214.66us 158.94us 302.72us void transpose_readWrite_alignment_kernel<double, double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *) 0.32% 40.181ms 29413 1.3660us 800ns 11.008us [CUDA memset] 0.25% 31.902ms 9 3.5446ms 1.3440us 16.452ms [CUDA memcpy HtoD] 0.21% 26.835ms 600 
44.724us 1.8870us 142.85us reduce_sum_d 0.17% 21.635ms 100 216.35us 214.88us 218.01us prepare_lstm_input_d 0.16% 20.367ms 100 203.67us 202.40us 205.31us prepare_lstm_dinput_d 0.07% 8.8985ms 500 17.796us 1.0880us 44.960us [CUDA memcpy DtoH] 0.00% 512.64us 200 2.5630us 2.2080us 3.0720us [CUDA memcpy DtoD] 0.00% 456.99us 100 4.5690us 3.9680us 5.0880us prepare_lstm_backward_gradients_d ``` - With apache: ``` ==367108== Profiling application: /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.161-0.b14.el7_4.x86_64/jre/bin/java -cp /home/npansar/spark-2.4.0-bin-hadoop2.7/conf/:/home/npansar/spark-2.4.0-bin-hadoop2.7/jars/* -Xmx200g org.apache.spark.deploy.SparkSubmit --conf spark.driver.memory=200g --jars ==367108== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 73.70% 12.8474s 42500 302.29us 168.64us 992.73us maxwell_dgemm_64x64_nn 15.07% 2.62660s 300 8.7553ms 411.16us 13.323ms maxwell_dgemm_64x64_nt 4.02% 700.83ms 20000 35.041us 15.808us 141.57us void LSTM_elementWise_fp<double, double, double>(int, int, int, int, double const *, double const *, double const *, double*, double*, double*, double const *, double*, bool, int) 2.42% 422.03ms 200 2.1101ms 2.0900ms 2.3123ms cudnn::detail::initstates(unsigned __int64, curandStateXORWOW*, int) 1.98% 345.08ms 10000 34.508us 12.191us 94.623us void LSTM_elementWise_bp1<double, double, double>(int, int, double*, double*, double*, double*, double*, double*, double*, double*, double*, int, int) 0.62% 107.35ms 200 536.74us 533.05us 586.27us prepare_lstm_weight_d 0.47% 82.795ms 400 206.99us 156.38us 293.25us void transpose_readWrite_alignment_kernel<double, double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *) 0.37% 64.101ms 44913 1.4270us 799ns 9.9840us [CUDA memset] 0.36% 61.904ms 100 619.04us 569.11us 638.84us void elementWise_bp2<double, double, double, int=4, int=4, bool=1>(int, int, double*, double*) 0.29% 50.748ms 100 507.48us 502.84us 511.26us 
prepare_lstm_dweight_d 0.24% 41.002ms 200 205.01us 199.87us 209.53us prepare_lstm_input_d 0.15% 26.800ms 600 44.667us 1.8870us 142.69us reduce_sum_d 0.14% 24.334ms 9 2.7038ms 1.3120us 13.707ms [CUDA memcpy HtoD] 0.12% 20.365ms 100 203.65us 202.46us 205.37us prepare_lstm_dinput_d 0.05% 8.7729ms 500 17.545us 1.1200us 41.919us [CUDA memcpy DtoH] 0.00% 588.22us 200 2.9410us 2.3990us 3.7120us [CUDA memcpy DtoD] 0.00% 405.95us 100 4.0590us 3.5520us 6.7200us prepare_lstm_backward_gradients_d ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
