[SYSTEMML-540] Improved the performance of bias_add and added relu_backward (CP + GPU)
Closes #337. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/afe61b5a Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/afe61b5a Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/afe61b5a Branch: refs/heads/master Commit: afe61b5a295704f3a994f522a2f91d9c242d5c4c Parents: b0fb707 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Tue Jan 10 08:35:56 2017 -0800 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Tue Jan 10 08:35:56 2017 -0800 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 12 +- src/main/cpp/kernels/SystemML.ptx | 2410 ++++++------------ .../java/org/apache/sysml/hops/BinaryOp.java | 18 +- .../apache/sysml/lops/ConvolutionTransform.java | 17 +- .../instructions/CPInstructionParser.java | 1 + .../instructions/GPUInstructionParser.java | 1 + .../cp/ConvolutionCPInstruction.java | 30 +- .../gpu/ConvolutionGPUInstruction.java | 24 +- .../runtime/matrix/data/LibMatrixCUDA.java | 18 + .../sysml/runtime/matrix/data/LibMatrixDNN.java | 139 +- .../functions/tensor/ReluBackwardTest.java | 117 + .../scripts/functions/tensor/ReluBackwardTest.R | 30 + .../functions/tensor/ReluBackwardTest.dml | 27 + 13 files changed, 1215 insertions(+), 1629 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 1812b6a..7e32f0e 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -20,7 +20,7 @@ /********************************** When updating a kernel or adding a new one, please compile the ptx file and commit it: -nvcc -ptx SystemML.cu +nvcc -ptx -arch=sm_30 SystemML.cu ***********************************/ #include <cfloat> @@ -116,6 +116,16 @@ __global__ void relu(double* A, double* ret, int rlen, int clen) { } } +extern "C" +__global__ void relu_backward(double* X, double* dout, double* ret, int rlen, int clen) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + if(ix < rlen && iy < clen) { + int index = ix * clen + iy; + ret[index] = X[index] > 0 ? dout[index] : 0; + } +} + // Compares the value and set extern "C" __global__ void compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 99d5898..e30e00a 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,859 +1,24 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-19856038 -// Cuda compilation tools, release 7.5, V7.5.17 +// Compiler Build ID: CL-21124049 +// Cuda compilation tools, release 8.0, V8.0.44 // Based on LLVM 3.4svn // -.version 4.3 +.version 5.0 .target sm_30 .address_size 64 - // .globl getBoolean -.func (.param .b64 func_retval0) __internal_accurate_pow -( - .param .b64 __internal_accurate_pow_param_0, - .param .b64 __internal_accurate_pow_param_1 -) -; -.extern .shared .align 8 .b8 sdata[]; - -.visible .func (.param .b64 func_retval0) getBoolean( - .param .b32 getBoolean_param_0 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<2>; - .reg .f64 %fd<2>; - - - ld.param.u32 %r1, [getBoolean_param_0]; - setp.eq.s32 %p1, %r1, 0; - selp.f64 %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1; - st.param.f64 [func_retval0+0], %fd1; - ret; -} - - // .globl binaryOp -.visible .func (.param .b64 func_retval0) binaryOp( - .param .b64 binaryOp_param_0, - .param .b64 binaryOp_param_1, - .param .b32 binaryOp_param_2 -) -{ - .reg .pred %p<41>; - .reg .b32 %r<30>; - .reg .f64 %fd<40>; - .reg .b64 %rd<3>; - - - ld.param.f64 %fd26, [binaryOp_param_0]; - ld.param.f64 %fd27, [binaryOp_param_1]; - ld.param.u32 %r3, [binaryOp_param_2]; - setp.eq.s32 %p2, %r3, 0; - @%p2 bra BB1_40; - - setp.eq.s32 %p3, %r3, 1; - @%p3 bra BB1_39; - bra.uni BB1_2; - -BB1_39: - sub.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; - -BB1_40: - add.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; - -BB1_2: - setp.eq.s32 %p4, %r3, 2; - @%p4 bra BB1_38; - bra.uni BB1_3; - -BB1_38: - mul.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; - -BB1_3: - setp.eq.s32 %p5, %r3, 3; - @%p5 bra BB1_37; - bra.uni BB1_4; - -BB1_37: - div.rn.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; - -BB1_4: - setp.eq.s32 %p6, %r3, 4; - @%p6 bra BB1_21; - bra.uni BB1_5; - -BB1_21: - { - .reg .b32 %temp; - mov.b64 {%temp, %r1}, %fd26; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r2}, %fd27; - } - bfe.u32 %r4, %r2, 20, 11; - add.s32 %r5, %r4, -1012; - mov.b64 %rd2, %fd27; - shl.b64 %rd1, %rd2, %r5; - setp.eq.s64 %p21, %rd1, -9223372036854775808; - abs.f64 %fd9, %fd26; - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - // <end>} - .param .b64 param0; - st.param.f64 [param0+0], %fd9; - .param .b64 param1; - st.param.f64 [param1+0], %fd27; - .param .b64 retval0; - call.uni (retval0), - __internal_accurate_pow, - ( - param0, - param1 - ); - ld.param.f64 %fd38, [retval0+0]; - - //{ - }// Callseq End 0 - setp.lt.s32 %p22, %r1, 0; - and.pred %p1, %p22, %p21; - @!%p1 bra BB1_23; - bra.uni BB1_22; - -BB1_22: - { - .reg .b32 %temp; - mov.b64 {%temp, %r6}, %fd38; - } - xor.b32 %r7, %r6, -2147483648; - { - .reg .b32 %temp; - mov.b64 {%r8, %temp}, %fd38; - } - mov.b64 %fd38, {%r8, %r7}; - -BB1_23: - mov.f64 %fd37, %fd38; - setp.eq.f64 %p23, %fd26, 0d0000000000000000; - @%p23 bra BB1_26; - bra.uni BB1_24; - -BB1_26: - selp.b32 %r9, %r1, 0, %p21; - or.b32 %r10, %r9, 2146435072; - setp.lt.s32 %p27, %r2, 0; - selp.b32 %r11, %r10, %r9, %p27; - mov.u32 %r12, 0; - mov.b64 %fd37, {%r12, %r11}; - bra.uni BB1_27; - -BB1_5: - setp.eq.s32 %p7, %r3, 5; - @%p7 bra BB1_20; - bra.uni BB1_6; - -BB1_20: - setp.lt.f64 %p20, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20; - bra.uni BB1_41; - -BB1_6: - setp.eq.s32 %p8, %r3, 6; - @%p8 bra BB1_19; - bra.uni BB1_7; - -BB1_19: - setp.le.f64 %p19, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19; - bra.uni BB1_41; - -BB1_24: - setp.gt.s32 %p24, %r1, -1; - @%p24 bra BB1_27; - - cvt.rzi.f64.f64 %fd29, %fd27; - setp.neu.f64 %p25, %fd29, %fd27; - selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p25; - -BB1_27: - mov.f64 %fd15, %fd37; - add.f64 %fd16, %fd26, %fd27; - { - .reg .b32 %temp; - mov.b64 {%temp, %r13}, %fd16; - } - and.b32 %r14, %r13, 2146435072; - setp.ne.s32 %p28, %r14, 2146435072; - mov.f64 %fd36, %fd15; - @%p28 bra BB1_36; - - setp.gtu.f64 %p29, %fd9, 0d7FF0000000000000; - mov.f64 %fd36, %fd16; - @%p29 bra BB1_36; - - abs.f64 %fd30, %fd27; - setp.gtu.f64 %p30, %fd30, 0d7FF0000000000000; - mov.f64 %fd35, %fd16; - mov.f64 %fd36, %fd35; - @%p30 bra BB1_36; - - and.b32 %r15, %r2, 2147483647; - setp.ne.s32 %p31, %r15, 2146435072; - @%p31 bra BB1_32; - - { - .reg .b32 %temp; - mov.b64 {%r16, %temp}, %fd27; - } - setp.eq.s32 %p32, %r16, 0; - @%p32 bra BB1_35; - -BB1_32: - and.b32 %r17, %r1, 2147483647; - setp.ne.s32 %p33, %r17, 2146435072; - mov.f64 %fd33, %fd15; - mov.f64 %fd36, %fd33; - @%p33 bra BB1_36; - - { - .reg .b32 %temp; - mov.b64 {%r18, %temp}, %fd26; - } - setp.ne.s32 %p34, %r18, 0; - mov.f64 %fd36, %fd15; - @%p34 bra BB1_36; - - shr.s32 %r19, %r2, 31; - and.b32 %r20, %r19, -2146435072; - add.s32 %r21, %r20, 2146435072; - or.b32 %r22, %r21, -2147483648; - selp.b32 %r23, %r22, %r21, %p1; - mov.u32 %r24, 0; - mov.b64 %fd36, {%r24, %r23}; - bra.uni BB1_36; - -BB1_7: - setp.eq.s32 %p9, %r3, 7; - @%p9 bra BB1_18; - bra.uni BB1_8; - -BB1_18: - setp.gt.f64 %p18, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18; - bra.uni BB1_41; - -BB1_8: - setp.eq.s32 %p10, %r3, 8; - @%p10 bra BB1_17; - bra.uni BB1_9; - -BB1_17: - setp.ge.f64 %p17, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17; - bra.uni BB1_41; - -BB1_9: - setp.eq.s32 %p11, %r3, 9; - @%p11 bra BB1_16; - bra.uni BB1_10; - -BB1_16: - setp.eq.f64 %p16, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16; - bra.uni BB1_41; - -BB1_10: - setp.eq.s32 %p12, %r3, 10; - @%p12 bra BB1_15; - bra.uni BB1_11; - -BB1_15: - setp.neu.f64 %p15, %fd26, %fd27; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15; - bra.uni BB1_41; - -BB1_35: - setp.gt.f64 %p35, %fd9, 0d3FF0000000000000; - selp.b32 %r25, 2146435072, 0, %p35; - xor.b32 %r26, %r25, 2146435072; - setp.lt.s32 %p36, %r2, 0; - selp.b32 %r27, %r26, %r25, %p36; - setp.eq.f64 %p37, %fd26, 0dBFF0000000000000; - selp.b32 %r28, 1072693248, %r27, %p37; - mov.u32 %r29, 0; - mov.b64 %fd36, {%r29, %r28}; - -BB1_36: - setp.eq.f64 %p38, %fd27, 0d0000000000000000; - setp.eq.f64 %p39, %fd26, 0d3FF0000000000000; - or.pred %p40, %p39, %p38; - selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p40; - -BB1_41: - st.param.f64 [func_retval0+0], %fd39; - ret; - -BB1_11: - setp.eq.s32 %p13, %r3, 11; - @%p13 bra BB1_14; - bra.uni BB1_12; - -BB1_14: - min.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; - -BB1_12: - mov.f64 %fd39, 0dC08F380000000000; - setp.ne.s32 %p14, %r3, 12; - @%p14 bra BB1_41; - - max.f64 %fd39, %fd26, %fd27; - bra.uni BB1_41; -} - - // .globl _Z6reduceI5SumOpEvPdS1_jT_d -.visible .func _Z6reduceI5SumOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<18>; - .reg .b32 %r<31>; - .reg .f64 %fd<70>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd67, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r30, %r8, %r9, %r6; - setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB2_5; - - mov.f64 %fd68, %fd67; - -BB2_2: - mov.f64 %fd1, %fd68; - mul.wide.u32 %rd4, %r30, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd26, [%rd5]; - add.f64 %fd69, %fd1, %fd26; - add.s32 %r3, %r30, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB2_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd27, [%rd7]; - add.f64 %fd69, %fd69, %fd27; - -BB2_4: - mov.f64 %fd68, %fd69; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r30, %r12, %r13, %r30; - setp.lt.u32 %p3, %r30, %r5; - mov.f64 %fd67, %fd68; - @%p3 bra BB2_2; - -BB2_5: - mov.f64 %fd65, %fd67; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd65; - bar.sync 0; - setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB2_9; - - setp.gt.u32 %p5, %r6, 255; - mov.f64 %fd66, %fd65; - @%p5 bra BB2_8; - - ld.shared.f64 %fd28, [%rd1+2048]; - add.f64 %fd66, %fd65, %fd28; - st.shared.f64 [%rd1], %fd66; - -BB2_8: - mov.f64 %fd65, %fd66; - bar.sync 0; - -BB2_9: - mov.f64 %fd63, %fd65; - setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB2_13; - - setp.gt.u32 %p7, %r6, 127; - mov.f64 %fd64, %fd63; - @%p7 bra BB2_12; - - ld.shared.f64 %fd29, [%rd1+1024]; - add.f64 %fd64, %fd63, %fd29; - st.shared.f64 [%rd1], %fd64; - -BB2_12: - mov.f64 %fd63, %fd64; - bar.sync 0; - -BB2_13: - mov.f64 %fd61, %fd63; - setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB2_17; - - setp.gt.u32 %p9, %r6, 63; - mov.f64 %fd62, %fd61; - @%p9 bra BB2_16; - - ld.shared.f64 %fd30, [%rd1+512]; - add.f64 %fd62, %fd61, %fd30; - st.shared.f64 [%rd1], %fd62; - -BB2_16: - mov.f64 %fd61, %fd62; - bar.sync 0; - -BB2_17: - mov.f64 %fd60, %fd61; - setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB2_30; - - setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB2_20; - - ld.volatile.shared.f64 %fd31, [%rd1+256]; - add.f64 %fd60, %fd60, %fd31; - st.volatile.shared.f64 [%rd1], %fd60; - -BB2_20: - mov.f64 %fd59, %fd60; - setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB2_22; - - ld.volatile.shared.f64 %fd32, [%rd1+128]; - add.f64 %fd59, %fd59, %fd32; - st.volatile.shared.f64 [%rd1], %fd59; - -BB2_22: - mov.f64 %fd58, %fd59; - setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB2_24; - - ld.volatile.shared.f64 %fd33, [%rd1+64]; - add.f64 %fd58, %fd58, %fd33; - st.volatile.shared.f64 [%rd1], %fd58; - -BB2_24: - mov.f64 %fd57, %fd58; - setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB2_26; - - ld.volatile.shared.f64 %fd34, [%rd1+32]; - add.f64 %fd57, %fd57, %fd34; - st.volatile.shared.f64 [%rd1], %fd57; - -BB2_26: - mov.f64 %fd56, %fd57; - setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB2_28; - - ld.volatile.shared.f64 %fd35, [%rd1+16]; - add.f64 %fd56, %fd56, %fd35; - st.volatile.shared.f64 [%rd1], %fd56; - -BB2_28: - setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB2_30; - - ld.volatile.shared.f64 %fd36, [%rd1+8]; - add.f64 %fd37, %fd56, %fd36; - st.volatile.shared.f64 [%rd1], %fd37; - -BB2_30: - setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB2_32; - - ld.shared.f64 %fd38, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd38; - -BB2_32: - ret; -} - - // .globl _Z6reduceI5MaxOpEvPdS1_jT_d -.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<18>; - .reg .b32 %r<31>; - .reg .f64 %fd<70>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd67, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r30, %r8, %r9, %r6; - setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB3_5; - - mov.f64 %fd68, %fd67; - -BB3_2: - mov.f64 %fd1, %fd68; - mul.wide.u32 %rd4, %r30, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd26, [%rd5]; - max.f64 %fd69, %fd1, %fd26; - add.s32 %r3, %r30, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB3_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd27, [%rd7]; - max.f64 %fd69, %fd69, %fd27; - -BB3_4: - mov.f64 %fd68, %fd69; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r30, %r12, %r13, %r30; - setp.lt.u32 %p3, %r30, %r5; - mov.f64 %fd67, %fd68; - @%p3 bra BB3_2; - -BB3_5: - mov.f64 %fd65, %fd67; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd65; - bar.sync 0; - setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB3_9; - - setp.gt.u32 %p5, %r6, 255; - mov.f64 %fd66, %fd65; - @%p5 bra BB3_8; - - ld.shared.f64 %fd28, [%rd1+2048]; - max.f64 %fd66, %fd65, %fd28; - st.shared.f64 [%rd1], %fd66; - -BB3_8: - mov.f64 %fd65, %fd66; - bar.sync 0; - -BB3_9: - mov.f64 %fd63, %fd65; - setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB3_13; - - setp.gt.u32 %p7, %r6, 127; - mov.f64 %fd64, %fd63; - @%p7 bra BB3_12; - - ld.shared.f64 %fd29, [%rd1+1024]; - max.f64 %fd64, %fd63, %fd29; - st.shared.f64 [%rd1], %fd64; - -BB3_12: - mov.f64 %fd63, %fd64; - bar.sync 0; - -BB3_13: - mov.f64 %fd61, %fd63; - setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB3_17; - - setp.gt.u32 %p9, %r6, 63; - mov.f64 %fd62, %fd61; - @%p9 bra BB3_16; - - ld.shared.f64 %fd30, [%rd1+512]; - max.f64 %fd62, %fd61, %fd30; - st.shared.f64 [%rd1], %fd62; - -BB3_16: - mov.f64 %fd61, %fd62; - bar.sync 0; - -BB3_17: - mov.f64 %fd60, %fd61; - setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB3_30; - - setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB3_20; - - ld.volatile.shared.f64 %fd31, [%rd1+256]; - max.f64 %fd60, %fd60, %fd31; - st.volatile.shared.f64 [%rd1], %fd60; - -BB3_20: - mov.f64 %fd59, %fd60; - setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB3_22; - - ld.volatile.shared.f64 %fd32, [%rd1+128]; - max.f64 %fd59, %fd59, %fd32; - st.volatile.shared.f64 [%rd1], %fd59; - -BB3_22: - mov.f64 %fd58, %fd59; - setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB3_24; - - ld.volatile.shared.f64 %fd33, [%rd1+64]; - max.f64 %fd58, %fd58, %fd33; - st.volatile.shared.f64 [%rd1], %fd58; - -BB3_24: - mov.f64 %fd57, %fd58; - setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB3_26; - - ld.volatile.shared.f64 %fd34, [%rd1+32]; - max.f64 %fd57, %fd57, %fd34; - st.volatile.shared.f64 [%rd1], %fd57; - -BB3_26: - mov.f64 %fd56, %fd57; - setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB3_28; - - ld.volatile.shared.f64 %fd35, [%rd1+16]; - max.f64 %fd56, %fd56, %fd35; - st.volatile.shared.f64 [%rd1], %fd56; - -BB3_28: - setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB3_30; - - ld.volatile.shared.f64 %fd36, [%rd1+8]; - max.f64 %fd37, %fd56, %fd36; - st.volatile.shared.f64 [%rd1], %fd37; - -BB3_30: - setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB3_32; - - ld.shared.f64 %fd38, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd38; - -BB3_32: - ret; -} - - // .globl _Z6reduceI5MinOpEvPdS1_jT_d -.visible .func _Z6reduceI5MinOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<18>; - .reg .b32 %r<31>; - .reg .f64 %fd<70>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd67, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r30, %r8, %r9, %r6; - setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB4_5; - - mov.f64 %fd68, %fd67; - -BB4_2: - mov.f64 %fd1, %fd68; - mul.wide.u32 %rd4, %r30, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd26, [%rd5]; - min.f64 %fd69, %fd1, %fd26; - add.s32 %r3, %r30, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB4_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd27, [%rd7]; - min.f64 %fd69, %fd69, %fd27; - -BB4_4: - mov.f64 %fd68, %fd69; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r30, %r12, %r13, %r30; - setp.lt.u32 %p3, %r30, %r5; - mov.f64 %fd67, %fd68; - @%p3 bra BB4_2; - -BB4_5: - mov.f64 %fd65, %fd67; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd65; - bar.sync 0; - setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB4_9; - - setp.gt.u32 %p5, %r6, 255; - mov.f64 %fd66, %fd65; - @%p5 bra BB4_8; - - ld.shared.f64 %fd28, [%rd1+2048]; - min.f64 %fd66, %fd65, %fd28; - st.shared.f64 [%rd1], %fd66; - -BB4_8: - mov.f64 %fd65, %fd66; - bar.sync 0; - -BB4_9: - mov.f64 %fd63, %fd65; - setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB4_13; - - setp.gt.u32 %p7, %r6, 127; - mov.f64 %fd64, %fd63; - @%p7 bra BB4_12; - - ld.shared.f64 %fd29, [%rd1+1024]; - min.f64 %fd64, %fd63, %fd29; - st.shared.f64 [%rd1], %fd64; - -BB4_12: - mov.f64 %fd63, %fd64; - bar.sync 0; - -BB4_13: - mov.f64 %fd61, %fd63; - setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB4_17; - - setp.gt.u32 %p9, %r6, 63; - mov.f64 %fd62, %fd61; - @%p9 bra BB4_16; - - ld.shared.f64 %fd30, [%rd1+512]; - min.f64 %fd62, %fd61, %fd30; - st.shared.f64 [%rd1], %fd62; - -BB4_16: - mov.f64 %fd61, %fd62; - bar.sync 0; - -BB4_17: - mov.f64 %fd60, %fd61; - setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB4_30; - - setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB4_20; - - ld.volatile.shared.f64 %fd31, [%rd1+256]; - min.f64 %fd60, %fd60, %fd31; - st.volatile.shared.f64 [%rd1], %fd60; - -BB4_20: - mov.f64 %fd59, %fd60; - setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB4_22; - - ld.volatile.shared.f64 %fd32, [%rd1+128]; - min.f64 %fd59, %fd59, %fd32; - st.volatile.shared.f64 [%rd1], %fd59; - -BB4_22: - mov.f64 %fd58, %fd59; - setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB4_24; - - ld.volatile.shared.f64 %fd33, [%rd1+64]; - min.f64 %fd58, %fd58, %fd33; - st.volatile.shared.f64 [%rd1], %fd58; - -BB4_24: - mov.f64 %fd57, %fd58; - setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB4_26; - - ld.volatile.shared.f64 %fd34, [%rd1+32]; - min.f64 %fd57, %fd57, %fd34; - st.volatile.shared.f64 [%rd1], %fd57; - -BB4_26: - mov.f64 %fd56, %fd57; - setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB4_28; - - ld.volatile.shared.f64 %fd35, [%rd1+16]; - min.f64 %fd56, %fd56, %fd35; - st.volatile.shared.f64 [%rd1], %fd56; - -BB4_28: - setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB4_30; - - ld.volatile.shared.f64 %fd36, [%rd1+8]; - min.f64 %fd37, %fd56, %fd36; - st.volatile.shared.f64 [%rd1], %fd37; - -BB4_30: - setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB4_32; - - ld.shared.f64 %fd38, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd38; - -BB4_32: - ret; -} - // .globl copyUpperToLowerTriangleDense +.func (.param .b64 func_retval0) __internal_accurate_pow +( + .param .b64 __internal_accurate_pow_param_0, + .param .b64 __internal_accurate_pow_param_1 +) +; +.extern .shared .align 8 .b8 sdata[]; + .visible .entry copyUpperToLowerTriangleDense( .param .u64 copyUpperToLowerTriangleDense_param_0, .param .u32 copyUpperToLowerTriangleDense_param_1, @@ -881,10 +46,10 @@ BB4_32: setp.gt.s32 %p1, %r2, %r1; setp.lt.s32 %p2, %r3, %r5; and.pred %p3, %p1, %p2; - @!%p3 bra BB5_2; - bra.uni BB5_1; + @!%p3 bra BB0_2; + bra.uni BB0_1; -BB5_1: +BB0_1: cvta.to.global.u64 %rd2, %rd1; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd3, %r12, 8; @@ -894,7 +59,7 @@ BB5_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB5_2: +BB0_2: ret; } @@ -927,14 +92,14 @@ BB5_2: mad.lo.s32 %r1, %r8, %r9, %r11; mul.lo.s32 %r12, %r3, %r2; setp.ge.s32 %p1, %r1, %r12; - @%p1 bra BB6_2; + @%p1 bra BB1_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB6_2: +BB1_2: ret; } @@ -968,10 +133,10 @@ BB6_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB7_2; - bra.uni BB7_1; + @!%p3 bra BB2_2; + bra.uni BB2_1; -BB7_1: +BB2_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -980,7 +145,7 @@ BB7_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd1; -BB7_2: +BB2_2: ret; } @@ -1013,10 +178,10 @@ BB7_2: setp.lt.s32 %p1, %r1, %r4; setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB8_2; - bra.uni BB8_1; + @!%p3 bra BB3_2; + bra.uni BB3_1; -BB8_1: +BB3_1: cvta.to.global.u64 %rd3, %rd1; mad.lo.s32 %r11, %r1, %r3, %r2; mul.wide.s32 %rd4, %r11, 8; @@ -1028,7 +193,67 @@ BB8_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB8_2: +BB3_2: + ret; +} + + // .globl relu_backward +.visible .entry relu_backward( + .param .u64 relu_backward_param_0, + .param .u64 relu_backward_param_1, + .param .u64 relu_backward_param_2, + .param .u32 relu_backward_param_3, + .param .u32 relu_backward_param_4 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<12>; + .reg .f64 %fd<6>; + .reg .b64 %rd<14>; + + + ld.param.u64 %rd2, [relu_backward_param_0]; + ld.param.u64 %rd3, [relu_backward_param_1]; + ld.param.u64 %rd4, [relu_backward_param_2]; + ld.param.u32 %r4, [relu_backward_param_3]; + ld.param.u32 %r3, [relu_backward_param_4]; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r5, %r6, %r7; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r2, %r8, %r9, %r10; + setp.lt.s32 %p1, %r1, %r4; + setp.lt.s32 %p2, %r2, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB4_4; + bra.uni BB4_1; + +BB4_1: + cvta.to.global.u64 %rd5, %rd2; + mad.lo.s32 %r11, %r1, %r3, %r2; + cvt.s64.s32 %rd1, %r11; + mul.wide.s32 %rd6, %r11, 8; + add.s64 %rd7, %rd5, %rd6; + ld.global.f64 %fd4, [%rd7]; + mov.f64 %fd5, 0d0000000000000000; + setp.leu.f64 %p4, %fd4, 0d0000000000000000; + @%p4 bra BB4_3; + + cvta.to.global.u64 %rd8, %rd3; + shl.b64 %rd9, %rd1, 3; + add.s64 %rd10, %rd8, %rd9; + ld.global.f64 %fd5, [%rd10]; + +BB4_3: + cvta.to.global.u64 %rd11, %rd4; + shl.b64 %rd12, %rd1, 3; + add.s64 %rd13, %rd11, %rd12; + st.global.f64 [%rd13], %fd5; + +BB4_4: ret; } @@ -1072,10 +297,10 @@ BB8_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB9_6; - bra.uni BB9_1; + @!%p3 bra BB5_6; + bra.uni BB5_1; -BB9_1: +BB5_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -1085,26 +310,26 @@ BB9_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB9_5; - bra.uni BB9_2; + @%p4 bra BB5_5; + bra.uni BB5_2; -BB9_5: +BB5_5: st.global.f64 [%rd1], %fd4; - bra.uni BB9_6; + bra.uni BB5_6; -BB9_2: +BB5_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB9_4; - bra.uni BB9_3; + @%p5 bra BB5_4; + bra.uni BB5_3; -BB9_4: +BB5_4: st.global.f64 [%rd1], %fd5; - bra.uni BB9_6; + bra.uni BB5_6; -BB9_3: +BB5_3: st.global.f64 [%rd1], %fd6; -BB9_6: +BB5_6: ret; } @@ -1120,9 +345,9 @@ BB9_6: .param .u32 binCellOp_param_7 ) { - .reg .pred %p<52>; - .reg .b32 %r<56>; - .reg .f64 %fd<40>; + .reg .pred %p<54>; + .reg .b32 %r<55>; + .reg .f64 %fd<39>; .reg .b64 %rd<15>; @@ -1145,93 +370,93 @@ BB9_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB10_55; - bra.uni BB10_1; + @!%p4 bra BB6_53; + bra.uni BB6_1; -BB10_1: +BB6_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r54, %r1; - @%p5 bra BB10_5; + mov.u32 %r53, %r1; + @%p5 bra BB6_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r55, %r3; - @%p6 bra BB10_4; + mov.u32 %r54, %r3; + @%p6 bra BB6_4; - mov.u32 %r55, %r2; + mov.u32 %r54, %r2; -BB10_4: - mov.u32 %r49, %r55; - mov.u32 %r4, %r49; - mov.u32 %r54, %r4; +BB6_4: + mov.u32 %r48, %r54; + mov.u32 %r4, %r48; + mov.u32 %r53, %r4; -BB10_5: - mov.u32 %r5, %r54; +BB6_5: + mov.u32 %r5, %r53; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r52, %r1; - @%p7 bra BB10_9; + mov.u32 %r51, %r1; + @%p7 bra BB6_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r53, %r3; - @%p8 bra BB10_8; + mov.u32 %r52, %r3; + @%p8 bra BB6_8; - mov.u32 %r53, %r2; + mov.u32 %r52, %r2; -BB10_8: - mov.u32 %r52, %r53; +BB6_8: + mov.u32 %r51, %r52; -BB10_9: +BB6_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r52, 8; + mul.wide.s32 %rd9, %r51, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd39, 0dC08F380000000000; + mov.f64 %fd38, 0dC08F380000000000; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB10_19; + @%p9 bra BB6_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB10_15; + @%p19 bra BB6_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB10_53; + @%p23 bra BB6_51; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB10_52; - bra.uni BB10_13; + @%p24 bra BB6_50; + bra.uni BB6_13; -BB10_52: - sub.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; +BB6_50: + sub.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_19: +BB6_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB10_24; + @%p10 bra BB6_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB10_34; + @%p16 bra BB6_34; setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB10_33; - bra.uni BB10_22; + @%p17 bra BB6_33; + bra.uni BB6_22; -BB10_33: +BB6_33: setp.gt.f64 %p29, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; + bra.uni BB6_52; -BB10_15: +BB6_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB10_51; + @%p20 bra BB6_49; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB10_35; - bra.uni BB10_17; + @%p21 bra BB6_35; + bra.uni BB6_17; -BB10_35: +BB6_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -1246,7 +471,7 @@ BB10_35: shl.b64 %rd1, %rd11, %r22; setp.eq.s64 %p32, %rd1, -9223372036854775808; abs.f64 %fd11, %fd1; - // Callseq Start 1 + // Callseq Start 0 { .reg .b32 temp_param_reg; // <end>} @@ -1261,133 +486,133 @@ BB10_35: param0, param1 ); - ld.param.f64 %fd38, [retval0+0]; + ld.param.f64 %fd37, [retval0+0]; //{ - }// Callseq End 1 + }// Callseq End 0 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB10_37; - bra.uni BB10_36; + @!%p1 bra BB6_37; + bra.uni BB6_36; -BB10_36: +BB6_36: { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd38; + mov.b64 {%temp, %r23}, %fd37; } xor.b32 %r24, %r23, -2147483648; { .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd38; + mov.b64 {%r25, %temp}, %fd37; } - mov.b64 %fd38, {%r25, %r24}; + mov.b64 %fd37, {%r25, %r24}; -BB10_37: - mov.f64 %fd37, %fd38; +BB6_37: + mov.f64 %fd36, %fd37; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB10_40; - bra.uni BB10_38; + @%p34 bra BB6_40; + bra.uni BB6_38; -BB10_40: +BB6_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; - mov.b64 %fd37, {%r29, %r28}; - bra.uni BB10_41; + mov.b64 %fd36, {%r29, %r28}; + bra.uni BB6_41; -BB10_24: +BB6_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB10_28; + @%p11 bra BB6_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB10_32; - bra.uni BB10_26; + @%p14 bra BB6_32; + bra.uni BB6_26; -BB10_32: +BB6_32: setp.eq.f64 %p27, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB6_52; -BB10_28: +BB6_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB10_31; - bra.uni BB10_29; + @%p12 bra BB6_31; + bra.uni BB6_29; -BB10_31: - min.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; +BB6_31: + min.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_53: - add.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; +BB6_51: + add.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_13: +BB6_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB10_14; - bra.uni BB10_54; + @%p25 bra BB6_14; + bra.uni BB6_52; -BB10_14: - mul.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; +BB6_14: + mul.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_34: +BB6_34: setp.le.f64 %p30, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; + bra.uni BB6_52; -BB10_22: +BB6_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB10_23; - bra.uni BB10_54; + @%p18 bra BB6_23; + bra.uni BB6_52; -BB10_23: +BB6_23: setp.ge.f64 %p28, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; + bra.uni BB6_52; -BB10_51: - div.rn.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; +BB6_49: + div.rn.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_17: +BB6_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB10_18; - bra.uni BB10_54; + @%p22 bra BB6_18; + bra.uni BB6_52; -BB10_18: +BB6_18: setp.lt.f64 %p31, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; + bra.uni BB6_52; -BB10_26: +BB6_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB10_27; - bra.uni BB10_54; + @%p15 bra BB6_27; + bra.uni BB6_52; -BB10_27: +BB6_27: setp.neu.f64 %p26, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB10_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB6_52; -BB10_29: +BB6_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB10_54; + @%p13 bra BB6_52; - max.f64 %fd39, %fd1, %fd2; - bra.uni BB10_54; + max.f64 %fd38, %fd1, %fd2; + bra.uni BB6_52; -BB10_38: +BB6_38: setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB10_41; + @%p35 bra BB6_41; cvt.rzi.f64.f64 %fd29, %fd2; setp.neu.f64 %p36, %fd29, %fd2; - selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; + selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; -BB10_41: - mov.f64 %fd17, %fd37; +BB6_41: + mov.f64 %fd17, %fd36; add.f64 %fd18, %fd1, %fd2; { .reg .b32 %temp; @@ -1395,78 +620,76 @@ BB10_41: } and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; - mov.f64 %fd36, %fd17; - @%p39 bra BB10_50; + mov.f64 %fd35, %fd17; + @%p39 bra BB6_48; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; - mov.f64 %fd36, %fd18; - @%p40 bra BB10_50; + mov.f64 %fd35, %fd18; + @%p40 bra BB6_48; abs.f64 %fd30, %fd2; setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; - mov.f64 %fd35, %fd18; - mov.f64 %fd36, %fd35; - @%p41 bra BB10_50; - - and.b32 %r32, %r9, 2147483647; - setp.ne.s32 %p42, %r32, 2146435072; - @%p42 bra BB10_46; + mov.f64 %fd34, %fd18; + mov.f64 %fd35, %fd34; + @%p41 bra BB6_48; { .reg .b32 %temp; - mov.b64 {%r33, %temp}, %fd2; + mov.b64 {%r32, %temp}, %fd2; } - setp.eq.s32 %p43, %r33, 0; - @%p43 bra BB10_49; - -BB10_46: - and.b32 %r34, %r8, 2147483647; - setp.ne.s32 %p44, %r34, 2146435072; - mov.f64 %fd33, %fd17; - mov.f64 %fd36, %fd33; - @%p44 bra BB10_50; - + and.b32 %r33, %r9, 2147483647; + setp.eq.s32 %p42, %r33, 2146435072; + setp.eq.s32 %p43, %r32, 0; + and.pred %p44, %p42, %p43; + @%p44 bra BB6_47; + bra.uni BB6_45; + +BB6_47: + setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; + selp.b32 %r41, 2146435072, 0, %p48; + xor.b32 %r42, %r41, 2146435072; + setp.lt.s32 %p49, %r9, 0; + selp.b32 %r43, %r42, %r41, %p49; + setp.eq.f64 %p50, %fd1, 0dBFF0000000000000; + selp.b32 %r44, 1072693248, %r43, %p50; + mov.u32 %r45, 0; + mov.b64 %fd35, {%r45, %r44}; + bra.uni BB6_48; + +BB6_45: { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd1; + mov.b64 {%r34, %temp}, %fd1; } - setp.ne.s32 %p45, %r35, 0; - mov.f64 %fd36, %fd17; - @%p45 bra BB10_50; - + and.b32 %r35, %r8, 2147483647; + setp.eq.s32 %p45, %r35, 2146435072; + setp.eq.s32 %p46, %r34, 0; + and.pred %p47, %p45, %p46; + mov.f64 %fd35, %fd17; + @!%p47 bra BB6_48; + bra.uni BB6_46; + +BB6_46: shr.s32 %r36, %r9, 31; and.b32 %r37, %r36, -2146435072; - add.s32 %r38, %r37, 2146435072; - or.b32 %r39, %r38, -2147483648; - selp.b32 %r40, %r39, %r38, %p1; - mov.u32 %r41, 0; - mov.b64 %fd36, {%r41, %r40}; - bra.uni BB10_50; - -BB10_49: - setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; - selp.b32 %r42, 2146435072, 0, %p46; - xor.b32 %r43, %r42, 2146435072; - setp.lt.s32 %p47, %r9, 0; - selp.b32 %r44, %r43, %r42, %p47; - setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; - selp.b32 %r45, 1072693248, %r44, %p48; - mov.u32 %r46, 0; - mov.b64 %fd36, {%r46, %r45}; - -BB10_50: - setp.eq.f64 %p49, %fd2, 0d0000000000000000; - setp.eq.f64 %p50, %fd1, 0d3FF0000000000000; - or.pred %p51, %p50, %p49; - selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; - -BB10_54: + selp.b32 %r38, -1048576, 2146435072, %p1; + add.s32 %r39, %r38, %r37; + mov.u32 %r40, 0; + mov.b64 %fd35, {%r40, %r39}; + +BB6_48: + setp.eq.f64 %p51, %fd2, 0d0000000000000000; + setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; + or.pred %p53, %p52, %p51; + selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; + +BB6_52: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; - st.global.f64 [%rd14], %fd39; + st.global.f64 [%rd14], %fd38; -BB10_55: +BB6_53: ret; } @@ -1481,9 +704,9 @@ BB10_55: .param .u32 binCellScalarOp_param_6 ) { - .reg .pred %p<89>; - .reg .b32 %r<71>; - .reg .f64 %fd<77>; + .reg .pred %p<93>; + .reg .b32 %r<69>; + .reg .f64 %fd<75>; .reg .b64 %rd<12>; @@ -1505,7 +728,7 @@ BB10_55: mad.lo.s32 %r1, %r14, %r15, %r17; mul.lo.s32 %r18, %r9, %r8; setp.ge.s32 %p3, %r1, %r18; - @%p3 bra BB11_92; + @%p3 bra BB7_88; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -1514,178 +737,178 @@ BB10_55: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB11_47; + @%p4 bra BB7_45; setp.eq.s32 %p5, %r6, 0; - @%p5 bra BB11_45; + @%p5 bra BB7_43; - mov.f64 %fd67, 0dC08F380000000000; + mov.f64 %fd66, 0dC08F380000000000; setp.gt.s32 %p6, %r6, 6; - @%p6 bra BB11_13; + @%p6 bra BB7_13; setp.gt.s32 %p14, %r6, 3; - @%p14 bra BB11_9; + @%p14 bra BB7_9; setp.eq.s32 %p18, %r6, 1; - @%p18 bra BB11_44; + @%p18 bra BB7_42; setp.eq.s32 %p19, %r6, 2; - @%p19 bra BB11_43; - bra.uni BB11_7; + @%p19 bra BB7_41; + bra.uni BB7_7; -BB11_43: - mul.f64 %fd67, %fd1, %fd52; - bra.uni BB11_46; +BB7_41: + mul.f64 %fd66, %fd1, %fd52; + bra.uni BB7_44; -BB11_47: - setp.eq.s32 %p47, %r6, 0; - @%p47 bra BB11_90; +BB7_45: + setp.eq.s32 %p49, %r6, 0; + @%p49 bra BB7_86; - mov.f64 %fd76, 0dC08F380000000000; - setp.gt.s32 %p48, %r6, 6; - @%p48 bra BB11_58; + mov.f64 %fd74, 0dC08F380000000000; + setp.gt.s32 %p50, %r6, 6; + @%p50 bra BB7_56; - setp.gt.s32 %p56, %r6, 3; - @%p56 bra BB11_54; + setp.gt.s32 %p58, %r6, 3; + @%p58 bra BB7_52; - setp.eq.s32 %p60, %r6, 1; - @%p60 bra BB11_89; + setp.eq.s32 %p62, %r6, 1; + @%p62 bra BB7_85; - setp.eq.s32 %p61, %r6, 2; - @%p61 bra BB11_88; - bra.uni BB11_52; + setp.eq.s32 %p63, %r6, 2; + @%p63 bra BB7_84; + bra.uni BB7_50; -BB11_88: - mul.f64 %fd76, %fd1, %fd52; - bra.uni BB11_91; +BB7_84: + mul.f64 %fd74, %fd1, %fd52; + bra.uni BB7_87; -BB11_45: - add.f64 %fd67, %fd1, %fd52; +BB7_43: + add.f64 %fd66, %fd1, %fd52; -BB11_46: - st.global.f64 [%rd1], %fd67; - bra.uni BB11_92; +BB7_44: + st.global.f64 [%rd1], %fd66; + bra.uni BB7_88; -BB11_13: +BB7_13: setp.gt.s32 %p7, %r6, 9; - @%p7 bra BB11_18; + @%p7 bra BB7_18; setp.eq.s32 %p11, %r6, 7; - @%p11 bra BB11_25; + @%p11 bra BB7_25; setp.eq.s32 %p12, %r6, 8; - @%p12 bra BB11_24; - bra.uni BB11_16; + @%p12 bra BB7_24; + bra.uni BB7_16; -BB11_24: +BB7_24: setp.le.f64 %p23, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; + bra.uni BB7_44; -BB11_90: - add.f64 %fd76, %fd1, %fd52; +BB7_86: + add.f64 %fd74, %fd1, %fd52; -BB11_91: - st.global.f64 [%rd1], %fd76; +BB7_87: + st.global.f64 [%rd1], %fd74; -BB11_92: +BB7_88: ret; -BB11_58: - setp.gt.s32 %p49, %r6, 9; - @%p49 bra BB11_63; +BB7_56: + setp.gt.s32 %p51, %r6, 9; + @%p51 bra BB7_61; - setp.eq.s32 %p53, %r6, 7; - @%p53 bra BB11_70; + setp.eq.s32 %p55, %r6, 7; + @%p55 bra BB7_68; - setp.eq.s32 %p54, %r6, 8; - @%p54 bra BB11_69; - bra.uni BB11_61; + setp.eq.s32 %p56, %r6, 8; + @%p56 bra BB7_67; + bra.uni BB7_59; -BB11_69: - setp.ge.f64 %p65, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65; - bra.uni BB11_91; +BB7_67: + setp.ge.f64 %p67, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67; + bra.uni BB7_87; -BB11_9: +BB7_9: setp.eq.s32 %p15, %r6, 4; - @%p15 bra BB11_27; + @%p15 bra BB7_27; setp.eq.s32 %p16, %r6, 5; - @%p16 bra BB11_26; - bra.uni BB11_11; + @%p16 bra BB7_26; + bra.uni BB7_11; -BB11_26: +BB7_26: setp.gt.f64 %p26, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB7_44; -BB11_18: +BB7_18: setp.eq.s32 %p8, %r6, 10; - @%p8 bra BB11_23; + @%p8 bra BB7_23; setp.eq.s32 %p9, %r6, 11; - @%p9 bra BB11_22; - bra.uni BB11_20; + @%p9 bra BB7_22; + bra.uni BB7_20; -BB11_22: - min.f64 %fd67, %fd52, %fd1; - bra.uni BB11_46; +BB7_22: + min.f64 %fd66, %fd52, %fd1; + bra.uni BB7_44; -BB11_54: - setp.eq.s32 %p57, %r6, 4; - @%p57 bra BB11_72; +BB7_52: + setp.eq.s32 %p59, %r6, 4; + @%p59 bra BB7_70; - setp.eq.s32 %p58, %r6, 5; - @%p58 bra BB11_71; - bra.uni BB11_56; + setp.eq.s32 %p60, %r6, 5; + @%p60 bra BB7_69; + bra.uni BB7_54; -BB11_71: - setp.lt.f64 %p68, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68; - bra.uni BB11_91; +BB7_69: + setp.lt.f64 %p70, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70; + bra.uni BB7_87; -BB11_63: - setp.eq.s32 %p50, %r6, 10; - @%p50 bra BB11_68; +BB7_61: + setp.eq.s32 %p52, %r6, 10; + @%p52 bra BB7_66; - setp.eq.s32 %p51, %r6, 11; - @%p51 bra BB11_67; - bra.uni BB11_65; + setp.eq.s32 %p53, %r6, 11; + @%p53 bra BB7_65; + bra.uni BB7_63; -BB11_67: - min.f64 %fd76, %fd1, %fd52; - bra.uni BB11_91; +BB7_65: + min.f64 %fd74, %fd1, %fd52; + bra.uni BB7_87; -BB11_44: - sub.f64 %fd67, %fd52, %fd1; - bra.uni BB11_46; +BB7_42: + sub.f64 %fd66, %fd52, %fd1; + bra.uni BB7_44; -BB11_7: +BB7_7: setp.eq.s32 %p20, %r6, 3; - @%p20 bra BB11_8; - bra.uni BB11_46; + @%p20 bra BB7_8; + bra.uni BB7_44; -BB11_8: - div.rn.f64 %fd67, %fd52, %fd1; - bra.uni BB11_46; +BB7_8: + div.rn.f64 %fd66, %fd52, %fd1; + bra.uni BB7_44; -BB11_25: +BB7_25: setp.lt.f64 %p24, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; + bra.uni BB7_44; -BB11_16: +BB7_16: setp.eq.s32 %p13, %r6, 9; - @%p13 bra BB11_17; - bra.uni BB11_46; + @%p13 bra BB7_17; + bra.uni BB7_44; -BB11_17: +BB7_17: setp.eq.f64 %p22, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; + bra.uni BB7_44; -BB11_27: +BB7_27: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd52; @@ -1700,7 +923,7 @@ BB11_27: shl.b64 %rd2, %rd10, %r20; setp.eq.s64 %p27, %rd2, -9223372036854775808; abs.f64 %fd10, %fd52; - // Callseq Start 2 + // Callseq Start 1 { .reg .b32 temp_param_reg; // <end>} @@ -1715,93 +938,93 @@ BB11_27: param0, param1 ); - ld.param.f64 %fd66, [retval0+0]; + ld.param.f64 %fd65, [retval0+0]; //{ - }// Callseq End 2 + }// Callseq End 1 setp.lt.s32 %p28, %r2, 0; and.pred %p1, %p28, %p27; - @!%p1 bra BB11_29; - bra.uni BB11_28; + @!%p1 bra BB7_29; + bra.uni BB7_28; -BB11_28: +BB7_28: { .reg .b32 %temp; - mov.b64 {%temp, %r21}, %fd66; + mov.b64 {%temp, %r21}, %fd65; } xor.b32 %r22, %r21, -2147483648; { .reg .b32 %temp; - mov.b64 {%r23, %temp}, %fd66; + mov.b64 {%r23, %temp}, %fd65; } - mov.b64 %fd66, {%r23, %r22}; + mov.b64 %fd65, {%r23, %r22}; -BB11_29: - mov.f64 %fd65, %fd66; +BB7_29: + mov.f64 %fd64, %fd65; setp.eq.f64 %p29, %fd52, 0d0000000000000000; - @%p29 bra BB11_32; - bra.uni BB11_30; + @%p29 bra BB7_32; + bra.uni BB7_30; -BB11_32: +BB7_32: selp.b32 %r24, %r2, 0, %p27; or.b32 %r25, %r24, 2146435072; setp.lt.s32 %p33, %r3, 0; selp.b32 %r26, %r25, %r24, %p33; mov.u32 %r27, 0; - mov.b64 %fd65, {%r27, %r26}; - bra.uni BB11_33; + mov.b64 %fd64, {%r27, %r26}; + bra.uni BB7_33; -BB11_11: +BB7_11: setp.eq.s32 %p17, %r6, 6; - @%p17 bra BB11_12; - bra.uni BB11_46; + @%p17 bra BB7_12; + bra.uni BB7_44; -BB11_12: +BB7_12: setp.ge.f64 %p25, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; + bra.uni BB7_44; -BB11_23: +BB7_23: setp.neu.f64 %p21, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21; - bra.uni BB11_46; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21; + bra.uni BB7_44; -BB11_20: +BB7_20: setp.ne.s32 %p10, %r6, 12; - @%p10 bra BB11_46; + @%p10 bra BB7_44; - max.f64 %fd67, %fd52, %fd1; - bra.uni BB11_46; + max.f64 %fd66, %fd52, %fd1; + bra.uni BB7_44; -BB11_89: - sub.f64 %fd76, %fd1, %fd52; - bra.uni BB11_91; +BB7_85: + sub.f64 %fd74, %fd1, %fd52; + bra.uni BB7_87; -BB11_52: - setp.eq.s32 %p62, %r6, 3; - @%p62 bra BB11_53; - bra.uni BB11_91; +BB7_50: + setp.eq.s32 %p64, %r6, 3; + @%p64 bra BB7_51; + bra.uni BB7_87; -BB11_53: - div.rn.f64 %fd76, %fd1, %fd52; - bra.uni BB11_91; +BB7_51: + div.rn.f64 %fd74, %fd1, %fd52; + bra.uni BB7_87; -BB11_70: - setp.gt.f64 %p66, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66; - bra.uni BB11_91; +BB7_68: + setp.gt.f64 %p68, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68; + bra.uni BB7_87; -BB11_61: - setp.eq.s32 %p55, %r6, 9; - @%p55 bra BB11_62; - bra.uni BB11_91; +BB7_59: + setp.eq.s32 %p57, %r6, 9; + @%p57 bra BB7_60; + bra.uni BB7_87; -BB11_62: - setp.eq.f64 %p64, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64; - bra.uni BB11_91; +BB7_60: + setp.eq.f64 %p66, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66; + bra.uni BB7_87; -BB11_72: +BB7_70: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1810,13 +1033,13 @@ BB11_72: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd52; } - bfe.u32 %r45, %r5, 20, 11; - add.s32 %r46, %r45, -1012; + bfe.u32 %r44, %r5, 20, 11; + add.s32 %r45, %r44, -1012; mov.b64 %rd11, %fd52; - shl.b64 %rd3, %rd11, %r46; - setp.eq.s64 %p69, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd11, %r45; + setp.eq.s64 %p71, %rd3, -9223372036854775808; abs.f64 %fd35, %fd1; - // Callseq Start 3 + // Callseq Start 2 { .reg .b32 temp_param_reg; // <end>} @@ -1831,74 +1054,74 @@ BB11_72: param0, param1 ); - ld.param.f64 %fd75, [retval0+0]; + ld.param.f64 %fd73, [retval0+0]; //{ - }// Callseq End 3 - setp.lt.s32 %p70, %r4, 0; - and.pred %p2, %p70, %p69; - @!%p2 bra BB11_74; - bra.uni BB11_73; + }// Callseq End 2 + setp.lt.s32 %p72, %r4, 0; + and.pred %p2, %p72, %p71; + @!%p2 bra BB7_72; + bra.uni BB7_71; -BB11_73: +BB7_71: { .reg .b32 %temp; - mov.b64 {%temp, %r47}, %fd75; + mov.b64 {%temp, %r46}, %fd73; } - xor.b32 %r48, %r47, -2147483648; + xor.b32 %r47, %r46, -2147483648; { .reg .b32 %temp; - mov.b64 {%r49, %temp}, %fd75; + mov.b64 {%r48, %temp}, %fd73; } - mov.b64 %fd75, {%r49, %r48}; - -BB11_74: - mov.f64 %fd74, %fd75; - setp.eq.f64 %p71, %fd1, 0d0000000000000000; - @%p71 bra BB11_77; - bra.uni BB11_75; - -BB11_77: - selp.b32 %r50, %r4, 0, %p69; - or.b32 %r51, %r50, 2146435072; - setp.lt.s32 %p75, %r5, 0; - selp.b32 %r52, %r51, %r50, %p75; - mov.u32 %r53, 0; - mov.b64 %fd74, {%r53, %r52}; - bra.uni BB11_78; - -BB11_56: - setp.eq.s32 %p59, %r6, 6; - @%p59 bra BB11_57; - bra.uni BB11_91; - -BB11_57: - setp.le.f64 %p67, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67; - bra.uni BB11_91; - -BB11_68: - setp.neu.f64 %p63, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63; - bra.uni BB11_91; - -BB11_65: - setp.ne.s32 %p52, %r6, 12; - @%p52 bra BB11_91; - - max.f64 %fd76, %fd1, %fd52; - bra.uni BB11_91; - -BB11_30: + mov.b64 %fd73, {%r48, %r47}; + +BB7_72: + mov.f64 %fd72, %fd73; + setp.eq.f64 %p73, %fd1, 0d0000000000000000; + @%p73 bra BB7_75; + bra.uni BB7_73; + +BB7_75: + selp.b32 %r49, %r4, 0, %p71; + or.b32 %r50, %r49, 2146435072; + setp.lt.s32 %p77, %r5, 0; + selp.b32 %r51, %r50, %r49, %p77; + mov.u32 %r52, 0; + mov.b64 %fd72, {%r52, %r51}; + bra.uni BB7_76; + +BB7_54: + setp.eq.s32 %p61, %r6, 6; + @%p61 bra BB7_55; + bra.uni BB7_87; + +BB7_55: + setp.le.f64 %p69, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69; + bra.uni BB7_87; + +BB7_66: + setp.neu.f64 %p65, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65; + bra.uni BB7_87; + +BB7_63: + setp.ne.s32 %p54, %r6, 12; + @%p54 bra BB7_87; + + max.f64 %fd74, %fd1, %fd52; + bra.uni BB7_87; + +BB7_30: setp.gt.s32 %p30, %r2, -1; - @%p30 bra BB11_33; + @%p30 bra BB7_33; cvt.rzi.f64.f64 %fd54, %fd1; setp.neu.f64 %p31, %fd54, %fd1; - selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31; + selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p31; -BB11_33: - mov.f64 %fd16, %fd65; +BB7_33: + mov.f64 %fd16, %fd64; add.f64 %fd17, %fd1, %fd52; { .reg .b32 %temp; @@ -1906,154 +1129,150 @@ BB11_33: } and.b32 %r29, %r28, 2146435072; setp.ne.s32 %p34, %r29, 2146435072; - mov.f64 %fd64, %fd16; - @%p34 bra BB11_42; + mov.f64 %fd63, %fd16; + @%p34 bra BB7_40; setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; - mov.f64 %fd64, %fd17; - @%p35 bra BB11_42; + mov.f64 %fd63, %fd17; + @%p35 bra BB7_40; abs.f64 %fd55, %fd1; setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000; - mov.f64 %fd63, %fd17; - mov.f64 %fd64, %fd63; - @%p36 bra BB11_42; - - and.b32 %r30, %r3, 2147483647; - setp.ne.s32 %p37, %r30, 2146435072; - @%p37 bra BB11_38; - - { - .reg .b32 %temp; - mov.b64 {%r31, %temp}, %fd1; - } - setp.eq.s32 %p38, %r31, 0; - @%p38 bra BB11_41; - -BB11_38: - and.b32 %r32, %r2, 2147483647; - setp.ne.s32 %p39, %r32, 2146435072; - mov.f64 %fd61, %fd16; - mov.f64 %fd64, %fd61; - @%p39 bra BB11_42; + mov.f64 %fd62, %fd17; + mov.f64 %fd63, %fd62; + @%p36 bra BB7_40; { .reg .b32 %temp; - mov.b64 {%r33, %temp}, %fd52; + mov.b64 {%r30, %temp}, %fd1; } - setp.ne.s32 %p40, %r33, 0; - mov.f64 %fd64, %fd16; - @%p40 bra BB11_42; - - shr.s32 %r34, %r3, 31; - and.b32 %r35, %r34, -2146435072; - add.s32 %r36, %r35, 2146435072; - or.b32 %r37, %r36, -2147483648; - selp.b32 %r38, %r37, %r36, %p1; - mov.u32 %r39, 0; - mov.b64 %fd64, {%r39, %r38}; - bra.uni BB11_42; - -BB11_75: - setp.gt.s32 %p72, %r4, -1; - @%p72 bra BB11_78; + and.b32 %r31, %r3, 2147483647; + setp.eq.s32 %p37, %r31, 2146435072; + setp.eq.s32 %p38, %r30, 0; + and.pred %p39, %p37, %p38; + @%p39 bra BB7_39; + bra.uni BB7_37; + +BB7_39: + setp.gt.f64 %p43, %fd10, 0d3FF0000000000000; + selp.b32 %r39, 2146435072, 0, %p43; + xor.b32 %r40, %r39, 2146435072; + setp.lt.s32 %p44, %r3, 0; + selp.b32 %r41, %r40, %r39, %p44; + setp.eq.f64 %p45, %fd52, 0dBFF0000000000000; + selp.b32 %r42, 1072693248, %r41, %p45; + mov.u32 %r43, 0; + mov.b64 %fd63, {%r43, %r42}; + bra.uni BB7_40; + +BB7_73: + setp.gt.s32 %p74, %r4, -1; + @%p74 bra BB7_76; cvt.rzi.f64.f64 %fd57, %fd52; - setp.neu.f64 %p73, %fd57, %fd52; - selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73; + setp.neu.f64 %p75, %fd57, %fd52; + selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p75; -BB11_78: - mov.f64 %fd41, %fd74; +BB7_76: + mov.f64 %fd41, %fd72; add.f64 %fd42, %fd1, %fd52; { .reg .b32 %temp; - mov.b64 {%temp, %r54}, %fd42; + mov.b64 {%temp, %r53}, %fd42; } - and.b32 %r55, %r54, 2146435072; - setp.ne.s32 %p76, %r55, 2146435072; - mov.f64 %fd73, %fd41; - @%p76 bra BB11_87; + and.b32 %r54, %r53, 2146435072; + setp.ne.s32 %p78, %r54, 2146435072; + mov.f64 %fd71, %fd41; + @%p78 bra BB7_83; - setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000; - mov.f64 %fd73, %fd42; - @%p77 bra BB11_87; + setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000; + mov.f64 %fd71, %fd42; + @%p79 bra BB7_83; abs.f64 %fd58, %fd52; - setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000; - mov.f64 %fd72, %fd42; - mov.f64 %fd73, %fd72; - @%p78 bra BB11_87; + setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000; + mov.f64 %fd70, %fd42; + mov.f64 %fd71, %fd70; + @%p80 bra BB7_83; + { + .reg .b32 %temp; + mov.b64 {%r55, %temp}, %fd52; + } and.b32 %r56, %r5, 2147483647; - setp.ne.s32 %p79, %r56, 2146435072; - @%p79 bra BB11_83; - + setp.eq.s32 %p81, %r56, 2146435072; + setp.eq.s32 %p82, %r55, 0; + and.pred %p83, %p81, %p82; + @%p83 bra BB7_82; + bra.uni BB7_80; + +BB7_82: + setp.gt.f64 %p87, %fd35, 0d3FF0000000000000; + selp.b32 %r64, 2146435072, 0, %p87; + xor.b32 %r65, %r64, 2146435072; + setp.lt.s32 %p88, %r5, 0; + selp.b32 %r66, %r65, %r64, %p88; + setp.eq.f64 %p89, %fd1, 0dBFF0000000000000; + selp.b32 %r67, 1072693248, %r66, %p89; + mov.u32 %r68, 0; + mov.b64 %fd71, {%r68, %r67}; + bra.uni BB7_83; + +BB7_37: { .reg .b32 %temp; - mov.b64 {%r57, %temp}, %fd52; + mov.b64 {%r32, %temp}, %fd52; } - setp.eq.s32 %p80, %r57, 0; - @%p80 bra BB11_86; - -BB11_83: - and.b32 %r58, %r4, 2147483647; - setp.ne.s32 %p81, %r58, 2146435072; - mov.f64 %fd70, %fd41; - mov.f64 %fd73, %fd70; - @%p81 bra BB11_87; - + and.b32 %r33, %r2, 2147483647; + setp.eq.s32 %p40, %r33, 2146435072; + setp.eq.s32 %p41, %r32, 0; + and.pred %p42, %p40, %p41; + mov.f64 %fd63, %fd16; + @!%p42 bra BB7_40; + bra.uni BB7_38; + +BB7_38: + shr.s32 %r34, %r3, 31; + and.b32 %r35, %r34, -2146435072; + selp.b32 %r36, -1048576, 2146435072, %p1; + add.s32 %r37, %r36, %r35; + mov.u32 %r38, 0; + mov.b64 %fd63, {%r38, %r37}; + +BB7_40: + setp.eq.f64 %p46, %fd1, 0d0000000000000000; + setp.eq.f64 %p47, %fd52, 0d3FF0000000000000; + or.pred %p48, %p47, %p46; + selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p48; + bra.uni BB7_44; + +BB7_80: { .reg .b32 %temp; - mov.b64 {%r59, %temp}, %fd1; + mov.b64 {%r57, %temp}, %fd1; } - setp.ne.s32 %p82, %r59, 0; - mov.f64 %fd73, %fd41; - @%p82 bra BB11_87; - - shr.s32 %r60, %r5, 31; - and.b32 %r61, %r60, -2146435072; - add.s32 %r62, %r61, 2146435072; - or.b32 %r63, %r62, -2147483648; - selp.b32 %r64, %r63, %r62, %p2; - mov.u32 %r65, 0; - mov.b64 %fd73, {%r65, %r64}; - bra.uni BB11_87; - -BB11_41: - setp.gt.f64 %p41, %fd10, 0d3FF0000000000000; - selp.b32 %r40, 2146435072, 0, %p41; - xor.b32 %r41, %r40, 2146435072; - setp.lt.s32 %p42, %r3, 0; - selp.b32 %r42, %r41, %r40, %p42; - setp.eq.f64 %p43, %fd52, 0dBFF0000000000000; - selp.b32 %r43, 1072693248, %r42, %p43; - mov.u32 %r44, 0; - mov.b64 %fd64, {%r44, %r43}; - -BB11_42: - setp.eq.f64 %p44, %fd1, 0d0000000000000000; - setp.eq.f64 %p45, %fd52, 0d3FF0000000000000; - or.pred %p46, %p45, %p44; - selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46; - bra.uni BB11_46; - -BB11_86: - setp.gt.f64 %p83, %fd35, 0d3FF0000000000000; - selp.b32 %r66, 2146435072, 0, %p83; - xor.b32 %r67, %r66, 2146435072; - setp.lt.s32 %p84, %r5, 0; - selp.b32 %r68, %r67, %r66, %p84; - setp.eq.f64 %p85, %fd1, 0dBFF0000000000000; - selp.b32 %r69, 1072693248, %r68, %p85; - mov.u32 %r70, 0; - mov.b64 %fd73, {%r70, %r69}; - -BB11_87: - setp.eq.f64 %p86, %fd52, 0d0000000000000000; - setp.eq.f64 %p87, %fd1, 0d3FF0000000000000; - or.pred %p88, %p87, %p86; - selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88; - bra.uni BB11_91; + and.b32 %r58, %r4, 2147483647; + setp.eq.s32 %p84, %r58, 2146435072; + setp.eq.s32 %p85, %r57, 0; + and.pred %p86, %p84, %p85; + mov.f64 %fd71, %fd41; + @!%p86 bra BB7_83; + bra.uni BB7_81; + +BB7_81: + shr.s32 %r59, %r5, 31; + and.b32 %r60, %r59, -2146435072; + selp.b32 %r61, -1048576, 2146435072, %p2; + add.s32 %r62, %r61, %r60; + mov.u32 %r63, 0; + mov.b64 %fd71, {%r63, %r62}; + +BB7_83: + setp.eq.f64 %p90, %fd52, 0d0000000000000000; + setp.eq.f64 %p91, %fd1, 0d3FF0000000000000; + or.pred %p92, %p91, %p90; + selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p92; + bra.uni BB7_87; } // .globl fill @@ -2077,14 +1296,14 @@ BB11_87: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB12_2; + @%p1 bra BB8_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB12_2: +BB8_2: ret; } @@ -2108,17 +1327,17 @@ BB12_2: ld.param.u32 %r4, [reduce_row_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB13_31; + @%p1 bra BB9_31; mov.u32 %r35, %tid.x; mov.f64 %fd63, 0d0000000000000000; mov.f64 %fd64, %fd63; setp.ge.u32 %p2, %r35, %r4; - @%p2 bra BB13_4; + @%p2 bra BB9_4; cvta.to.global.u64 %rd3, %rd1; -BB13_3: +BB9_3: mad.lo.s32 %r8, %r6, %r4, %r35; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2128,9 +1347,9 @@ BB13_3: add.s32 %r35, %r9, %r35; setp.lt.u32 %p3, %r35, %r4; mov.f64 %fd63, %fd64; - @%p3 bra BB13_3; + @%p3 bra BB9_3; -BB13_4: +BB9_4: mov.f64 %fd61, %fd63; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2140,113 +1359,113 @@ BB13_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 512; - @%p4 bra BB13_8; + @%p4 bra BB9_8; setp.gt.u32 %p5, %r10, 255; mov.f64 %fd62, %fd61; - @%p5 bra BB13_7; + @%p5 bra BB9_7; ld.shared.f64 %fd26, [%rd8+2048]; add.f64 %fd62, %fd61, %fd26; st.shared.f64 [%rd8], %fd62; -BB13_7: +BB9_7: mov.f64 %fd61, %fd62; bar.sync 0; -BB13_8: +BB9_8: mov.f64 %fd59, %fd61; setp.lt.u32 %p6, %r11, 256; - @%p6 bra BB13_12; + @%p6 bra BB9_12; setp.gt.u32 %p7, %r10, 127; mov.f64 %fd60, %fd59; - @%p7 bra BB13_11; + @%p7 bra BB9_11; ld.shared.f64 %fd27, [%rd8+1024]; add.f64 %fd60, %fd59, %fd27; st.shared.f64 [%rd8], %fd60; -BB13_11: +BB9_11: mov.f64 %fd59, %fd60; bar.sync 0; -BB13_12: +BB9_12: mov.f64 %fd57, %fd59; setp.lt.u32 %p8, %r11, 128; - @%p8 bra BB13_16; + @%p8 bra BB9_16; setp.gt.u32 %p9, %r10, 63; mov.f64 %fd58, %fd57; - @%p9 bra BB13_15; + @%p9 bra BB9_15; ld.shared.f64 %fd28, [%rd8+512]; add.f64 %fd58, %fd57, %fd28; st.shared.f64 [%rd8], %fd58; -BB13_15: +BB9_15: mov.f64 %fd57, %fd58; bar.sync 0; -BB13_16: +BB9_16: mov.f64 %fd56, %fd57; setp.gt.u32 %p10, %r10, 31; - @%p10 bra BB13_29; + @%p10 bra BB9_29; setp.lt.u32 %p11, %r11, 64; - @%p11 bra BB13_19; + @%p11 bra BB9_19; ld.volatile.shared.f64 %fd29, [%rd8+256]; add.f64 %fd56, %fd56, %fd29; st.volatile.shared.f64 [%rd8], %fd56; -BB13_19: +BB9_19: mov.f64 %fd55, %fd56; setp.lt.u32 %p12, %r11, 32; - @%p12 bra BB13_21; + @%p12 bra BB9_21; ld.volatile.shared.f64 %fd30, [%rd8+128]; add.f64 %fd55, %fd55, %fd30; st.volatile.shared.f64 [%rd8], %fd55; -BB13_21: +BB9_21: mov.f64 %fd54, %fd55; setp.lt.u32 %p13, %r11, 16; - @%p13 bra BB13_23; + @%p13 bra BB9_23; ld.volatile.shared.f64 %fd31, [%rd8+64]; add.f64 %fd54, %fd54, %fd31; st.volatile.shared.f64 [%rd8], %fd54; -BB13_23: +BB9_23: mov.f64 %fd53, %fd54; setp.lt.u32 %p14, %r11, 8; - @%p14 bra BB13_25; + @%p14 bra BB9_25; ld.volatile.shared.f64 %fd32, [%rd8+32]; add.f64 %fd53, %fd53, %fd32; st.volatile.shared.f64 [%rd8], %fd53; -BB13_25: +BB9_25: mov.f64 %fd52, %fd53; setp.lt.u32 %p15, %r11, 4; - @%p15 bra BB13_27; + @%p15 bra BB9_27; ld.volatile.shared.f64 %fd33, [%rd8+16]; add.f64 %fd52, %fd52, %fd33; st.volatile.shared.f64 [%rd8], %fd52; -BB13_27: +BB9_27: setp.lt.u32 %p16, %r11, 2; - @%p16 bra BB13_29; + @%p16 bra BB9_29; ld.volatile.shared.f64 %fd34, [%rd8+8]; add.f64 %fd35, %fd52, %fd34; st.volatile.shared.f64 [%rd8], %fd35; -BB13_29: +BB9_29: setp.ne.s32 %p17, %r10, 0; - @%p17 bra BB13_31; + @%p17 bra BB9_31; ld.shared.f64 %fd36, [sdata]; cvta.to.global.u64 %rd36, %rd2; @@ -2254,7 +1473,7 @@ BB13_29: add.s64 %rd38, %rd36, %rd37; st.global.f64 [%rd38], %fd36; -BB13_31: +BB9_31: ret; } @@ -2281,18 +1500,18 @@ BB13_31: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB14_5; + @%p1 bra BB10_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d0000000000000000; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB14_4; + @%p2 bra BB10_4; mov.u32 %r10, %r1; -BB14_3: +BB10_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -2302,15 +1521,15 @@ BB14_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB14_3; + @%p3 bra BB10_3; -BB14_4: +BB10_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB14_5: +BB10_5: ret; } @@ -2338,9 +1557,9 @@ BB14_5: mov.f64 %fd67, 0d0000000000000000; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB15_4; + @%p1 bra BB11_4; -BB15_1: +BB11_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -2349,23 +1568,23 @@ BB15_1: add.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB15_3; + @%p2 bra BB11_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; add.f64 %fd69, %fd69, %fd28; -BB15_3: +BB11_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB15_1; + @%p3 bra BB11_1; -BB15_4: +BB11_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2373,113 +1592,113 @@ BB15_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB15_8; + @%p4 bra BB11_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB15_7; + @%p5 bra BB11_7; ld.shared.f64 %fd29, [%rd1+2048]; add.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB15_7: +BB11_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB15_8: +BB11_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB15_12; + @%p6 bra BB11_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB15_11; + @%p7 bra BB11_11; ld.shared.f64 %fd30, [%rd1+1024]; add.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB15_11: +BB11_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB15_12: +BB11_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB15_16; + @%p8 bra BB11_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB15_15; + @%p9 bra BB11_15; ld.shared.f64 %fd31, [%rd1+512]; add.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB15_15: +BB11_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB15_16: +BB11_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB15_29; + @%p10 bra BB11_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB15_19; + @%p11 bra BB11_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; add.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB15_19: +BB11_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB15_21; + @%p12 bra BB11_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; add.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB15_21: +BB11_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB15_23; + @%p13 bra BB11_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; add.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB15_23: +BB11_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB15_25; + @%p14 bra BB11_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; add.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB15_25: +BB11_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB15_27; + @%p15 bra BB11_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; add.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB15_27: +BB11_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB15_29; + @%p16 bra BB11_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; add.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB15_29: +BB11_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB15_31; + @%p17 bra BB11_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2487,7 +1706,7 @@ BB15_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB15_31: +BB11_31: ret; } @@ -2515,9 +1734,9 @@ BB15_31: mov.f64 %fd67, 0d0010000000000000; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB16_4; + @%p1 bra BB12_4; -BB16_1: +BB12_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -2526,23 +1745,23 @@ BB16_1: max.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB16_3; + @%p2 bra BB12_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; max.f64 %fd69, %fd69, %fd28; -BB16_3: +BB12_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB16_1; + @%p3 bra BB12_1; -BB16_4: +BB12_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2550,113 +1769,113 @@ BB16_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB16_8; + @%p4 bra BB12_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB16_7; + @%p5 bra BB12_7; ld.shared.f64 %fd29, [%rd1+2048]; max.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB16_7: +BB12_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB16_8: +BB12_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB16_12; + @%p6 bra BB12_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB16_11; + @%p7 bra BB12_11; ld.shared.f64 %fd30, [%rd1+1024]; max.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB16_11: +BB12_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB16_12: +BB12_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB16_16; + @%p8 bra BB12_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB16_15; + @%p9 bra BB12_15; ld.shared.f64 %fd31, [%rd1+512]; max.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB16_15: +BB12_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB16_16: +BB12_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB16_29; + @%p10 bra BB12_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB16_19; + @%p11 bra BB12_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; max.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB16_19: +BB12_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB16_21; + @%p12 bra BB12_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; max.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB16_21: +BB12_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB16_23; + @%p13 bra BB12_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; max.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB16_23: +BB12_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB16_25; + @%p14 bra BB12_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; max.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB16_25: +BB12_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB16_27; + @%p15 bra BB12_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; max.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB16_27: +BB12_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB16_29; + @%p16 bra BB12_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; max.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB16_29: +BB12_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB16_31; + @%p17 bra BB12_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2664,7 +1883,7 @@ BB16_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB16_31: +BB12_31: ret; } @@ -2692,9 +1911,9 @@ BB16_31: mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB17_4; + @%p1 bra BB13_4; -BB17_1: +BB13_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -2703,23 +1922,23 @@ BB17_1: min.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB17_3; + @%p2 bra BB13_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; min.f64 %fd69, %fd69, %fd28; -BB17_3: +BB13_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB17_1; + @%p3 bra BB13_1; -BB17_4: +BB13_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2727,113 +1946,113 @@ BB17_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB17_8; + @%p4 bra BB13_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB17_7; + @%p5 bra BB13_7; ld.shared.f64 %fd29, [%rd1+2048]; min.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB17_7: +BB13_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB17_8: +BB13_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB17_12; + @%p6 bra BB13_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB17_11; + @%p7 bra BB13_11; ld.shared.f64 %fd30, [%rd1+1024]; min.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB17_11: +BB13_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB17_12: +BB13_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB17_16; + @%p8 bra BB13_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB17_15; + @%p9 bra BB13_15; ld.shared.f64 %fd31, [%rd1+512]; min.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB17_15: +BB13_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB17_16: +BB13_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB17_29; + @%p10 bra BB13_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB17_19; + @%p11 bra BB13_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; min.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB17_19: +BB13_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB17_21; + @%p12 bra BB13_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; min.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB17_21: +BB13_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB17_23; + @%p13 bra BB13_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; min.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB17_23: +BB13_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB17_25; + @%p14 bra BB13_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; min.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB17_25: +BB13_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB17_27; + @%p15 bra BB13_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; min.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB17_27: +BB13_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB17_29; + @%p16 bra BB13_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; min.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB17_29: +BB13_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB17_31; + @%p17 bra BB13_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2841,7 +2060,7 @@ BB17_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB17_31: +BB13_31: ret; } @@ -2850,10 +2069,10 @@ BB17_31: .param .b64 __internal_accurate_pow_param_1 ) { - .reg .pred %p<9>; + .reg .pred %p<10>; .reg .f32 %f<3>; .reg .b32 %r<52>; - .reg .f64 %fd<135>; + .reg .f64 %fd<134>; ld.param.f64 %fd12, [__internal_accurate_pow_param_0]; @@ -2868,7 +2087,7 @@ BB17_31: } shr.u32 %r50, %r49, 20; setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB18_2; + @%p1 bra BB14_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -2882,28 +2101,28 @@ BB17_31: shr.u32 %r16, %r49, 20; add.s32 %r50, %r16, -54; -BB18_2: +BB14_2: add.s32 %r51, %r50, -1023; and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; - mov.b64 %fd133, {%r48, %r18}; + mov.b64 %fd132, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB18_4; + @%p2 bra BB14_4; { .reg .b32 %temp; - mov.b64 {%r19, %temp}, %fd133; + mov.b64 {%r19, %temp}, %fd132; } { .reg .b32 %temp; - mov.b64 {%temp, %r20}, %fd133; + mov.b64 {%temp, %r20}, %fd132; } add.s32 %r21, %r20, -1048576; - mov.b64 %fd133, {%r19, %r21}; + mov.b64 %fd132, {%r19, %r21}; add.s32 %r51, %r50, -1022; -BB18_4: - add.f64 %fd16, %fd133, 0d3FF0000000000000; +BB14_4: + add.f64 %fd16, %fd132, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; // inline asm @@ -2912,7 +2131,7 @@ BB18_4: fma.rn.f64 %fd19, %fd17, %fd15, %fd18; fma.rn.f64 %fd20, %fd19, %fd19, %fd19; fma.rn.f64 %fd21, %fd20, %fd15, %fd15; - add.f64 %fd22, %fd133, 0dBFF0000000000000; + add.f64 %fd22, %fd132, 0dBFF0000000000000; mul.f64 %fd23, %fd22, %fd21; fma.rn.f64 %fd24, %fd22, %fd21, %fd23; mul.f64 %fd25, %fd24, %fd24; @@ -3015,52 +2234,51 @@ BB18_4: add.f64 %fd4, %fd94, %fd97; sub.f64 %fd98, %fd94, %fd4; add.f64 %fd5, %fd97, %fd98; - mov.f64 %fd99, 0d3FF71547652B82FE; - mul.rn.f64 %fd100, %fd4, %fd99; - mov.f64 %fd101, 0d4338000000000000; - add.rn.f64 %fd102, %fd100, %fd101; + mov.f64 %fd99, 0d4338000000000000; + mov.f64 %fd100, 0d3FF71547652B82FE; + fma.rn.f64 %fd101, %fd4, %fd100, %fd99; { .reg .b32 %temp; - mov.b64 {%r13, %temp}, %fd102; + mov.b64 {%r13, %temp}, %fd101; } - mov.f64 %fd103, 0dC338000000000000; - add.rn.f64 %fd104, %fd102, %fd103; - mov.f64 %fd105, 0dBFE62E42FEFA39EF; - fma.rn.f64 %fd106, %fd104, %fd105, %fd4; - mov.f64 %fd107, 0dBC7ABC9E3B39803F; - fma.rn.f64 %fd108, %fd104, %fd107, %fd106; - mov.f64 %fd109, 0d3E928AF3FCA213EA; - mov.f64 %fd110, 0d3E5ADE1569CE2BDF; - fma.rn.f64 %fd111, %fd110, %fd108, %fd109; - mov.f64 %fd112, 0d3EC71DEE62401315; - fma.rn.f64 %fd113, %fd111, %fd108, %fd112; - mov.f64 %fd114, 0d3EFA01997C89EB71; - fma.rn.f64 %fd115, %fd113, %fd108, %fd114; - mov.f64 %fd116, 0d3F2A01A014761F65; - fma.rn.f64 %fd117, %fd115, %fd108, %fd116; - mov.f64 %fd118, 0d3F56C16C1852B7AF; - fma.rn.f64 %fd119, %fd117, %fd108, %fd118; - mov.f64 %fd120, 0d3F81111111122322; - fma.rn.f64 %fd121, %fd119, %fd108, %fd120; - mov.f64 %fd122, 0d3FA55555555502A1; - fma.rn.f64 %fd123, %fd121, %fd108, %fd122; - mov.f64 %fd124, 0d3FC5555555555511; - fma.rn.f64 %fd125, %fd123, %fd108, %fd124; - mov.f64 %fd126, 0d3FE000000000000B; - fma.rn.f64 %fd127, %fd125, %fd108, %fd126; - fma.rn.f64 %fd128, %fd127, %fd108, %fd18; - fma.rn.f64 %fd129, %fd128, %fd108, %fd18; + mov.f64 %fd102, 0dC338000000000000; + add.rn.f64 %fd103, %fd101, %fd102; + mov.f64 %fd104, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd105, %fd103, %fd104, %fd4; + mov.f64 %fd106, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd107, %fd103, %fd106, %fd105; + mov.f64 %fd108, 0d3E928AF3FCA213EA; + mov.f64 %fd109, 0d3E5ADE1569CE2BDF; + fma.rn.f64 %fd110, %fd109, %fd107, %fd108; + mov.f64 %fd111, 0d3EC71DEE62401315; + fma.rn.f64 %fd112, %fd110, %fd107, %fd111; + mov.f64 %fd113, 0d3EFA01997C89EB71; + fma.rn.f64 %fd114, %fd112, %fd107, %fd113; + mov.f64 %fd115, 0d3F2A01A014761F65; + fma.rn.f64 %fd116, %fd114, %fd107, %fd115; + mov.f64 %fd117, 0d3F56C16C1852B7AF; + fma.rn.f64 %fd118, %fd116, %fd107, %fd117; + mov.f64 %fd119, 0d3F81111111122322; + fma.rn.f64 %fd120, %fd118, %fd107, %fd119; + mov.f64 %fd121, 0d3FA55555555502A1; + fma.rn.f64 %fd122, %fd120, %fd107, %fd121; + mov.f64 %fd123, 0d3FC5555555555511; + fma.rn.f64 %fd124, %fd122, %fd107, %fd123; + mov.f64 %fd125, 0d3FE000000000000B; + fma.rn.f64 %fd126, %fd124, %fd107, %fd125; + fma.rn.f64 %fd127, %fd126, %fd107, %fd18; + fma.rn.f64 %fd128, %fd127, %fd107, %fd18; { .reg .b32 %temp; - mov.b64 {%r14, %temp}, %fd129; + mov.b64 {%r14, %temp}, %fd128; } { .reg .b32 %temp; - mov.b64 {%temp, %r15}, %fd129; + mov.b64 {%temp, %r15}, %fd128; } shl.b32 %r33, %r13, 20; add.s32 %r34, %r15, %r33; - mov.b64 %fd134, {%r14, %r34}; + mov.b64 %fd133, {%r14, %r34}; { .reg .b32 %temp; mov.b64 {%temp, %r35}, %fd4; @@ -3068,48 +2286,48 @@ BB18_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB18_7; + @%p4 bra BB14_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; - add.f64 %fd130, %fd4, 0d7FF0000000000000; - selp.f64 %fd134, 0d0000000000000000, %fd130, %p5; + add.f64 %fd129, %fd4, 0d7FF0000000000000; + selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB18_7; + @%p6 bra BB14_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; shr.s32 %r38, %r37, 1; shl.b32 %r39, %r38, 20; add.s32 %r40, %r39, %r15; - mov.b64 %fd131, {%r14, %r40}; + mov.b64 %fd130, {%r14, %r40}; sub.s32 %r41, %r13, %r38; shl.b32 %r42, %r41, 20; add.s32 %r43, %r42, 1072693248; mov.u32 %r44, 0; - mov.b64 %fd132, {%r44, %r43}; - mul.f64 %fd134, %fd131, %fd132; + mov.b64 %fd131, {%r44, %r43}; + mul.f64 %fd133, %fd130, %fd131; -BB18_7: +BB14_7: { .reg .b32 %temp; - mov.b64 {%temp, %r45}, %fd134; + mov.b64 {%temp, %r45}, %fd133; } and.b32 %r46, %r45, 2147483647; setp.ne.s32 %p7, %r46, 2146435072; - @%p7 bra BB18_9; - { .reg .b32 %temp; - mov.b64 {%r47, %temp}, %fd134; + mov.b64 {%r47, %temp}, %fd133; } - setp.eq.s32 %p8, %r47, 0; - @%p8 bra BB18_10; + setp.ne.s32 %p8, %r47, 0; + or.pred %p9, %p8, %p7; + @!%p9 bra BB14_9; + bra.uni BB14_8; -BB18_9: - fma.rn.f64 %fd134, %fd134, %fd5, %fd134; +BB14_8: + fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB18_10: - st.param.f64 [func_retval0+0], %fd134; +BB14_9: + st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/hops/BinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java index 7d97e44..7321d68 100644 --- a/src/main/java/org/apache/sysml/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java @@ -36,6 +36,7 @@ import org.apache.sysml.lops.CentralMoment; import org.apache.sysml.lops.CoVariance; import org.apache.sysml.lops.CombineBinary; import org.apache.sysml.lops.CombineUnary; +import org.apache.sysml.lops.ConvolutionTransform; import org.apache.sysml.lops.Data; import org.apache.sysml.lops.DataPartition; import org.apache.sysml.lops.Group; @@ -593,7 +594,22 @@ public class BinaryOp extends Hop et = ExecType.GPU; } - Binary binary = new Binary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), HopsOpOp2LopsB.get(op), + Lop binary = null; + + boolean isLeftXGt = (getInput().get(0) instanceof BinaryOp) && ((BinaryOp) getInput().get(0)).getOp() == OpOp2.GREATER; + Hop potentialZero = isLeftXGt ? ((BinaryOp) getInput().get(0)).getInput().get(1) : null; + + boolean isLeftXGt0 = isLeftXGt && potentialZero != null + && potentialZero instanceof LiteralOp && ((LiteralOp) potentialZero).getDoubleValue() == 0; + + if(op == OpOp2.MULT && isLeftXGt0 && + !getInput().get(0).isVector() && !getInput().get(1).isVector()) { + binary = new ConvolutionTransform(getInput().get(0).getInput().get(0).constructLops(), + getInput().get(1).constructLops(), + ConvolutionTransform.OperationTypes.RELU_BACKWARD, getDataType(), getValueType(), et, -1); + } + else + binary = new Binary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), HopsOpOp2LopsB.get(op), getDataType(), getValueType(), et); setOutputDimensions(binary); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java index d69bd93..6f2a20e 100644 --- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java +++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java @@ -30,7 +30,7 @@ public class ConvolutionTransform extends Lop public enum OperationTypes { - MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, + MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD, DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA, BIAS_ADD }; @@ -54,6 +54,16 @@ public class ConvolutionTransform extends Lop init(input, op, dt, vt, et); numThreads = k; } + + public ConvolutionTransform(Lop input1, Lop input2, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) + { + super(Lop.Type.Transform, dt, vt); + init(input1, op, dt, vt, et); + numThreads = k; + this.addInput(input2); + input2.addOutput(this); + setLevel(); + } private void init (Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et) { @@ -102,6 +112,9 @@ public class ConvolutionTransform extends Lop case RELU_MAX_POOLING: return "relu_maxpooling"; + case RELU_BACKWARD: + return "relu_backward"; + case MAX_POOLING_BACKWARD: return "maxpooling_backward"; @@ -124,7 +137,7 @@ public class ConvolutionTransform extends Lop } public String getInstructions(String input, String bias, String output) throws LopsException { - if(operation == OperationTypes.BIAS_ADD) { + if(operation == OperationTypes.BIAS_ADD || operation == OperationTypes.RELU_BACKWARD) { StringBuilder sb = new StringBuilder(); sb.append( getExecType() ); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java index 3355a6e..11d4661 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java @@ -220,6 +220,7 @@ public class CPInstructionParser extends InstructionParser String2CPInstructionType.put( "rsort" , CPINSTRUCTION_TYPE.Reorg); // Opcodes related to convolutions + String2CPInstructionType.put( "relu_backward" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "relu_maxpooling" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "maxpooling" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "maxpooling_backward" , CPINSTRUCTION_TYPE.Convolution); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 9dce34a..5e3ab62 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -39,6 +39,7 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType = new HashMap<String, GPUINSTRUCTION_TYPE>(); // Neural Network Operators + String2GPUInstructionType.put( "relu_backward", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Convolution); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 56f1460..9c115c6 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -46,8 +46,8 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads) throws DMLRuntimeException { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); - if(!opcode.equals("bias_add")) { - throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add, but found " + opcode); + if( !(opcode.equals("bias_add") || opcode.equals("relu_backward")) ) { + throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or relu_backward, but found " + opcode); } _in2 = in2; _cptype = CPINSTRUCTION_TYPE.Convolution; @@ -153,7 +153,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride, padding, input_shape, filter_shape, k); } - else if (opcode.equalsIgnoreCase("bias_add")) { + else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward")) { InstructionUtils.checkNumFields(parts, 4); in.split(parts[1]); CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN); @@ -174,6 +174,26 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { .getLongValue(); } + public void processReluBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException { + // (X > 0) * dout + MatrixBlock outputBlock = null; + MatrixBlock input = ec.getMatrixInput(input1.getName()); + MatrixBlock dout = ec.getMatrixInput(_in2.getName()); + + if(input.isEmptyBlock() || dout.isEmptyBlock()) { + outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true, 0); + } + else { + outputBlock = getDenseOutputBlock(ec, input.getNumRows(), input.getNumColumns()); + LibMatrixDNN.relu_backward(input, dout, outputBlock, _numThreads); + } + + // release inputs/outputs + ec.releaseMatrixInput(input1.getName()); + ec.releaseMatrixInput(_in2.getName()); + ec.setMatrixOutput(getOutputVariableName(), outputBlock); + } + public void processBiasInstruction(ExecutionContext ec) throws DMLRuntimeException { MatrixBlock outputBlock = null; MatrixBlock input = ec.getMatrixInput(input1.getName()); @@ -210,6 +230,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { processBiasInstruction(ec); return; } + else if (instOpcode.equalsIgnoreCase("relu_backward")) { + processReluBackwardInstruction(ec); + return; + } // acquire inputs MatrixBlock outputBlock = null;