[SYSTEMML-540] Improved the performance of bias_add and added relu_backward (CP + GPU)

Closes #337.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/afe61b5a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/afe61b5a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/afe61b5a

Branch: refs/heads/master
Commit: afe61b5a295704f3a994f522a2f91d9c242d5c4c
Parents: b0fb707
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Tue Jan 10 08:35:56 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Tue Jan 10 08:35:56 2017 -0800

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   12 +-
 src/main/cpp/kernels/SystemML.ptx               | 2410 ++++++------------
 .../java/org/apache/sysml/hops/BinaryOp.java    |   18 +-
 .../apache/sysml/lops/ConvolutionTransform.java |   17 +-
 .../instructions/CPInstructionParser.java       |    1 +
 .../instructions/GPUInstructionParser.java      |    1 +
 .../cp/ConvolutionCPInstruction.java            |   30 +-
 .../gpu/ConvolutionGPUInstruction.java          |   24 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |   18 +
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  139 +-
 .../functions/tensor/ReluBackwardTest.java      |  117 +
 .../scripts/functions/tensor/ReluBackwardTest.R |   30 +
 .../functions/tensor/ReluBackwardTest.dml       |   27 +
 13 files changed, 1215 insertions(+), 1629 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 1812b6a..7e32f0e 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
 /**********************************
 When updating a kernel or adding a new one,
 please compile the ptx file and commit it:
-nvcc -ptx SystemML.cu
+nvcc -ptx -arch=sm_30 SystemML.cu
 ***********************************/
 
 #include <cfloat>
@@ -116,6 +116,16 @@ __global__ void relu(double* A,  double* ret, int rlen, int clen) {
        }
 }
 
+extern "C"
+__global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, int clen) {
+       int ix = blockIdx.x * blockDim.x + threadIdx.x;
+       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       if(ix < rlen && iy < clen) {
+               int index = ix * clen + iy;
+               ret[index] = X[index] > 0 ?  dout[index] : 0;
+       }
+}
+
 // Compares the value and set
 extern "C"
 __global__ void compareAndSet(double* A,  double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 99d5898..e30e00a 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,859 +1,24 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-19856038
-// Cuda compilation tools, release 7.5, V7.5.17
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
-.version 4.3
+.version 5.0
 .target sm_30
 .address_size 64
 
-       // .globl       getBoolean
-.func  (.param .b64 func_retval0) __internal_accurate_pow
-(
-       .param .b64 __internal_accurate_pow_param_0,
-       .param .b64 __internal_accurate_pow_param_1
-)
-;
-.extern .shared .align 8 .b8 sdata[];
-
-.visible .func  (.param .b64 func_retval0) getBoolean(
-       .param .b32 getBoolean_param_0
-)
-{
-       .reg .pred      %p<2>;
-       .reg .b32       %r<2>;
-       .reg .f64       %fd<2>;
-
-
-       ld.param.u32    %r1, [getBoolean_param_0];
-       setp.eq.s32     %p1, %r1, 0;
-       selp.f64        %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
-       st.param.f64    [func_retval0+0], %fd1;
-       ret;
-}
-
-       // .globl       binaryOp
-.visible .func  (.param .b64 func_retval0) binaryOp(
-       .param .b64 binaryOp_param_0,
-       .param .b64 binaryOp_param_1,
-       .param .b32 binaryOp_param_2
-)
-{
-       .reg .pred      %p<41>;
-       .reg .b32       %r<30>;
-       .reg .f64       %fd<40>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.f64    %fd26, [binaryOp_param_0];
-       ld.param.f64    %fd27, [binaryOp_param_1];
-       ld.param.u32    %r3, [binaryOp_param_2];
-       setp.eq.s32     %p2, %r3, 0;
-       @%p2 bra        BB1_40;
-
-       setp.eq.s32     %p3, %r3, 1;
-       @%p3 bra        BB1_39;
-       bra.uni         BB1_2;
-
-BB1_39:
-       sub.f64         %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-
-BB1_40:
-       add.f64         %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-
-BB1_2:
-       setp.eq.s32     %p4, %r3, 2;
-       @%p4 bra        BB1_38;
-       bra.uni         BB1_3;
-
-BB1_38:
-       mul.f64         %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-
-BB1_3:
-       setp.eq.s32     %p5, %r3, 3;
-       @%p5 bra        BB1_37;
-       bra.uni         BB1_4;
-
-BB1_37:
-       div.rn.f64      %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-
-BB1_4:
-       setp.eq.s32     %p6, %r3, 4;
-       @%p6 bra        BB1_21;
-       bra.uni         BB1_5;
-
-BB1_21:
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r1}, %fd26;
-       }
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r2}, %fd27;
-       }
-       bfe.u32         %r4, %r2, 20, 11;
-       add.s32         %r5, %r4, -1012;
-       mov.b64          %rd2, %fd27;
-       shl.b64         %rd1, %rd2, %r5;
-       setp.eq.s64     %p21, %rd1, -9223372036854775808;
-       abs.f64         %fd9, %fd26;
-       // Callseq Start 0
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.f64    [param0+0], %fd9;
-       .param .b64 param1;
-       st.param.f64    [param1+0], %fd27;
-       .param .b64 retval0;
-       call.uni (retval0), 
-       __internal_accurate_pow, 
-       (
-       param0, 
-       param1
-       );
-       ld.param.f64    %fd38, [retval0+0];
-       
-       //{
-       }// Callseq End 0
-       setp.lt.s32     %p22, %r1, 0;
-       and.pred        %p1, %p22, %p21;
-       @!%p1 bra       BB1_23;
-       bra.uni         BB1_22;
-
-BB1_22:
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r6}, %fd38;
-       }
-       xor.b32         %r7, %r6, -2147483648;
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r8, %temp}, %fd38;
-       }
-       mov.b64         %fd38, {%r8, %r7};
-
-BB1_23:
-       mov.f64         %fd37, %fd38;
-       setp.eq.f64     %p23, %fd26, 0d0000000000000000;
-       @%p23 bra       BB1_26;
-       bra.uni         BB1_24;
-
-BB1_26:
-       selp.b32        %r9, %r1, 0, %p21;
-       or.b32          %r10, %r9, 2146435072;
-       setp.lt.s32     %p27, %r2, 0;
-       selp.b32        %r11, %r10, %r9, %p27;
-       mov.u32         %r12, 0;
-       mov.b64         %fd37, {%r12, %r11};
-       bra.uni         BB1_27;
-
-BB1_5:
-       setp.eq.s32     %p7, %r3, 5;
-       @%p7 bra        BB1_20;
-       bra.uni         BB1_6;
-
-BB1_20:
-       setp.lt.f64     %p20, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20;
-       bra.uni         BB1_41;
-
-BB1_6:
-       setp.eq.s32     %p8, %r3, 6;
-       @%p8 bra        BB1_19;
-       bra.uni         BB1_7;
-
-BB1_19:
-       setp.le.f64     %p19, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19;
-       bra.uni         BB1_41;
-
-BB1_24:
-       setp.gt.s32     %p24, %r1, -1;
-       @%p24 bra       BB1_27;
-
-       cvt.rzi.f64.f64 %fd29, %fd27;
-       setp.neu.f64    %p25, %fd29, %fd27;
-       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p25;
-
-BB1_27:
-       mov.f64         %fd15, %fd37;
-       add.f64         %fd16, %fd26, %fd27;
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r13}, %fd16;
-       }
-       and.b32         %r14, %r13, 2146435072;
-       setp.ne.s32     %p28, %r14, 2146435072;
-       mov.f64         %fd36, %fd15;
-       @%p28 bra       BB1_36;
-
-       setp.gtu.f64    %p29, %fd9, 0d7FF0000000000000;
-       mov.f64         %fd36, %fd16;
-       @%p29 bra       BB1_36;
-
-       abs.f64         %fd30, %fd27;
-       setp.gtu.f64    %p30, %fd30, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd16;
-       mov.f64         %fd36, %fd35;
-       @%p30 bra       BB1_36;
-
-       and.b32         %r15, %r2, 2147483647;
-       setp.ne.s32     %p31, %r15, 2146435072;
-       @%p31 bra       BB1_32;
-
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r16, %temp}, %fd27;
-       }
-       setp.eq.s32     %p32, %r16, 0;
-       @%p32 bra       BB1_35;
-
-BB1_32:
-       and.b32         %r17, %r1, 2147483647;
-       setp.ne.s32     %p33, %r17, 2146435072;
-       mov.f64         %fd33, %fd15;
-       mov.f64         %fd36, %fd33;
-       @%p33 bra       BB1_36;
-
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r18, %temp}, %fd26;
-       }
-       setp.ne.s32     %p34, %r18, 0;
-       mov.f64         %fd36, %fd15;
-       @%p34 bra       BB1_36;
-
-       shr.s32         %r19, %r2, 31;
-       and.b32         %r20, %r19, -2146435072;
-       add.s32         %r21, %r20, 2146435072;
-       or.b32          %r22, %r21, -2147483648;
-       selp.b32        %r23, %r22, %r21, %p1;
-       mov.u32         %r24, 0;
-       mov.b64         %fd36, {%r24, %r23};
-       bra.uni         BB1_36;
-
-BB1_7:
-       setp.eq.s32     %p9, %r3, 7;
-       @%p9 bra        BB1_18;
-       bra.uni         BB1_8;
-
-BB1_18:
-       setp.gt.f64     %p18, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18;
-       bra.uni         BB1_41;
-
-BB1_8:
-       setp.eq.s32     %p10, %r3, 8;
-       @%p10 bra       BB1_17;
-       bra.uni         BB1_9;
-
-BB1_17:
-       setp.ge.f64     %p17, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17;
-       bra.uni         BB1_41;
-
-BB1_9:
-       setp.eq.s32     %p11, %r3, 9;
-       @%p11 bra       BB1_16;
-       bra.uni         BB1_10;
-
-BB1_16:
-       setp.eq.f64     %p16, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16;
-       bra.uni         BB1_41;
-
-BB1_10:
-       setp.eq.s32     %p12, %r3, 10;
-       @%p12 bra       BB1_15;
-       bra.uni         BB1_11;
-
-BB1_15:
-       setp.neu.f64    %p15, %fd26, %fd27;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15;
-       bra.uni         BB1_41;
-
-BB1_35:
-       setp.gt.f64     %p35, %fd9, 0d3FF0000000000000;
-       selp.b32        %r25, 2146435072, 0, %p35;
-       xor.b32         %r26, %r25, 2146435072;
-       setp.lt.s32     %p36, %r2, 0;
-       selp.b32        %r27, %r26, %r25, %p36;
-       setp.eq.f64     %p37, %fd26, 0dBFF0000000000000;
-       selp.b32        %r28, 1072693248, %r27, %p37;
-       mov.u32         %r29, 0;
-       mov.b64         %fd36, {%r29, %r28};
-
-BB1_36:
-       setp.eq.f64     %p38, %fd27, 0d0000000000000000;
-       setp.eq.f64     %p39, %fd26, 0d3FF0000000000000;
-       or.pred         %p40, %p39, %p38;
-       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p40;
-
-BB1_41:
-       st.param.f64    [func_retval0+0], %fd39;
-       ret;
-
-BB1_11:
-       setp.eq.s32     %p13, %r3, 11;
-       @%p13 bra       BB1_14;
-       bra.uni         BB1_12;
-
-BB1_14:
-       min.f64         %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-
-BB1_12:
-       mov.f64         %fd39, 0dC08F380000000000;
-       setp.ne.s32     %p14, %r3, 12;
-       @%p14 bra       BB1_41;
-
-       max.f64         %fd39, %fd26, %fd27;
-       bra.uni         BB1_41;
-}
-
-       // .globl       _Z6reduceI5SumOpEvPdS1_jT_d
-.visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<18>;
-       .reg .b32       %r<31>;
-       .reg .f64       %fd<70>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd67, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r30, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB2_5;
-
-       mov.f64         %fd68, %fd67;
-
-BB2_2:
-       mov.f64         %fd1, %fd68;
-       mul.wide.u32    %rd4, %r30, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd26, [%rd5];
-       add.f64         %fd69, %fd1, %fd26;
-       add.s32         %r3, %r30, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB2_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd27, [%rd7];
-       add.f64         %fd69, %fd69, %fd27;
-
-BB2_4:
-       mov.f64         %fd68, %fd69;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r30, %r12, %r13, %r30;
-       setp.lt.u32     %p3, %r30, %r5;
-       mov.f64         %fd67, %fd68;
-       @%p3 bra        BB2_2;
-
-BB2_5:
-       mov.f64         %fd65, %fd67;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd65;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB2_9;
-
-       setp.gt.u32     %p5, %r6, 255;
-       mov.f64         %fd66, %fd65;
-       @%p5 bra        BB2_8;
-
-       ld.shared.f64   %fd28, [%rd1+2048];
-       add.f64         %fd66, %fd65, %fd28;
-       st.shared.f64   [%rd1], %fd66;
-
-BB2_8:
-       mov.f64         %fd65, %fd66;
-       bar.sync        0;
-
-BB2_9:
-       mov.f64         %fd63, %fd65;
-       setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB2_13;
-
-       setp.gt.u32     %p7, %r6, 127;
-       mov.f64         %fd64, %fd63;
-       @%p7 bra        BB2_12;
-
-       ld.shared.f64   %fd29, [%rd1+1024];
-       add.f64         %fd64, %fd63, %fd29;
-       st.shared.f64   [%rd1], %fd64;
-
-BB2_12:
-       mov.f64         %fd63, %fd64;
-       bar.sync        0;
-
-BB2_13:
-       mov.f64         %fd61, %fd63;
-       setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB2_17;
-
-       setp.gt.u32     %p9, %r6, 63;
-       mov.f64         %fd62, %fd61;
-       @%p9 bra        BB2_16;
-
-       ld.shared.f64   %fd30, [%rd1+512];
-       add.f64         %fd62, %fd61, %fd30;
-       st.shared.f64   [%rd1], %fd62;
-
-BB2_16:
-       mov.f64         %fd61, %fd62;
-       bar.sync        0;
-
-BB2_17:
-       mov.f64         %fd60, %fd61;
-       setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB2_30;
-
-       setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB2_20;
-
-       ld.volatile.shared.f64  %fd31, [%rd1+256];
-       add.f64         %fd60, %fd60, %fd31;
-       st.volatile.shared.f64  [%rd1], %fd60;
-
-BB2_20:
-       mov.f64         %fd59, %fd60;
-       setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB2_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+128];
-       add.f64         %fd59, %fd59, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd59;
-
-BB2_22:
-       mov.f64         %fd58, %fd59;
-       setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB2_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+64];
-       add.f64         %fd58, %fd58, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd58;
-
-BB2_24:
-       mov.f64         %fd57, %fd58;
-       setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB2_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+32];
-       add.f64         %fd57, %fd57, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd57;
-
-BB2_26:
-       mov.f64         %fd56, %fd57;
-       setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB2_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+16];
-       add.f64         %fd56, %fd56, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd56;
-
-BB2_28:
-       setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB2_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+8];
-       add.f64         %fd37, %fd56, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd37;
-
-BB2_30:
-       setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB2_32;
-
-       ld.shared.f64   %fd38, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd38;
-
-BB2_32:
-       ret;
-}
-
-       // .globl       _Z6reduceI5MaxOpEvPdS1_jT_d
-.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<18>;
-       .reg .b32       %r<31>;
-       .reg .f64       %fd<70>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd67, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r30, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB3_5;
-
-       mov.f64         %fd68, %fd67;
-
-BB3_2:
-       mov.f64         %fd1, %fd68;
-       mul.wide.u32    %rd4, %r30, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd26, [%rd5];
-       max.f64         %fd69, %fd1, %fd26;
-       add.s32         %r3, %r30, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB3_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd27, [%rd7];
-       max.f64         %fd69, %fd69, %fd27;
-
-BB3_4:
-       mov.f64         %fd68, %fd69;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r30, %r12, %r13, %r30;
-       setp.lt.u32     %p3, %r30, %r5;
-       mov.f64         %fd67, %fd68;
-       @%p3 bra        BB3_2;
-
-BB3_5:
-       mov.f64         %fd65, %fd67;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd65;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB3_9;
-
-       setp.gt.u32     %p5, %r6, 255;
-       mov.f64         %fd66, %fd65;
-       @%p5 bra        BB3_8;
-
-       ld.shared.f64   %fd28, [%rd1+2048];
-       max.f64         %fd66, %fd65, %fd28;
-       st.shared.f64   [%rd1], %fd66;
-
-BB3_8:
-       mov.f64         %fd65, %fd66;
-       bar.sync        0;
-
-BB3_9:
-       mov.f64         %fd63, %fd65;
-       setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB3_13;
-
-       setp.gt.u32     %p7, %r6, 127;
-       mov.f64         %fd64, %fd63;
-       @%p7 bra        BB3_12;
-
-       ld.shared.f64   %fd29, [%rd1+1024];
-       max.f64         %fd64, %fd63, %fd29;
-       st.shared.f64   [%rd1], %fd64;
-
-BB3_12:
-       mov.f64         %fd63, %fd64;
-       bar.sync        0;
-
-BB3_13:
-       mov.f64         %fd61, %fd63;
-       setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB3_17;
-
-       setp.gt.u32     %p9, %r6, 63;
-       mov.f64         %fd62, %fd61;
-       @%p9 bra        BB3_16;
-
-       ld.shared.f64   %fd30, [%rd1+512];
-       max.f64         %fd62, %fd61, %fd30;
-       st.shared.f64   [%rd1], %fd62;
-
-BB3_16:
-       mov.f64         %fd61, %fd62;
-       bar.sync        0;
-
-BB3_17:
-       mov.f64         %fd60, %fd61;
-       setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB3_30;
-
-       setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB3_20;
-
-       ld.volatile.shared.f64  %fd31, [%rd1+256];
-       max.f64         %fd60, %fd60, %fd31;
-       st.volatile.shared.f64  [%rd1], %fd60;
-
-BB3_20:
-       mov.f64         %fd59, %fd60;
-       setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB3_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+128];
-       max.f64         %fd59, %fd59, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd59;
-
-BB3_22:
-       mov.f64         %fd58, %fd59;
-       setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB3_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+64];
-       max.f64         %fd58, %fd58, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd58;
-
-BB3_24:
-       mov.f64         %fd57, %fd58;
-       setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB3_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+32];
-       max.f64         %fd57, %fd57, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd57;
-
-BB3_26:
-       mov.f64         %fd56, %fd57;
-       setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB3_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+16];
-       max.f64         %fd56, %fd56, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd56;
-
-BB3_28:
-       setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB3_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+8];
-       max.f64         %fd37, %fd56, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd37;
-
-BB3_30:
-       setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB3_32;
-
-       ld.shared.f64   %fd38, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd38;
-
-BB3_32:
-       ret;
-}
-
-       // .globl       _Z6reduceI5MinOpEvPdS1_jT_d
-.visible .func _Z6reduceI5MinOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<18>;
-       .reg .b32       %r<31>;
-       .reg .f64       %fd<70>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd67, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r30, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB4_5;
-
-       mov.f64         %fd68, %fd67;
-
-BB4_2:
-       mov.f64         %fd1, %fd68;
-       mul.wide.u32    %rd4, %r30, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd26, [%rd5];
-       min.f64         %fd69, %fd1, %fd26;
-       add.s32         %r3, %r30, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB4_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd27, [%rd7];
-       min.f64         %fd69, %fd69, %fd27;
-
-BB4_4:
-       mov.f64         %fd68, %fd69;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r30, %r12, %r13, %r30;
-       setp.lt.u32     %p3, %r30, %r5;
-       mov.f64         %fd67, %fd68;
-       @%p3 bra        BB4_2;
-
-BB4_5:
-       mov.f64         %fd65, %fd67;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd65;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB4_9;
-
-       setp.gt.u32     %p5, %r6, 255;
-       mov.f64         %fd66, %fd65;
-       @%p5 bra        BB4_8;
-
-       ld.shared.f64   %fd28, [%rd1+2048];
-       min.f64         %fd66, %fd65, %fd28;
-       st.shared.f64   [%rd1], %fd66;
-
-BB4_8:
-       mov.f64         %fd65, %fd66;
-       bar.sync        0;
-
-BB4_9:
-       mov.f64         %fd63, %fd65;
-       setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB4_13;
-
-       setp.gt.u32     %p7, %r6, 127;
-       mov.f64         %fd64, %fd63;
-       @%p7 bra        BB4_12;
-
-       ld.shared.f64   %fd29, [%rd1+1024];
-       min.f64         %fd64, %fd63, %fd29;
-       st.shared.f64   [%rd1], %fd64;
-
-BB4_12:
-       mov.f64         %fd63, %fd64;
-       bar.sync        0;
-
-BB4_13:
-       mov.f64         %fd61, %fd63;
-       setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB4_17;
-
-       setp.gt.u32     %p9, %r6, 63;
-       mov.f64         %fd62, %fd61;
-       @%p9 bra        BB4_16;
-
-       ld.shared.f64   %fd30, [%rd1+512];
-       min.f64         %fd62, %fd61, %fd30;
-       st.shared.f64   [%rd1], %fd62;
-
-BB4_16:
-       mov.f64         %fd61, %fd62;
-       bar.sync        0;
-
-BB4_17:
-       mov.f64         %fd60, %fd61;
-       setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB4_30;
-
-       setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB4_20;
-
-       ld.volatile.shared.f64  %fd31, [%rd1+256];
-       min.f64         %fd60, %fd60, %fd31;
-       st.volatile.shared.f64  [%rd1], %fd60;
-
-BB4_20:
-       mov.f64         %fd59, %fd60;
-       setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB4_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+128];
-       min.f64         %fd59, %fd59, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd59;
-
-BB4_22:
-       mov.f64         %fd58, %fd59;
-       setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB4_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+64];
-       min.f64         %fd58, %fd58, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd58;
-
-BB4_24:
-       mov.f64         %fd57, %fd58;
-       setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB4_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+32];
-       min.f64         %fd57, %fd57, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd57;
-
-BB4_26:
-       mov.f64         %fd56, %fd57;
-       setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB4_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+16];
-       min.f64         %fd56, %fd56, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd56;
-
-BB4_28:
-       setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB4_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+8];
-       min.f64         %fd37, %fd56, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd37;
-
-BB4_30:
-       setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB4_32;
-
-       ld.shared.f64   %fd38, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd38;
-
-BB4_32:
-       ret;
-}
-
        // .globl       copyUpperToLowerTriangleDense
+.func  (.param .b64 func_retval0) __internal_accurate_pow
+(
+       .param .b64 __internal_accurate_pow_param_0,
+       .param .b64 __internal_accurate_pow_param_1
+)
+;
+.extern .shared .align 8 .b8 sdata[];
+
 .visible .entry copyUpperToLowerTriangleDense(
        .param .u64 copyUpperToLowerTriangleDense_param_0,
        .param .u32 copyUpperToLowerTriangleDense_param_1,
@@ -881,10 +46,10 @@ BB4_32:
        setp.gt.s32     %p1, %r2, %r1;
        setp.lt.s32     %p2, %r3, %r5;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB5_2;
-       bra.uni         BB5_1;
+       @!%p3 bra       BB0_2;
+       bra.uni         BB0_1;
 
-BB5_1:
+BB0_1:
        cvta.to.global.u64      %rd2, %rd1;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd3, %r12, 8;
@@ -894,7 +59,7 @@ BB5_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB5_2:
+BB0_2:
        ret;
 }
 
@@ -927,14 +92,14 @@ BB5_2:
        mad.lo.s32      %r1, %r8, %r9, %r11;
        mul.lo.s32      %r12, %r3, %r2;
        setp.ge.s32     %p1, %r1, %r12;
-       @%p1 bra        BB6_2;
+       @%p1 bra        BB1_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB6_2:
+BB1_2:
        ret;
 }
 
@@ -968,10 +133,10 @@ BB6_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB7_2;
-       bra.uni         BB7_1;
+       @!%p3 bra       BB2_2;
+       bra.uni         BB2_1;
 
-BB7_1:
+BB2_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -980,7 +145,7 @@ BB7_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd1;
 
-BB7_2:
+BB2_2:
        ret;
 }
 
@@ -1013,10 +178,10 @@ BB7_2:
        setp.lt.s32     %p1, %r1, %r4;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB8_2;
-       bra.uni         BB8_1;
+       @!%p3 bra       BB3_2;
+       bra.uni         BB3_1;
 
-BB8_1:
+BB3_1:
        cvta.to.global.u64      %rd3, %rd1;
        mad.lo.s32      %r11, %r1, %r3, %r2;
        mul.wide.s32    %rd4, %r11, 8;
@@ -1028,7 +193,67 @@ BB8_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB8_2:
+BB3_2:
+       ret;
+}
+
+       // .globl       relu_backward
+.visible .entry relu_backward(
+       .param .u64 relu_backward_param_0,
+       .param .u64 relu_backward_param_1,
+       .param .u64 relu_backward_param_2,
+       .param .u32 relu_backward_param_3,
+       .param .u32 relu_backward_param_4
+)
+{
+       .reg .pred      %p<5>;
+       .reg .b32       %r<12>;
+       .reg .f64       %fd<6>;
+       .reg .b64       %rd<14>;
+
+
+       ld.param.u64    %rd2, [relu_backward_param_0];
+       ld.param.u64    %rd3, [relu_backward_param_1];
+       ld.param.u64    %rd4, [relu_backward_param_2];
+       ld.param.u32    %r4, [relu_backward_param_3];
+       ld.param.u32    %r3, [relu_backward_param_4];
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r5, %r6, %r7;
+       mov.u32         %r8, %ntid.y;
+       mov.u32         %r9, %ctaid.y;
+       mov.u32         %r10, %tid.y;
+       mad.lo.s32      %r2, %r8, %r9, %r10;
+       setp.lt.s32     %p1, %r1, %r4;
+       setp.lt.s32     %p2, %r2, %r3;
+       and.pred        %p3, %p1, %p2;
+       @!%p3 bra       BB4_4;
+       bra.uni         BB4_1;
+
+BB4_1:
+       cvta.to.global.u64      %rd5, %rd2;
+       mad.lo.s32      %r11, %r1, %r3, %r2;
+       cvt.s64.s32     %rd1, %r11;
+       mul.wide.s32    %rd6, %r11, 8;
+       add.s64         %rd7, %rd5, %rd6;
+       ld.global.f64   %fd4, [%rd7];
+       mov.f64         %fd5, 0d0000000000000000;
+       setp.leu.f64    %p4, %fd4, 0d0000000000000000;
+       @%p4 bra        BB4_3;
+
+       cvta.to.global.u64      %rd8, %rd3;
+       shl.b64         %rd9, %rd1, 3;
+       add.s64         %rd10, %rd8, %rd9;
+       ld.global.f64   %fd5, [%rd10];
+
+BB4_3:
+       cvta.to.global.u64      %rd11, %rd4;
+       shl.b64         %rd12, %rd1, 3;
+       add.s64         %rd13, %rd11, %rd12;
+       st.global.f64   [%rd13], %fd5;
+
+BB4_4:
        ret;
 }
 
@@ -1072,10 +297,10 @@ BB8_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB9_6;
-       bra.uni         BB9_1;
+       @!%p3 bra       BB5_6;
+       bra.uni         BB5_1;
 
-BB9_1:
+BB5_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -1085,26 +310,26 @@ BB9_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB9_5;
-       bra.uni         BB9_2;
+       @%p4 bra        BB5_5;
+       bra.uni         BB5_2;
 
-BB9_5:
+BB5_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB9_6;
+       bra.uni         BB5_6;
 
-BB9_2:
+BB5_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB9_4;
-       bra.uni         BB9_3;
+       @%p5 bra        BB5_4;
+       bra.uni         BB5_3;
 
-BB9_4:
+BB5_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB9_6;
+       bra.uni         BB5_6;
 
-BB9_3:
+BB5_3:
        st.global.f64   [%rd1], %fd6;
 
-BB9_6:
+BB5_6:
        ret;
 }
 
@@ -1120,9 +345,9 @@ BB9_6:
        .param .u32 binCellOp_param_7
 )
 {
-       .reg .pred      %p<52>;
-       .reg .b32       %r<56>;
-       .reg .f64       %fd<40>;
+       .reg .pred      %p<54>;
+       .reg .b32       %r<55>;
+       .reg .f64       %fd<39>;
        .reg .b64       %rd<15>;
 
 
@@ -1145,93 +370,93 @@ BB9_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB10_55;
-       bra.uni         BB10_1;
+       @!%p4 bra       BB6_53;
+       bra.uni         BB6_1;
 
-BB10_1:
+BB6_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r54, %r1;
-       @%p5 bra        BB10_5;
+       mov.u32         %r53, %r1;
+       @%p5 bra        BB6_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r55, %r3;
-       @%p6 bra        BB10_4;
+       mov.u32         %r54, %r3;
+       @%p6 bra        BB6_4;
 
-       mov.u32         %r55, %r2;
+       mov.u32         %r54, %r2;
 
-BB10_4:
-       mov.u32         %r49, %r55;
-       mov.u32         %r4, %r49;
-       mov.u32         %r54, %r4;
+BB6_4:
+       mov.u32         %r48, %r54;
+       mov.u32         %r4, %r48;
+       mov.u32         %r53, %r4;
 
-BB10_5:
-       mov.u32         %r5, %r54;
+BB6_5:
+       mov.u32         %r5, %r53;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r52, %r1;
-       @%p7 bra        BB10_9;
+       mov.u32         %r51, %r1;
+       @%p7 bra        BB6_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r53, %r3;
-       @%p8 bra        BB10_8;
+       mov.u32         %r52, %r3;
+       @%p8 bra        BB6_8;
 
-       mov.u32         %r53, %r2;
+       mov.u32         %r52, %r2;
 
-BB10_8:
-       mov.u32         %r52, %r53;
+BB6_8:
+       mov.u32         %r51, %r52;
 
-BB10_9:
+BB6_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r52, 8;
+       mul.wide.s32    %rd9, %r51, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd39, 0dC08F380000000000;
+       mov.f64         %fd38, 0dC08F380000000000;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB10_19;
+       @%p9 bra        BB6_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB10_15;
+       @%p19 bra       BB6_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB10_53;
+       @%p23 bra       BB6_51;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB10_52;
-       bra.uni         BB10_13;
+       @%p24 bra       BB6_50;
+       bra.uni         BB6_13;
 
-BB10_52:
-       sub.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+BB6_50:
+       sub.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_19:
+BB6_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB10_24;
+       @%p10 bra       BB6_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB10_34;
+       @%p16 bra       BB6_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB10_33;
-       bra.uni         BB10_22;
+       @%p17 bra       BB6_33;
+       bra.uni         BB6_22;
 
-BB10_33:
+BB6_33:
        setp.gt.f64     %p29, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
+       bra.uni         BB6_52;
 
-BB10_15:
+BB6_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB10_51;
+       @%p20 bra       BB6_49;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB10_35;
-       bra.uni         BB10_17;
+       @%p21 bra       BB6_35;
+       bra.uni         BB6_17;
 
-BB10_35:
+BB6_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -1246,7 +471,7 @@ BB10_35:
        shl.b64         %rd1, %rd11, %r22;
        setp.eq.s64     %p32, %rd1, -9223372036854775808;
        abs.f64         %fd11, %fd1;
-       // Callseq Start 1
+       // Callseq Start 0
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -1261,133 +486,133 @@ BB10_35:
        param0, 
        param1
        );
-       ld.param.f64    %fd38, [retval0+0];
+       ld.param.f64    %fd37, [retval0+0];
        
        //{
-       }// Callseq End 1
+       }// Callseq End 0
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB10_37;
-       bra.uni         BB10_36;
+       @!%p1 bra       BB6_37;
+       bra.uni         BB6_36;
 
-BB10_36:
+BB6_36:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd38;
+       mov.b64         {%temp, %r23}, %fd37;
        }
        xor.b32         %r24, %r23, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r25, %temp}, %fd38;
+       mov.b64         {%r25, %temp}, %fd37;
        }
-       mov.b64         %fd38, {%r25, %r24};
+       mov.b64         %fd37, {%r25, %r24};
 
-BB10_37:
-       mov.f64         %fd37, %fd38;
+BB6_37:
+       mov.f64         %fd36, %fd37;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB10_40;
-       bra.uni         BB10_38;
+       @%p34 bra       BB6_40;
+       bra.uni         BB6_38;
 
-BB10_40:
+BB6_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
-       mov.b64         %fd37, {%r29, %r28};
-       bra.uni         BB10_41;
+       mov.b64         %fd36, {%r29, %r28};
+       bra.uni         BB6_41;
 
-BB10_24:
+BB6_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB10_28;
+       @%p11 bra       BB6_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB10_32;
-       bra.uni         BB10_26;
+       @%p14 bra       BB6_32;
+       bra.uni         BB6_26;
 
-BB10_32:
+BB6_32:
        setp.eq.f64     %p27, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB6_52;
 
-BB10_28:
+BB6_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB10_31;
-       bra.uni         BB10_29;
+       @%p12 bra       BB6_31;
+       bra.uni         BB6_29;
 
-BB10_31:
-       min.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+BB6_31:
+       min.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_53:
-       add.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+BB6_51:
+       add.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_13:
+BB6_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB10_14;
-       bra.uni         BB10_54;
+       @%p25 bra       BB6_14;
+       bra.uni         BB6_52;
 
-BB10_14:
-       mul.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+BB6_14:
+       mul.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_34:
+BB6_34:
        setp.le.f64     %p30, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
+       bra.uni         BB6_52;
 
-BB10_22:
+BB6_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB10_23;
-       bra.uni         BB10_54;
+       @%p18 bra       BB6_23;
+       bra.uni         BB6_52;
 
-BB10_23:
+BB6_23:
        setp.ge.f64     %p28, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
+       bra.uni         BB6_52;
 
-BB10_51:
-       div.rn.f64      %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+BB6_49:
+       div.rn.f64      %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_17:
+BB6_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB10_18;
-       bra.uni         BB10_54;
+       @%p22 bra       BB6_18;
+       bra.uni         BB6_52;
 
-BB10_18:
+BB6_18:
        setp.lt.f64     %p31, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
+       bra.uni         BB6_52;
 
-BB10_26:
+BB6_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB10_27;
-       bra.uni         BB10_54;
+       @%p15 bra       BB6_27;
+       bra.uni         BB6_52;
 
-BB10_27:
+BB6_27:
        setp.neu.f64    %p26, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB10_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB6_52;
 
-BB10_29:
+BB6_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB10_54;
+       @%p13 bra       BB6_52;
 
-       max.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB10_54;
+       max.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB6_52;
 
-BB10_38:
+BB6_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB10_41;
+       @%p35 bra       BB6_41;
 
        cvt.rzi.f64.f64 %fd29, %fd2;
        setp.neu.f64    %p36, %fd29, %fd2;
-       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p36;
+       selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
 
-BB10_41:
-       mov.f64         %fd17, %fd37;
+BB6_41:
+       mov.f64         %fd17, %fd36;
        add.f64         %fd18, %fd1, %fd2;
        {
        .reg .b32 %temp; 
@@ -1395,78 +620,76 @@ BB10_41:
        }
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
-       mov.f64         %fd36, %fd17;
-       @%p39 bra       BB10_50;
+       mov.f64         %fd35, %fd17;
+       @%p39 bra       BB6_48;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
-       mov.f64         %fd36, %fd18;
-       @%p40 bra       BB10_50;
+       mov.f64         %fd35, %fd18;
+       @%p40 bra       BB6_48;
 
        abs.f64         %fd30, %fd2;
        setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd18;
-       mov.f64         %fd36, %fd35;
-       @%p41 bra       BB10_50;
-
-       and.b32         %r32, %r9, 2147483647;
-       setp.ne.s32     %p42, %r32, 2146435072;
-       @%p42 bra       BB10_46;
+       mov.f64         %fd34, %fd18;
+       mov.f64         %fd35, %fd34;
+       @%p41 bra       BB6_48;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r33, %temp}, %fd2;
+       mov.b64         {%r32, %temp}, %fd2;
        }
-       setp.eq.s32     %p43, %r33, 0;
-       @%p43 bra       BB10_49;
-
-BB10_46:
-       and.b32         %r34, %r8, 2147483647;
-       setp.ne.s32     %p44, %r34, 2146435072;
-       mov.f64         %fd33, %fd17;
-       mov.f64         %fd36, %fd33;
-       @%p44 bra       BB10_50;
-
+       and.b32         %r33, %r9, 2147483647;
+       setp.eq.s32     %p42, %r33, 2146435072;
+       setp.eq.s32     %p43, %r32, 0;
+       and.pred        %p44, %p42, %p43;
+       @%p44 bra       BB6_47;
+       bra.uni         BB6_45;
+
+BB6_47:
+       setp.gt.f64     %p48, %fd11, 0d3FF0000000000000;
+       selp.b32        %r41, 2146435072, 0, %p48;
+       xor.b32         %r42, %r41, 2146435072;
+       setp.lt.s32     %p49, %r9, 0;
+       selp.b32        %r43, %r42, %r41, %p49;
+       setp.eq.f64     %p50, %fd1, 0dBFF0000000000000;
+       selp.b32        %r44, 1072693248, %r43, %p50;
+       mov.u32         %r45, 0;
+       mov.b64         %fd35, {%r45, %r44};
+       bra.uni         BB6_48;
+
+BB6_45:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd1;
+       mov.b64         {%r34, %temp}, %fd1;
        }
-       setp.ne.s32     %p45, %r35, 0;
-       mov.f64         %fd36, %fd17;
-       @%p45 bra       BB10_50;
-
+       and.b32         %r35, %r8, 2147483647;
+       setp.eq.s32     %p45, %r35, 2146435072;
+       setp.eq.s32     %p46, %r34, 0;
+       and.pred        %p47, %p45, %p46;
+       mov.f64         %fd35, %fd17;
+       @!%p47 bra      BB6_48;
+       bra.uni         BB6_46;
+
+BB6_46:
        shr.s32         %r36, %r9, 31;
        and.b32         %r37, %r36, -2146435072;
-       add.s32         %r38, %r37, 2146435072;
-       or.b32          %r39, %r38, -2147483648;
-       selp.b32        %r40, %r39, %r38, %p1;
-       mov.u32         %r41, 0;
-       mov.b64         %fd36, {%r41, %r40};
-       bra.uni         BB10_50;
-
-BB10_49:
-       setp.gt.f64     %p46, %fd11, 0d3FF0000000000000;
-       selp.b32        %r42, 2146435072, 0, %p46;
-       xor.b32         %r43, %r42, 2146435072;
-       setp.lt.s32     %p47, %r9, 0;
-       selp.b32        %r44, %r43, %r42, %p47;
-       setp.eq.f64     %p48, %fd1, 0dBFF0000000000000;
-       selp.b32        %r45, 1072693248, %r44, %p48;
-       mov.u32         %r46, 0;
-       mov.b64         %fd36, {%r46, %r45};
-
-BB10_50:
-       setp.eq.f64     %p49, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p50, %fd1, 0d3FF0000000000000;
-       or.pred         %p51, %p50, %p49;
-       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p51;
-
-BB10_54:
+       selp.b32        %r38, -1048576, 2146435072, %p1;
+       add.s32         %r39, %r38, %r37;
+       mov.u32         %r40, 0;
+       mov.b64         %fd35, {%r40, %r39};
+
+BB6_48:
+       setp.eq.f64     %p51, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p52, %fd1, 0d3FF0000000000000;
+       or.pred         %p53, %p52, %p51;
+       selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p53;
+
+BB6_52:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
-       st.global.f64   [%rd14], %fd39;
+       st.global.f64   [%rd14], %fd38;
 
-BB10_55:
+BB6_53:
        ret;
 }
 
@@ -1481,9 +704,9 @@ BB10_55:
        .param .u32 binCellScalarOp_param_6
 )
 {
-       .reg .pred      %p<89>;
-       .reg .b32       %r<71>;
-       .reg .f64       %fd<77>;
+       .reg .pred      %p<93>;
+       .reg .b32       %r<69>;
+       .reg .f64       %fd<75>;
        .reg .b64       %rd<12>;
 
 
@@ -1505,7 +728,7 @@ BB10_55:
        mad.lo.s32      %r1, %r14, %r15, %r17;
        mul.lo.s32      %r18, %r9, %r8;
        setp.ge.s32     %p3, %r1, %r18;
-       @%p3 bra        BB11_92;
+       @%p3 bra        BB7_88;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -1514,178 +737,178 @@ BB10_55:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB11_47;
+       @%p4 bra        BB7_45;
 
        setp.eq.s32     %p5, %r6, 0;
-       @%p5 bra        BB11_45;
+       @%p5 bra        BB7_43;
 
-       mov.f64         %fd67, 0dC08F380000000000;
+       mov.f64         %fd66, 0dC08F380000000000;
        setp.gt.s32     %p6, %r6, 6;
-       @%p6 bra        BB11_13;
+       @%p6 bra        BB7_13;
 
        setp.gt.s32     %p14, %r6, 3;
-       @%p14 bra       BB11_9;
+       @%p14 bra       BB7_9;
 
        setp.eq.s32     %p18, %r6, 1;
-       @%p18 bra       BB11_44;
+       @%p18 bra       BB7_42;
 
        setp.eq.s32     %p19, %r6, 2;
-       @%p19 bra       BB11_43;
-       bra.uni         BB11_7;
+       @%p19 bra       BB7_41;
+       bra.uni         BB7_7;
 
-BB11_43:
-       mul.f64         %fd67, %fd1, %fd52;
-       bra.uni         BB11_46;
+BB7_41:
+       mul.f64         %fd66, %fd1, %fd52;
+       bra.uni         BB7_44;
 
-BB11_47:
-       setp.eq.s32     %p47, %r6, 0;
-       @%p47 bra       BB11_90;
+BB7_45:
+       setp.eq.s32     %p49, %r6, 0;
+       @%p49 bra       BB7_86;
 
-       mov.f64         %fd76, 0dC08F380000000000;
-       setp.gt.s32     %p48, %r6, 6;
-       @%p48 bra       BB11_58;
+       mov.f64         %fd74, 0dC08F380000000000;
+       setp.gt.s32     %p50, %r6, 6;
+       @%p50 bra       BB7_56;
 
-       setp.gt.s32     %p56, %r6, 3;
-       @%p56 bra       BB11_54;
+       setp.gt.s32     %p58, %r6, 3;
+       @%p58 bra       BB7_52;
 
-       setp.eq.s32     %p60, %r6, 1;
-       @%p60 bra       BB11_89;
+       setp.eq.s32     %p62, %r6, 1;
+       @%p62 bra       BB7_85;
 
-       setp.eq.s32     %p61, %r6, 2;
-       @%p61 bra       BB11_88;
-       bra.uni         BB11_52;
+       setp.eq.s32     %p63, %r6, 2;
+       @%p63 bra       BB7_84;
+       bra.uni         BB7_50;
 
-BB11_88:
-       mul.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB11_91;
+BB7_84:
+       mul.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB7_87;
 
-BB11_45:
-       add.f64         %fd67, %fd1, %fd52;
+BB7_43:
+       add.f64         %fd66, %fd1, %fd52;
 
-BB11_46:
-       st.global.f64   [%rd1], %fd67;
-       bra.uni         BB11_92;
+BB7_44:
+       st.global.f64   [%rd1], %fd66;
+       bra.uni         BB7_88;
 
-BB11_13:
+BB7_13:
        setp.gt.s32     %p7, %r6, 9;
-       @%p7 bra        BB11_18;
+       @%p7 bra        BB7_18;
 
        setp.eq.s32     %p11, %r6, 7;
-       @%p11 bra       BB11_25;
+       @%p11 bra       BB7_25;
 
        setp.eq.s32     %p12, %r6, 8;
-       @%p12 bra       BB11_24;
-       bra.uni         BB11_16;
+       @%p12 bra       BB7_24;
+       bra.uni         BB7_16;
 
-BB11_24:
+BB7_24:
        setp.le.f64     %p23, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
+       bra.uni         BB7_44;
 
-BB11_90:
-       add.f64         %fd76, %fd1, %fd52;
+BB7_86:
+       add.f64         %fd74, %fd1, %fd52;
 
-BB11_91:
-       st.global.f64   [%rd1], %fd76;
+BB7_87:
+       st.global.f64   [%rd1], %fd74;
 
-BB11_92:
+BB7_88:
        ret;
 
-BB11_58:
-       setp.gt.s32     %p49, %r6, 9;
-       @%p49 bra       BB11_63;
+BB7_56:
+       setp.gt.s32     %p51, %r6, 9;
+       @%p51 bra       BB7_61;
 
-       setp.eq.s32     %p53, %r6, 7;
-       @%p53 bra       BB11_70;
+       setp.eq.s32     %p55, %r6, 7;
+       @%p55 bra       BB7_68;
 
-       setp.eq.s32     %p54, %r6, 8;
-       @%p54 bra       BB11_69;
-       bra.uni         BB11_61;
+       setp.eq.s32     %p56, %r6, 8;
+       @%p56 bra       BB7_67;
+       bra.uni         BB7_59;
 
-BB11_69:
-       setp.ge.f64     %p65, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
-       bra.uni         BB11_91;
+BB7_67:
+       setp.ge.f64     %p67, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
+       bra.uni         BB7_87;
 
-BB11_9:
+BB7_9:
        setp.eq.s32     %p15, %r6, 4;
-       @%p15 bra       BB11_27;
+       @%p15 bra       BB7_27;
 
        setp.eq.s32     %p16, %r6, 5;
-       @%p16 bra       BB11_26;
-       bra.uni         BB11_11;
+       @%p16 bra       BB7_26;
+       bra.uni         BB7_11;
 
-BB11_26:
+BB7_26:
        setp.gt.f64     %p26, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB7_44;
 
-BB11_18:
+BB7_18:
        setp.eq.s32     %p8, %r6, 10;
-       @%p8 bra        BB11_23;
+       @%p8 bra        BB7_23;
 
        setp.eq.s32     %p9, %r6, 11;
-       @%p9 bra        BB11_22;
-       bra.uni         BB11_20;
+       @%p9 bra        BB7_22;
+       bra.uni         BB7_20;
 
-BB11_22:
-       min.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB11_46;
+BB7_22:
+       min.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB7_44;
 
-BB11_54:
-       setp.eq.s32     %p57, %r6, 4;
-       @%p57 bra       BB11_72;
+BB7_52:
+       setp.eq.s32     %p59, %r6, 4;
+       @%p59 bra       BB7_70;
 
-       setp.eq.s32     %p58, %r6, 5;
-       @%p58 bra       BB11_71;
-       bra.uni         BB11_56;
+       setp.eq.s32     %p60, %r6, 5;
+       @%p60 bra       BB7_69;
+       bra.uni         BB7_54;
 
-BB11_71:
-       setp.lt.f64     %p68, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
-       bra.uni         BB11_91;
+BB7_69:
+       setp.lt.f64     %p70, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
+       bra.uni         BB7_87;
 
-BB11_63:
-       setp.eq.s32     %p50, %r6, 10;
-       @%p50 bra       BB11_68;
+BB7_61:
+       setp.eq.s32     %p52, %r6, 10;
+       @%p52 bra       BB7_66;
 
-       setp.eq.s32     %p51, %r6, 11;
-       @%p51 bra       BB11_67;
-       bra.uni         BB11_65;
+       setp.eq.s32     %p53, %r6, 11;
+       @%p53 bra       BB7_65;
+       bra.uni         BB7_63;
 
-BB11_67:
-       min.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB11_91;
+BB7_65:
+       min.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB7_87;
 
-BB11_44:
-       sub.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB11_46;
+BB7_42:
+       sub.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB7_44;
 
-BB11_7:
+BB7_7:
        setp.eq.s32     %p20, %r6, 3;
-       @%p20 bra       BB11_8;
-       bra.uni         BB11_46;
+       @%p20 bra       BB7_8;
+       bra.uni         BB7_44;
 
-BB11_8:
-       div.rn.f64      %fd67, %fd52, %fd1;
-       bra.uni         BB11_46;
+BB7_8:
+       div.rn.f64      %fd66, %fd52, %fd1;
+       bra.uni         BB7_44;
 
-BB11_25:
+BB7_25:
        setp.lt.f64     %p24, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
+       bra.uni         BB7_44;
 
-BB11_16:
+BB7_16:
        setp.eq.s32     %p13, %r6, 9;
-       @%p13 bra       BB11_17;
-       bra.uni         BB11_46;
+       @%p13 bra       BB7_17;
+       bra.uni         BB7_44;
 
-BB11_17:
+BB7_17:
        setp.eq.f64     %p22, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
+       bra.uni         BB7_44;
 
-BB11_27:
+BB7_27:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd52;
@@ -1700,7 +923,7 @@ BB11_27:
        shl.b64         %rd2, %rd10, %r20;
        setp.eq.s64     %p27, %rd2, -9223372036854775808;
        abs.f64         %fd10, %fd52;
-       // Callseq Start 2
+       // Callseq Start 1
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -1715,93 +938,93 @@ BB11_27:
        param0, 
        param1
        );
-       ld.param.f64    %fd66, [retval0+0];
+       ld.param.f64    %fd65, [retval0+0];
        
        //{
-       }// Callseq End 2
+       }// Callseq End 1
        setp.lt.s32     %p28, %r2, 0;
        and.pred        %p1, %p28, %p27;
-       @!%p1 bra       BB11_29;
-       bra.uni         BB11_28;
+       @!%p1 bra       BB7_29;
+       bra.uni         BB7_28;
 
-BB11_28:
+BB7_28:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r21}, %fd66;
+       mov.b64         {%temp, %r21}, %fd65;
        }
        xor.b32         %r22, %r21, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r23, %temp}, %fd66;
+       mov.b64         {%r23, %temp}, %fd65;
        }
-       mov.b64         %fd66, {%r23, %r22};
+       mov.b64         %fd65, {%r23, %r22};
 
-BB11_29:
-       mov.f64         %fd65, %fd66;
+BB7_29:
+       mov.f64         %fd64, %fd65;
        setp.eq.f64     %p29, %fd52, 0d0000000000000000;
-       @%p29 bra       BB11_32;
-       bra.uni         BB11_30;
+       @%p29 bra       BB7_32;
+       bra.uni         BB7_30;
 
-BB11_32:
+BB7_32:
        selp.b32        %r24, %r2, 0, %p27;
        or.b32          %r25, %r24, 2146435072;
        setp.lt.s32     %p33, %r3, 0;
        selp.b32        %r26, %r25, %r24, %p33;
        mov.u32         %r27, 0;
-       mov.b64         %fd65, {%r27, %r26};
-       bra.uni         BB11_33;
+       mov.b64         %fd64, {%r27, %r26};
+       bra.uni         BB7_33;
 
-BB11_11:
+BB7_11:
        setp.eq.s32     %p17, %r6, 6;
-       @%p17 bra       BB11_12;
-       bra.uni         BB11_46;
+       @%p17 bra       BB7_12;
+       bra.uni         BB7_44;
 
-BB11_12:
+BB7_12:
        setp.ge.f64     %p25, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
+       bra.uni         BB7_44;
 
-BB11_23:
+BB7_23:
        setp.neu.f64    %p21, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
-       bra.uni         BB11_46;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21;
+       bra.uni         BB7_44;
 
-BB11_20:
+BB7_20:
        setp.ne.s32     %p10, %r6, 12;
-       @%p10 bra       BB11_46;
+       @%p10 bra       BB7_44;
 
-       max.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB11_46;
+       max.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB7_44;
 
-BB11_89:
-       sub.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB11_91;
+BB7_85:
+       sub.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB7_87;
 
-BB11_52:
-       setp.eq.s32     %p62, %r6, 3;
-       @%p62 bra       BB11_53;
-       bra.uni         BB11_91;
+BB7_50:
+       setp.eq.s32     %p64, %r6, 3;
+       @%p64 bra       BB7_51;
+       bra.uni         BB7_87;
 
-BB11_53:
-       div.rn.f64      %fd76, %fd1, %fd52;
-       bra.uni         BB11_91;
+BB7_51:
+       div.rn.f64      %fd74, %fd1, %fd52;
+       bra.uni         BB7_87;
 
-BB11_70:
-       setp.gt.f64     %p66, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
-       bra.uni         BB11_91;
+BB7_68:
+       setp.gt.f64     %p68, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
+       bra.uni         BB7_87;
 
-BB11_61:
-       setp.eq.s32     %p55, %r6, 9;
-       @%p55 bra       BB11_62;
-       bra.uni         BB11_91;
+BB7_59:
+       setp.eq.s32     %p57, %r6, 9;
+       @%p57 bra       BB7_60;
+       bra.uni         BB7_87;
 
-BB11_62:
-       setp.eq.f64     %p64, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
-       bra.uni         BB11_91;
+BB7_60:
+       setp.eq.f64     %p66, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66;
+       bra.uni         BB7_87;
 
-BB11_72:
+BB7_70:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -1810,13 +1033,13 @@ BB11_72:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd52;
        }
-       bfe.u32         %r45, %r5, 20, 11;
-       add.s32         %r46, %r45, -1012;
+       bfe.u32         %r44, %r5, 20, 11;
+       add.s32         %r45, %r44, -1012;
        mov.b64          %rd11, %fd52;
-       shl.b64         %rd3, %rd11, %r46;
-       setp.eq.s64     %p69, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd11, %r45;
+       setp.eq.s64     %p71, %rd3, -9223372036854775808;
        abs.f64         %fd35, %fd1;
-       // Callseq Start 3
+       // Callseq Start 2
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -1831,74 +1054,74 @@ BB11_72:
        param0, 
        param1
        );
-       ld.param.f64    %fd75, [retval0+0];
+       ld.param.f64    %fd73, [retval0+0];
        
        //{
-       }// Callseq End 3
-       setp.lt.s32     %p70, %r4, 0;
-       and.pred        %p2, %p70, %p69;
-       @!%p2 bra       BB11_74;
-       bra.uni         BB11_73;
+       }// Callseq End 2
+       setp.lt.s32     %p72, %r4, 0;
+       and.pred        %p2, %p72, %p71;
+       @!%p2 bra       BB7_72;
+       bra.uni         BB7_71;
 
-BB11_73:
+BB7_71:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r47}, %fd75;
+       mov.b64         {%temp, %r46}, %fd73;
        }
-       xor.b32         %r48, %r47, -2147483648;
+       xor.b32         %r47, %r46, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r49, %temp}, %fd75;
+       mov.b64         {%r48, %temp}, %fd73;
        }
-       mov.b64         %fd75, {%r49, %r48};
-
-BB11_74:
-       mov.f64         %fd74, %fd75;
-       setp.eq.f64     %p71, %fd1, 0d0000000000000000;
-       @%p71 bra       BB11_77;
-       bra.uni         BB11_75;
-
-BB11_77:
-       selp.b32        %r50, %r4, 0, %p69;
-       or.b32          %r51, %r50, 2146435072;
-       setp.lt.s32     %p75, %r5, 0;
-       selp.b32        %r52, %r51, %r50, %p75;
-       mov.u32         %r53, 0;
-       mov.b64         %fd74, {%r53, %r52};
-       bra.uni         BB11_78;
-
-BB11_56:
-       setp.eq.s32     %p59, %r6, 6;
-       @%p59 bra       BB11_57;
-       bra.uni         BB11_91;
-
-BB11_57:
-       setp.le.f64     %p67, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
-       bra.uni         BB11_91;
-
-BB11_68:
-       setp.neu.f64    %p63, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
-       bra.uni         BB11_91;
-
-BB11_65:
-       setp.ne.s32     %p52, %r6, 12;
-       @%p52 bra       BB11_91;
-
-       max.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB11_91;
-
-BB11_30:
+       mov.b64         %fd73, {%r48, %r47};
+
+BB7_72:
+       mov.f64         %fd72, %fd73;
+       setp.eq.f64     %p73, %fd1, 0d0000000000000000;
+       @%p73 bra       BB7_75;
+       bra.uni         BB7_73;
+
+BB7_75:
+       selp.b32        %r49, %r4, 0, %p71;
+       or.b32          %r50, %r49, 2146435072;
+       setp.lt.s32     %p77, %r5, 0;
+       selp.b32        %r51, %r50, %r49, %p77;
+       mov.u32         %r52, 0;
+       mov.b64         %fd72, {%r52, %r51};
+       bra.uni         BB7_76;
+
+BB7_54:
+       setp.eq.s32     %p61, %r6, 6;
+       @%p61 bra       BB7_55;
+       bra.uni         BB7_87;
+
+BB7_55:
+       setp.le.f64     %p69, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
+       bra.uni         BB7_87;
+
+BB7_66:
+       setp.neu.f64    %p65, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65;
+       bra.uni         BB7_87;
+
+BB7_63:
+       setp.ne.s32     %p54, %r6, 12;
+       @%p54 bra       BB7_87;
+
+       max.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB7_87;
+
+BB7_30:
        setp.gt.s32     %p30, %r2, -1;
-       @%p30 bra       BB11_33;
+       @%p30 bra       BB7_33;
 
        cvt.rzi.f64.f64 %fd54, %fd1;
        setp.neu.f64    %p31, %fd54, %fd1;
-       selp.f64        %fd65, 0dFFF8000000000000, %fd65, %p31;
+       selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p31;
 
-BB11_33:
-       mov.f64         %fd16, %fd65;
+BB7_33:
+       mov.f64         %fd16, %fd64;
        add.f64         %fd17, %fd1, %fd52;
        {
        .reg .b32 %temp; 
@@ -1906,154 +1129,150 @@ BB11_33:
        }
        and.b32         %r29, %r28, 2146435072;
        setp.ne.s32     %p34, %r29, 2146435072;
-       mov.f64         %fd64, %fd16;
-       @%p34 bra       BB11_42;
+       mov.f64         %fd63, %fd16;
+       @%p34 bra       BB7_40;
 
        setp.gtu.f64    %p35, %fd10, 0d7FF0000000000000;
-       mov.f64         %fd64, %fd17;
-       @%p35 bra       BB11_42;
+       mov.f64         %fd63, %fd17;
+       @%p35 bra       BB7_40;
 
        abs.f64         %fd55, %fd1;
        setp.gtu.f64    %p36, %fd55, 0d7FF0000000000000;
-       mov.f64         %fd63, %fd17;
-       mov.f64         %fd64, %fd63;
-       @%p36 bra       BB11_42;
-
-       and.b32         %r30, %r3, 2147483647;
-       setp.ne.s32     %p37, %r30, 2146435072;
-       @%p37 bra       BB11_38;
-
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r31, %temp}, %fd1;
-       }
-       setp.eq.s32     %p38, %r31, 0;
-       @%p38 bra       BB11_41;
-
-BB11_38:
-       and.b32         %r32, %r2, 2147483647;
-       setp.ne.s32     %p39, %r32, 2146435072;
-       mov.f64         %fd61, %fd16;
-       mov.f64         %fd64, %fd61;
-       @%p39 bra       BB11_42;
+       mov.f64         %fd62, %fd17;
+       mov.f64         %fd63, %fd62;
+       @%p36 bra       BB7_40;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r33, %temp}, %fd52;
+       mov.b64         {%r30, %temp}, %fd1;
        }
-       setp.ne.s32     %p40, %r33, 0;
-       mov.f64         %fd64, %fd16;
-       @%p40 bra       BB11_42;
-
-       shr.s32         %r34, %r3, 31;
-       and.b32         %r35, %r34, -2146435072;
-       add.s32         %r36, %r35, 2146435072;
-       or.b32          %r37, %r36, -2147483648;
-       selp.b32        %r38, %r37, %r36, %p1;
-       mov.u32         %r39, 0;
-       mov.b64         %fd64, {%r39, %r38};
-       bra.uni         BB11_42;
-
-BB11_75:
-       setp.gt.s32     %p72, %r4, -1;
-       @%p72 bra       BB11_78;
+       and.b32         %r31, %r3, 2147483647;
+       setp.eq.s32     %p37, %r31, 2146435072;
+       setp.eq.s32     %p38, %r30, 0;
+       and.pred        %p39, %p37, %p38;
+       @%p39 bra       BB7_39;
+       bra.uni         BB7_37;
+
+BB7_39:
+       setp.gt.f64     %p43, %fd10, 0d3FF0000000000000;
+       selp.b32        %r39, 2146435072, 0, %p43;
+       xor.b32         %r40, %r39, 2146435072;
+       setp.lt.s32     %p44, %r3, 0;
+       selp.b32        %r41, %r40, %r39, %p44;
+       setp.eq.f64     %p45, %fd52, 0dBFF0000000000000;
+       selp.b32        %r42, 1072693248, %r41, %p45;
+       mov.u32         %r43, 0;
+       mov.b64         %fd63, {%r43, %r42};
+       bra.uni         BB7_40;
+
+BB7_73:
+       setp.gt.s32     %p74, %r4, -1;
+       @%p74 bra       BB7_76;
 
        cvt.rzi.f64.f64 %fd57, %fd52;
-       setp.neu.f64    %p73, %fd57, %fd52;
-       selp.f64        %fd74, 0dFFF8000000000000, %fd74, %p73;
+       setp.neu.f64    %p75, %fd57, %fd52;
+       selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p75;
 
-BB11_78:
-       mov.f64         %fd41, %fd74;
+BB7_76:
+       mov.f64         %fd41, %fd72;
        add.f64         %fd42, %fd1, %fd52;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r54}, %fd42;
+       mov.b64         {%temp, %r53}, %fd42;
        }
-       and.b32         %r55, %r54, 2146435072;
-       setp.ne.s32     %p76, %r55, 2146435072;
-       mov.f64         %fd73, %fd41;
-       @%p76 bra       BB11_87;
+       and.b32         %r54, %r53, 2146435072;
+       setp.ne.s32     %p78, %r54, 2146435072;
+       mov.f64         %fd71, %fd41;
+       @%p78 bra       BB7_83;
 
-       setp.gtu.f64    %p77, %fd35, 0d7FF0000000000000;
-       mov.f64         %fd73, %fd42;
-       @%p77 bra       BB11_87;
+       setp.gtu.f64    %p79, %fd35, 0d7FF0000000000000;
+       mov.f64         %fd71, %fd42;
+       @%p79 bra       BB7_83;
 
        abs.f64         %fd58, %fd52;
-       setp.gtu.f64    %p78, %fd58, 0d7FF0000000000000;
-       mov.f64         %fd72, %fd42;
-       mov.f64         %fd73, %fd72;
-       @%p78 bra       BB11_87;
+       setp.gtu.f64    %p80, %fd58, 0d7FF0000000000000;
+       mov.f64         %fd70, %fd42;
+       mov.f64         %fd71, %fd70;
+       @%p80 bra       BB7_83;
 
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r55, %temp}, %fd52;
+       }
        and.b32         %r56, %r5, 2147483647;
-       setp.ne.s32     %p79, %r56, 2146435072;
-       @%p79 bra       BB11_83;
-
+       setp.eq.s32     %p81, %r56, 2146435072;
+       setp.eq.s32     %p82, %r55, 0;
+       and.pred        %p83, %p81, %p82;
+       @%p83 bra       BB7_82;
+       bra.uni         BB7_80;
+
+BB7_82:
+       setp.gt.f64     %p87, %fd35, 0d3FF0000000000000;
+       selp.b32        %r64, 2146435072, 0, %p87;
+       xor.b32         %r65, %r64, 2146435072;
+       setp.lt.s32     %p88, %r5, 0;
+       selp.b32        %r66, %r65, %r64, %p88;
+       setp.eq.f64     %p89, %fd1, 0dBFF0000000000000;
+       selp.b32        %r67, 1072693248, %r66, %p89;
+       mov.u32         %r68, 0;
+       mov.b64         %fd71, {%r68, %r67};
+       bra.uni         BB7_83;
+
+BB7_37:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r57, %temp}, %fd52;
+       mov.b64         {%r32, %temp}, %fd52;
        }
-       setp.eq.s32     %p80, %r57, 0;
-       @%p80 bra       BB11_86;
-
-BB11_83:
-       and.b32         %r58, %r4, 2147483647;
-       setp.ne.s32     %p81, %r58, 2146435072;
-       mov.f64         %fd70, %fd41;
-       mov.f64         %fd73, %fd70;
-       @%p81 bra       BB11_87;
-
+       and.b32         %r33, %r2, 2147483647;
+       setp.eq.s32     %p40, %r33, 2146435072;
+       setp.eq.s32     %p41, %r32, 0;
+       and.pred        %p42, %p40, %p41;
+       mov.f64         %fd63, %fd16;
+       @!%p42 bra      BB7_40;
+       bra.uni         BB7_38;
+
+BB7_38:
+       shr.s32         %r34, %r3, 31;
+       and.b32         %r35, %r34, -2146435072;
+       selp.b32        %r36, -1048576, 2146435072, %p1;
+       add.s32         %r37, %r36, %r35;
+       mov.u32         %r38, 0;
+       mov.b64         %fd63, {%r38, %r37};
+
+BB7_40:
+       setp.eq.f64     %p46, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p47, %fd52, 0d3FF0000000000000;
+       or.pred         %p48, %p47, %p46;
+       selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p48;
+       bra.uni         BB7_44;
+
+BB7_80:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r59, %temp}, %fd1;
+       mov.b64         {%r57, %temp}, %fd1;
        }
-       setp.ne.s32     %p82, %r59, 0;
-       mov.f64         %fd73, %fd41;
-       @%p82 bra       BB11_87;
-
-       shr.s32         %r60, %r5, 31;
-       and.b32         %r61, %r60, -2146435072;
-       add.s32         %r62, %r61, 2146435072;
-       or.b32          %r63, %r62, -2147483648;
-       selp.b32        %r64, %r63, %r62, %p2;
-       mov.u32         %r65, 0;
-       mov.b64         %fd73, {%r65, %r64};
-       bra.uni         BB11_87;
-
-BB11_41:
-       setp.gt.f64     %p41, %fd10, 0d3FF0000000000000;
-       selp.b32        %r40, 2146435072, 0, %p41;
-       xor.b32         %r41, %r40, 2146435072;
-       setp.lt.s32     %p42, %r3, 0;
-       selp.b32        %r42, %r41, %r40, %p42;
-       setp.eq.f64     %p43, %fd52, 0dBFF0000000000000;
-       selp.b32        %r43, 1072693248, %r42, %p43;
-       mov.u32         %r44, 0;
-       mov.b64         %fd64, {%r44, %r43};
-
-BB11_42:
-       setp.eq.f64     %p44, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p45, %fd52, 0d3FF0000000000000;
-       or.pred         %p46, %p45, %p44;
-       selp.f64        %fd67, 0d3FF0000000000000, %fd64, %p46;
-       bra.uni         BB11_46;
-
-BB11_86:
-       setp.gt.f64     %p83, %fd35, 0d3FF0000000000000;
-       selp.b32        %r66, 2146435072, 0, %p83;
-       xor.b32         %r67, %r66, 2146435072;
-       setp.lt.s32     %p84, %r5, 0;
-       selp.b32        %r68, %r67, %r66, %p84;
-       setp.eq.f64     %p85, %fd1, 0dBFF0000000000000;
-       selp.b32        %r69, 1072693248, %r68, %p85;
-       mov.u32         %r70, 0;
-       mov.b64         %fd73, {%r70, %r69};
-
-BB11_87:
-       setp.eq.f64     %p86, %fd52, 0d0000000000000000;
-       setp.eq.f64     %p87, %fd1, 0d3FF0000000000000;
-       or.pred         %p88, %p87, %p86;
-       selp.f64        %fd76, 0d3FF0000000000000, %fd73, %p88;
-       bra.uni         BB11_91;
+       and.b32         %r58, %r4, 2147483647;
+       setp.eq.s32     %p84, %r58, 2146435072;
+       setp.eq.s32     %p85, %r57, 0;
+       and.pred        %p86, %p84, %p85;
+       mov.f64         %fd71, %fd41;
+       @!%p86 bra      BB7_83;
+       bra.uni         BB7_81;
+
+BB7_81:
+       shr.s32         %r59, %r5, 31;
+       and.b32         %r60, %r59, -2146435072;
+       selp.b32        %r61, -1048576, 2146435072, %p2;
+       add.s32         %r62, %r61, %r60;
+       mov.u32         %r63, 0;
+       mov.b64         %fd71, {%r63, %r62};
+
+BB7_83:
+       setp.eq.f64     %p90, %fd52, 0d0000000000000000;
+       setp.eq.f64     %p91, %fd1, 0d3FF0000000000000;
+       or.pred         %p92, %p91, %p90;
+       selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p92;
+       bra.uni         BB7_87;
 }
 
        // .globl       fill
@@ -2077,14 +1296,14 @@ BB11_87:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB12_2;
+       @%p1 bra        BB8_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB12_2:
+BB8_2:
        ret;
 }
 
@@ -2108,17 +1327,17 @@ BB12_2:
        ld.param.u32    %r4, [reduce_row_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB13_31;
+       @%p1 bra        BB9_31;
 
        mov.u32         %r35, %tid.x;
        mov.f64         %fd63, 0d0000000000000000;
        mov.f64         %fd64, %fd63;
        setp.ge.u32     %p2, %r35, %r4;
-       @%p2 bra        BB13_4;
+       @%p2 bra        BB9_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB13_3:
+BB9_3:
        mad.lo.s32      %r8, %r6, %r4, %r35;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -2128,9 +1347,9 @@ BB13_3:
        add.s32         %r35, %r9, %r35;
        setp.lt.u32     %p3, %r35, %r4;
        mov.f64         %fd63, %fd64;
-       @%p3 bra        BB13_3;
+       @%p3 bra        BB9_3;
 
-BB13_4:
+BB9_4:
        mov.f64         %fd61, %fd63;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -2140,113 +1359,113 @@ BB13_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 512;
-       @%p4 bra        BB13_8;
+       @%p4 bra        BB9_8;
 
        setp.gt.u32     %p5, %r10, 255;
        mov.f64         %fd62, %fd61;
-       @%p5 bra        BB13_7;
+       @%p5 bra        BB9_7;
 
        ld.shared.f64   %fd26, [%rd8+2048];
        add.f64         %fd62, %fd61, %fd26;
        st.shared.f64   [%rd8], %fd62;
 
-BB13_7:
+BB9_7:
        mov.f64         %fd61, %fd62;
        bar.sync        0;
 
-BB13_8:
+BB9_8:
        mov.f64         %fd59, %fd61;
        setp.lt.u32     %p6, %r11, 256;
-       @%p6 bra        BB13_12;
+       @%p6 bra        BB9_12;
 
        setp.gt.u32     %p7, %r10, 127;
        mov.f64         %fd60, %fd59;
-       @%p7 bra        BB13_11;
+       @%p7 bra        BB9_11;
 
        ld.shared.f64   %fd27, [%rd8+1024];
        add.f64         %fd60, %fd59, %fd27;
        st.shared.f64   [%rd8], %fd60;
 
-BB13_11:
+BB9_11:
        mov.f64         %fd59, %fd60;
        bar.sync        0;
 
-BB13_12:
+BB9_12:
        mov.f64         %fd57, %fd59;
        setp.lt.u32     %p8, %r11, 128;
-       @%p8 bra        BB13_16;
+       @%p8 bra        BB9_16;
 
        setp.gt.u32     %p9, %r10, 63;
        mov.f64         %fd58, %fd57;
-       @%p9 bra        BB13_15;
+       @%p9 bra        BB9_15;
 
        ld.shared.f64   %fd28, [%rd8+512];
        add.f64         %fd58, %fd57, %fd28;
        st.shared.f64   [%rd8], %fd58;
 
-BB13_15:
+BB9_15:
        mov.f64         %fd57, %fd58;
        bar.sync        0;
 
-BB13_16:
+BB9_16:
        mov.f64         %fd56, %fd57;
        setp.gt.u32     %p10, %r10, 31;
-       @%p10 bra       BB13_29;
+       @%p10 bra       BB9_29;
 
        setp.lt.u32     %p11, %r11, 64;
-       @%p11 bra       BB13_19;
+       @%p11 bra       BB9_19;
 
        ld.volatile.shared.f64  %fd29, [%rd8+256];
        add.f64         %fd56, %fd56, %fd29;
        st.volatile.shared.f64  [%rd8], %fd56;
 
-BB13_19:
+BB9_19:
        mov.f64         %fd55, %fd56;
        setp.lt.u32     %p12, %r11, 32;
-       @%p12 bra       BB13_21;
+       @%p12 bra       BB9_21;
 
        ld.volatile.shared.f64  %fd30, [%rd8+128];
        add.f64         %fd55, %fd55, %fd30;
        st.volatile.shared.f64  [%rd8], %fd55;
 
-BB13_21:
+BB9_21:
        mov.f64         %fd54, %fd55;
        setp.lt.u32     %p13, %r11, 16;
-       @%p13 bra       BB13_23;
+       @%p13 bra       BB9_23;
 
        ld.volatile.shared.f64  %fd31, [%rd8+64];
        add.f64         %fd54, %fd54, %fd31;
        st.volatile.shared.f64  [%rd8], %fd54;
 
-BB13_23:
+BB9_23:
        mov.f64         %fd53, %fd54;
        setp.lt.u32     %p14, %r11, 8;
-       @%p14 bra       BB13_25;
+       @%p14 bra       BB9_25;
 
        ld.volatile.shared.f64  %fd32, [%rd8+32];
        add.f64         %fd53, %fd53, %fd32;
        st.volatile.shared.f64  [%rd8], %fd53;
 
-BB13_25:
+BB9_25:
        mov.f64         %fd52, %fd53;
        setp.lt.u32     %p15, %r11, 4;
-       @%p15 bra       BB13_27;
+       @%p15 bra       BB9_27;
 
        ld.volatile.shared.f64  %fd33, [%rd8+16];
        add.f64         %fd52, %fd52, %fd33;
        st.volatile.shared.f64  [%rd8], %fd52;
 
-BB13_27:
+BB9_27:
        setp.lt.u32     %p16, %r11, 2;
-       @%p16 bra       BB13_29;
+       @%p16 bra       BB9_29;
 
        ld.volatile.shared.f64  %fd34, [%rd8+8];
        add.f64         %fd35, %fd52, %fd34;
        st.volatile.shared.f64  [%rd8], %fd35;
 
-BB13_29:
+BB9_29:
        setp.ne.s32     %p17, %r10, 0;
-       @%p17 bra       BB13_31;
+       @%p17 bra       BB9_31;
 
        ld.shared.f64   %fd36, [sdata];
        cvta.to.global.u64      %rd36, %rd2;
@@ -2254,7 +1473,7 @@ BB13_29:
        add.s64         %rd38, %rd36, %rd37;
        st.global.f64   [%rd38], %fd36;
 
-BB13_31:
+BB9_31:
        ret;
 }
 
@@ -2281,18 +1500,18 @@ BB13_31:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB14_5;
+       @%p1 bra        BB10_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0d0000000000000000;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB14_4;
+       @%p2 bra        BB10_4;
 
        mov.u32         %r10, %r1;
 
-BB14_3:
+BB10_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -2302,15 +1521,15 @@ BB14_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB14_3;
+       @%p3 bra        BB10_3;
 
-BB14_4:
+BB10_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB14_5:
+BB10_5:
        ret;
 }
 
@@ -2338,9 +1557,9 @@ BB14_5:
        mov.f64         %fd67, 0d0000000000000000;
        mov.f64         %fd68, %fd67;
        setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB15_4;
+       @%p1 bra        BB11_4;
 
-BB15_1:
+BB11_1:
        mov.f64         %fd1, %fd68;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r30, 8;
@@ -2349,23 +1568,23 @@ BB15_1:
        add.f64         %fd69, %fd1, %fd27;
        add.s32         %r3, %r30, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB15_3;
+       @%p2 bra        BB11_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd28, [%rd9];
        add.f64         %fd69, %fd69, %fd28;
 
-BB15_3:
+BB11_3:
        mov.f64         %fd68, %fd69;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r30, %r12, %r13, %r30;
        setp.lt.u32     %p3, %r30, %r5;
        mov.f64         %fd67, %fd68;
-       @%p3 bra        BB15_1;
+       @%p3 bra        BB11_1;
 
-BB15_4:
+BB11_4:
        mov.f64         %fd65, %fd67;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2373,113 +1592,113 @@ BB15_4:
        st.shared.f64   [%rd1], %fd65;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB15_8;
+       @%p4 bra        BB11_8;
 
        setp.gt.u32     %p5, %r6, 255;
        mov.f64         %fd66, %fd65;
-       @%p5 bra        BB15_7;
+       @%p5 bra        BB11_7;
 
        ld.shared.f64   %fd29, [%rd1+2048];
        add.f64         %fd66, %fd65, %fd29;
        st.shared.f64   [%rd1], %fd66;
 
-BB15_7:
+BB11_7:
        mov.f64         %fd65, %fd66;
        bar.sync        0;
 
-BB15_8:
+BB11_8:
        mov.f64         %fd63, %fd65;
        setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB15_12;
+       @%p6 bra        BB11_12;
 
        setp.gt.u32     %p7, %r6, 127;
        mov.f64         %fd64, %fd63;
-       @%p7 bra        BB15_11;
+       @%p7 bra        BB11_11;
 
        ld.shared.f64   %fd30, [%rd1+1024];
        add.f64         %fd64, %fd63, %fd30;
        st.shared.f64   [%rd1], %fd64;
 
-BB15_11:
+BB11_11:
        mov.f64         %fd63, %fd64;
        bar.sync        0;
 
-BB15_12:
+BB11_12:
        mov.f64         %fd61, %fd63;
        setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB15_16;
+       @%p8 bra        BB11_16;
 
        setp.gt.u32     %p9, %r6, 63;
        mov.f64         %fd62, %fd61;
-       @%p9 bra        BB15_15;
+       @%p9 bra        BB11_15;
 
        ld.shared.f64   %fd31, [%rd1+512];
        add.f64         %fd62, %fd61, %fd31;
        st.shared.f64   [%rd1], %fd62;
 
-BB15_15:
+BB11_15:
        mov.f64         %fd61, %fd62;
        bar.sync        0;
 
-BB15_16:
+BB11_16:
        mov.f64         %fd60, %fd61;
        setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB15_29;
+       @%p10 bra       BB11_29;
 
        setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB15_19;
+       @%p11 bra       BB11_19;
 
        ld.volatile.shared.f64  %fd32, [%rd1+256];
        add.f64         %fd60, %fd60, %fd32;
        st.volatile.shared.f64  [%rd1], %fd60;
 
-BB15_19:
+BB11_19:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB15_21;
+       @%p12 bra       BB11_21;
 
        ld.volatile.shared.f64  %fd33, [%rd1+128];
        add.f64         %fd59, %fd59, %fd33;
        st.volatile.shared.f64  [%rd1], %fd59;
 
-BB15_21:
+BB11_21:
        mov.f64         %fd58, %fd59;
        setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB15_23;
+       @%p13 bra       BB11_23;
 
        ld.volatile.shared.f64  %fd34, [%rd1+64];
        add.f64         %fd58, %fd58, %fd34;
        st.volatile.shared.f64  [%rd1], %fd58;
 
-BB15_23:
+BB11_23:
        mov.f64         %fd57, %fd58;
        setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB15_25;
+       @%p14 bra       BB11_25;
 
        ld.volatile.shared.f64  %fd35, [%rd1+32];
        add.f64         %fd57, %fd57, %fd35;
        st.volatile.shared.f64  [%rd1], %fd57;
 
-BB15_25:
+BB11_25:
        mov.f64         %fd56, %fd57;
        setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB15_27;
+       @%p15 bra       BB11_27;
 
        ld.volatile.shared.f64  %fd36, [%rd1+16];
        add.f64         %fd56, %fd56, %fd36;
        st.volatile.shared.f64  [%rd1], %fd56;
 
-BB15_27:
+BB11_27:
        setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB15_29;
+       @%p16 bra       BB11_29;
 
        ld.volatile.shared.f64  %fd37, [%rd1+8];
        add.f64         %fd38, %fd56, %fd37;
        st.volatile.shared.f64  [%rd1], %fd38;
 
-BB15_29:
+BB11_29:
        setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB15_31;
+       @%p17 bra       BB11_31;
 
        ld.shared.f64   %fd39, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2487,7 +1706,7 @@ BB15_29:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd39;
 
-BB15_31:
+BB11_31:
        ret;
 }
 
@@ -2515,9 +1734,9 @@ BB15_31:
        mov.f64         %fd67, 0d0010000000000000;
        mov.f64         %fd68, %fd67;
        setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB16_4;
+       @%p1 bra        BB12_4;
 
-BB16_1:
+BB12_1:
        mov.f64         %fd1, %fd68;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r30, 8;
@@ -2526,23 +1745,23 @@ BB16_1:
        max.f64         %fd69, %fd1, %fd27;
        add.s32         %r3, %r30, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB16_3;
+       @%p2 bra        BB12_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd28, [%rd9];
        max.f64         %fd69, %fd69, %fd28;
 
-BB16_3:
+BB12_3:
        mov.f64         %fd68, %fd69;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r30, %r12, %r13, %r30;
        setp.lt.u32     %p3, %r30, %r5;
        mov.f64         %fd67, %fd68;
-       @%p3 bra        BB16_1;
+       @%p3 bra        BB12_1;
 
-BB16_4:
+BB12_4:
        mov.f64         %fd65, %fd67;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2550,113 +1769,113 @@ BB16_4:
        st.shared.f64   [%rd1], %fd65;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB16_8;
+       @%p4 bra        BB12_8;
 
        setp.gt.u32     %p5, %r6, 255;
        mov.f64         %fd66, %fd65;
-       @%p5 bra        BB16_7;
+       @%p5 bra        BB12_7;
 
        ld.shared.f64   %fd29, [%rd1+2048];
        max.f64         %fd66, %fd65, %fd29;
        st.shared.f64   [%rd1], %fd66;
 
-BB16_7:
+BB12_7:
        mov.f64         %fd65, %fd66;
        bar.sync        0;
 
-BB16_8:
+BB12_8:
        mov.f64         %fd63, %fd65;
        setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB16_12;
+       @%p6 bra        BB12_12;
 
        setp.gt.u32     %p7, %r6, 127;
        mov.f64         %fd64, %fd63;
-       @%p7 bra        BB16_11;
+       @%p7 bra        BB12_11;
 
        ld.shared.f64   %fd30, [%rd1+1024];
        max.f64         %fd64, %fd63, %fd30;
        st.shared.f64   [%rd1], %fd64;
 
-BB16_11:
+BB12_11:
        mov.f64         %fd63, %fd64;
        bar.sync        0;
 
-BB16_12:
+BB12_12:
        mov.f64         %fd61, %fd63;
        setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB16_16;
+       @%p8 bra        BB12_16;
 
        setp.gt.u32     %p9, %r6, 63;
        mov.f64         %fd62, %fd61;
-       @%p9 bra        BB16_15;
+       @%p9 bra        BB12_15;
 
        ld.shared.f64   %fd31, [%rd1+512];
        max.f64         %fd62, %fd61, %fd31;
        st.shared.f64   [%rd1], %fd62;
 
-BB16_15:
+BB12_15:
        mov.f64         %fd61, %fd62;
        bar.sync        0;
 
-BB16_16:
+BB12_16:
        mov.f64         %fd60, %fd61;
        setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB16_29;
+       @%p10 bra       BB12_29;
 
        setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB16_19;
+       @%p11 bra       BB12_19;
 
        ld.volatile.shared.f64  %fd32, [%rd1+256];
        max.f64         %fd60, %fd60, %fd32;
        st.volatile.shared.f64  [%rd1], %fd60;
 
-BB16_19:
+BB12_19:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB16_21;
+       @%p12 bra       BB12_21;
 
        ld.volatile.shared.f64  %fd33, [%rd1+128];
        max.f64         %fd59, %fd59, %fd33;
        st.volatile.shared.f64  [%rd1], %fd59;
 
-BB16_21:
+BB12_21:
        mov.f64         %fd58, %fd59;
        setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB16_23;
+       @%p13 bra       BB12_23;
 
        ld.volatile.shared.f64  %fd34, [%rd1+64];
        max.f64         %fd58, %fd58, %fd34;
        st.volatile.shared.f64  [%rd1], %fd58;
 
-BB16_23:
+BB12_23:
        mov.f64         %fd57, %fd58;
        setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB16_25;
+       @%p14 bra       BB12_25;
 
        ld.volatile.shared.f64  %fd35, [%rd1+32];
        max.f64         %fd57, %fd57, %fd35;
        st.volatile.shared.f64  [%rd1], %fd57;
 
-BB16_25:
+BB12_25:
        mov.f64         %fd56, %fd57;
        setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB16_27;
+       @%p15 bra       BB12_27;
 
        ld.volatile.shared.f64  %fd36, [%rd1+16];
        max.f64         %fd56, %fd56, %fd36;
        st.volatile.shared.f64  [%rd1], %fd56;
 
-BB16_27:
+BB12_27:
        setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB16_29;
+       @%p16 bra       BB12_29;
 
        ld.volatile.shared.f64  %fd37, [%rd1+8];
        max.f64         %fd38, %fd56, %fd37;
        st.volatile.shared.f64  [%rd1], %fd38;
 
-BB16_29:
+BB12_29:
        setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB16_31;
+       @%p17 bra       BB12_31;
 
        ld.shared.f64   %fd39, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2664,7 +1883,7 @@ BB16_29:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd39;
 
-BB16_31:
+BB12_31:
        ret;
 }
 
@@ -2692,9 +1911,9 @@ BB16_31:
        mov.f64         %fd67, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd68, %fd67;
        setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB17_4;
+       @%p1 bra        BB13_4;
 
-BB17_1:
+BB13_1:
        mov.f64         %fd1, %fd68;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r30, 8;
@@ -2703,23 +1922,23 @@ BB17_1:
        min.f64         %fd69, %fd1, %fd27;
        add.s32         %r3, %r30, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB17_3;
+       @%p2 bra        BB13_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd28, [%rd9];
        min.f64         %fd69, %fd69, %fd28;
 
-BB17_3:
+BB13_3:
        mov.f64         %fd68, %fd69;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r30, %r12, %r13, %r30;
        setp.lt.u32     %p3, %r30, %r5;
        mov.f64         %fd67, %fd68;
-       @%p3 bra        BB17_1;
+       @%p3 bra        BB13_1;
 
-BB17_4:
+BB13_4:
        mov.f64         %fd65, %fd67;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2727,113 +1946,113 @@ BB17_4:
        st.shared.f64   [%rd1], %fd65;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB17_8;
+       @%p4 bra        BB13_8;
 
        setp.gt.u32     %p5, %r6, 255;
        mov.f64         %fd66, %fd65;
-       @%p5 bra        BB17_7;
+       @%p5 bra        BB13_7;
 
        ld.shared.f64   %fd29, [%rd1+2048];
        min.f64         %fd66, %fd65, %fd29;
        st.shared.f64   [%rd1], %fd66;
 
-BB17_7:
+BB13_7:
        mov.f64         %fd65, %fd66;
        bar.sync        0;
 
-BB17_8:
+BB13_8:
        mov.f64         %fd63, %fd65;
        setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB17_12;
+       @%p6 bra        BB13_12;
 
        setp.gt.u32     %p7, %r6, 127;
        mov.f64         %fd64, %fd63;
-       @%p7 bra        BB17_11;
+       @%p7 bra        BB13_11;
 
        ld.shared.f64   %fd30, [%rd1+1024];
        min.f64         %fd64, %fd63, %fd30;
        st.shared.f64   [%rd1], %fd64;
 
-BB17_11:
+BB13_11:
        mov.f64         %fd63, %fd64;
        bar.sync        0;
 
-BB17_12:
+BB13_12:
        mov.f64         %fd61, %fd63;
        setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB17_16;
+       @%p8 bra        BB13_16;
 
        setp.gt.u32     %p9, %r6, 63;
        mov.f64         %fd62, %fd61;
-       @%p9 bra        BB17_15;
+       @%p9 bra        BB13_15;
 
        ld.shared.f64   %fd31, [%rd1+512];
        min.f64         %fd62, %fd61, %fd31;
        st.shared.f64   [%rd1], %fd62;
 
-BB17_15:
+BB13_15:
        mov.f64         %fd61, %fd62;
        bar.sync        0;
 
-BB17_16:
+BB13_16:
        mov.f64         %fd60, %fd61;
        setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB17_29;
+       @%p10 bra       BB13_29;
 
        setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB17_19;
+       @%p11 bra       BB13_19;
 
        ld.volatile.shared.f64  %fd32, [%rd1+256];
        min.f64         %fd60, %fd60, %fd32;
        st.volatile.shared.f64  [%rd1], %fd60;
 
-BB17_19:
+BB13_19:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB17_21;
+       @%p12 bra       BB13_21;
 
        ld.volatile.shared.f64  %fd33, [%rd1+128];
        min.f64         %fd59, %fd59, %fd33;
        st.volatile.shared.f64  [%rd1], %fd59;
 
-BB17_21:
+BB13_21:
        mov.f64         %fd58, %fd59;
        setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB17_23;
+       @%p13 bra       BB13_23;
 
        ld.volatile.shared.f64  %fd34, [%rd1+64];
        min.f64         %fd58, %fd58, %fd34;
        st.volatile.shared.f64  [%rd1], %fd58;
 
-BB17_23:
+BB13_23:
        mov.f64         %fd57, %fd58;
        setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB17_25;
+       @%p14 bra       BB13_25;
 
        ld.volatile.shared.f64  %fd35, [%rd1+32];
        min.f64         %fd57, %fd57, %fd35;
        st.volatile.shared.f64  [%rd1], %fd57;
 
-BB17_25:
+BB13_25:
        mov.f64         %fd56, %fd57;
        setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB17_27;
+       @%p15 bra       BB13_27;
 
        ld.volatile.shared.f64  %fd36, [%rd1+16];
        min.f64         %fd56, %fd56, %fd36;
        st.volatile.shared.f64  [%rd1], %fd56;
 
-BB17_27:
+BB13_27:
        setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB17_29;
+       @%p16 bra       BB13_29;
 
        ld.volatile.shared.f64  %fd37, [%rd1+8];
        min.f64         %fd38, %fd56, %fd37;
        st.volatile.shared.f64  [%rd1], %fd38;
 
-BB17_29:
+BB13_29:
        setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB17_31;
+       @%p17 bra       BB13_31;
 
        ld.shared.f64   %fd39, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2841,7 +2060,7 @@ BB17_29:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd39;
 
-BB17_31:
+BB13_31:
        ret;
 }
 
@@ -2850,10 +2069,10 @@ BB17_31:
        .param .b64 __internal_accurate_pow_param_1
 )
 {
-       .reg .pred      %p<9>;
+       .reg .pred      %p<10>;
        .reg .f32       %f<3>;
        .reg .b32       %r<52>;
-       .reg .f64       %fd<135>;
+       .reg .f64       %fd<134>;
 
 
        ld.param.f64    %fd12, [__internal_accurate_pow_param_0];
@@ -2868,7 +2087,7 @@ BB17_31:
        }
        shr.u32         %r50, %r49, 20;
        setp.ne.s32     %p1, %r50, 0;
-       @%p1 bra        BB18_2;
+       @%p1 bra        BB14_2;
 
        mul.f64         %fd14, %fd12, 0d4350000000000000;
        {
@@ -2882,28 +2101,28 @@ BB17_31:
        shr.u32         %r16, %r49, 20;
        add.s32         %r50, %r16, -54;
 
-BB18_2:
+BB14_2:
        add.s32         %r51, %r50, -1023;
        and.b32         %r17, %r49, -2146435073;
        or.b32          %r18, %r17, 1072693248;
-       mov.b64         %fd133, {%r48, %r18};
+       mov.b64         %fd132, {%r48, %r18};
        setp.lt.u32     %p2, %r18, 1073127583;
-       @%p2 bra        BB18_4;
+       @%p2 bra        BB14_4;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r19, %temp}, %fd133;
+       mov.b64         {%r19, %temp}, %fd132;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r20}, %fd133;
+       mov.b64         {%temp, %r20}, %fd132;
        }
        add.s32         %r21, %r20, -1048576;
-       mov.b64         %fd133, {%r19, %r21};
+       mov.b64         %fd132, {%r19, %r21};
        add.s32         %r51, %r50, -1022;
 
-BB18_4:
-       add.f64         %fd16, %fd133, 0d3FF0000000000000;
+BB14_4:
+       add.f64         %fd16, %fd132, 0d3FF0000000000000;
        // inline asm
        rcp.approx.ftz.f64 %fd15,%fd16;
        // inline asm
@@ -2912,7 +2131,7 @@ BB18_4:
        fma.rn.f64      %fd19, %fd17, %fd15, %fd18;
        fma.rn.f64      %fd20, %fd19, %fd19, %fd19;
        fma.rn.f64      %fd21, %fd20, %fd15, %fd15;
-       add.f64         %fd22, %fd133, 0dBFF0000000000000;
+       add.f64         %fd22, %fd132, 0dBFF0000000000000;
        mul.f64         %fd23, %fd22, %fd21;
        fma.rn.f64      %fd24, %fd22, %fd21, %fd23;
        mul.f64         %fd25, %fd24, %fd24;
@@ -3015,52 +2234,51 @@ BB18_4:
        add.f64         %fd4, %fd94, %fd97;
        sub.f64         %fd98, %fd94, %fd4;
        add.f64         %fd5, %fd97, %fd98;
-       mov.f64         %fd99, 0d3FF71547652B82FE;
-       mul.rn.f64      %fd100, %fd4, %fd99;
-       mov.f64         %fd101, 0d4338000000000000;
-       add.rn.f64      %fd102, %fd100, %fd101;
+       mov.f64         %fd99, 0d4338000000000000;
+       mov.f64         %fd100, 0d3FF71547652B82FE;
+       fma.rn.f64      %fd101, %fd4, %fd100, %fd99;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r13, %temp}, %fd102;
+       mov.b64         {%r13, %temp}, %fd101;
        }
-       mov.f64         %fd103, 0dC338000000000000;
-       add.rn.f64      %fd104, %fd102, %fd103;
-       mov.f64         %fd105, 0dBFE62E42FEFA39EF;
-       fma.rn.f64      %fd106, %fd104, %fd105, %fd4;
-       mov.f64         %fd107, 0dBC7ABC9E3B39803F;
-       fma.rn.f64      %fd108, %fd104, %fd107, %fd106;
-       mov.f64         %fd109, 0d3E928AF3FCA213EA;
-       mov.f64         %fd110, 0d3E5ADE1569CE2BDF;
-       fma.rn.f64      %fd111, %fd110, %fd108, %fd109;
-       mov.f64         %fd112, 0d3EC71DEE62401315;
-       fma.rn.f64      %fd113, %fd111, %fd108, %fd112;
-       mov.f64         %fd114, 0d3EFA01997C89EB71;
-       fma.rn.f64      %fd115, %fd113, %fd108, %fd114;
-       mov.f64         %fd116, 0d3F2A01A014761F65;
-       fma.rn.f64      %fd117, %fd115, %fd108, %fd116;
-       mov.f64         %fd118, 0d3F56C16C1852B7AF;
-       fma.rn.f64      %fd119, %fd117, %fd108, %fd118;
-       mov.f64         %fd120, 0d3F81111111122322;
-       fma.rn.f64      %fd121, %fd119, %fd108, %fd120;
-       mov.f64         %fd122, 0d3FA55555555502A1;
-       fma.rn.f64      %fd123, %fd121, %fd108, %fd122;
-       mov.f64         %fd124, 0d3FC5555555555511;
-       fma.rn.f64      %fd125, %fd123, %fd108, %fd124;
-       mov.f64         %fd126, 0d3FE000000000000B;
-       fma.rn.f64      %fd127, %fd125, %fd108, %fd126;
-       fma.rn.f64      %fd128, %fd127, %fd108, %fd18;
-       fma.rn.f64      %fd129, %fd128, %fd108, %fd18;
+       mov.f64         %fd102, 0dC338000000000000;
+       add.rn.f64      %fd103, %fd101, %fd102;
+       mov.f64         %fd104, 0dBFE62E42FEFA39EF;
+       fma.rn.f64      %fd105, %fd103, %fd104, %fd4;
+       mov.f64         %fd106, 0dBC7ABC9E3B39803F;
+       fma.rn.f64      %fd107, %fd103, %fd106, %fd105;
+       mov.f64         %fd108, 0d3E928AF3FCA213EA;
+       mov.f64         %fd109, 0d3E5ADE1569CE2BDF;
+       fma.rn.f64      %fd110, %fd109, %fd107, %fd108;
+       mov.f64         %fd111, 0d3EC71DEE62401315;
+       fma.rn.f64      %fd112, %fd110, %fd107, %fd111;
+       mov.f64         %fd113, 0d3EFA01997C89EB71;
+       fma.rn.f64      %fd114, %fd112, %fd107, %fd113;
+       mov.f64         %fd115, 0d3F2A01A014761F65;
+       fma.rn.f64      %fd116, %fd114, %fd107, %fd115;
+       mov.f64         %fd117, 0d3F56C16C1852B7AF;
+       fma.rn.f64      %fd118, %fd116, %fd107, %fd117;
+       mov.f64         %fd119, 0d3F81111111122322;
+       fma.rn.f64      %fd120, %fd118, %fd107, %fd119;
+       mov.f64         %fd121, 0d3FA55555555502A1;
+       fma.rn.f64      %fd122, %fd120, %fd107, %fd121;
+       mov.f64         %fd123, 0d3FC5555555555511;
+       fma.rn.f64      %fd124, %fd122, %fd107, %fd123;
+       mov.f64         %fd125, 0d3FE000000000000B;
+       fma.rn.f64      %fd126, %fd124, %fd107, %fd125;
+       fma.rn.f64      %fd127, %fd126, %fd107, %fd18;
+       fma.rn.f64      %fd128, %fd127, %fd107, %fd18;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r14, %temp}, %fd129;
+       mov.b64         {%r14, %temp}, %fd128;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r15}, %fd129;
+       mov.b64         {%temp, %r15}, %fd128;
        }
        shl.b32         %r33, %r13, 20;
        add.s32         %r34, %r15, %r33;
-       mov.b64         %fd134, {%r14, %r34};
+       mov.b64         %fd133, {%r14, %r34};
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r35}, %fd4;
@@ -3068,48 +2286,48 @@ BB18_4:
        mov.b32          %f2, %r35;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p4, %f1, 0f4086232B;
-       @%p4 bra        BB18_7;
+       @%p4 bra        BB14_7;
 
        setp.lt.f64     %p5, %fd4, 0d0000000000000000;
-       add.f64         %fd130, %fd4, 0d7FF0000000000000;
-       selp.f64        %fd134, 0d0000000000000000, %fd130, %p5;
+       add.f64         %fd129, %fd4, 0d7FF0000000000000;
+       selp.f64        %fd133, 0d0000000000000000, %fd129, %p5;
        setp.geu.f32    %p6, %f1, 0f40874800;
-       @%p6 bra        BB18_7;
+       @%p6 bra        BB14_7;
 
        shr.u32         %r36, %r13, 31;
        add.s32         %r37, %r13, %r36;
        shr.s32         %r38, %r37, 1;
        shl.b32         %r39, %r38, 20;
        add.s32         %r40, %r39, %r15;
-       mov.b64         %fd131, {%r14, %r40};
+       mov.b64         %fd130, {%r14, %r40};
        sub.s32         %r41, %r13, %r38;
        shl.b32         %r42, %r41, 20;
        add.s32         %r43, %r42, 1072693248;
        mov.u32         %r44, 0;
-       mov.b64         %fd132, {%r44, %r43};
-       mul.f64         %fd134, %fd131, %fd132;
+       mov.b64         %fd131, {%r44, %r43};
+       mul.f64         %fd133, %fd130, %fd131;
 
-BB18_7:
+BB14_7:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r45}, %fd134;
+       mov.b64         {%temp, %r45}, %fd133;
        }
        and.b32         %r46, %r45, 2147483647;
        setp.ne.s32     %p7, %r46, 2146435072;
-       @%p7 bra        BB18_9;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r47, %temp}, %fd134;
+       mov.b64         {%r47, %temp}, %fd133;
        }
-       setp.eq.s32     %p8, %r47, 0;
-       @%p8 bra        BB18_10;
+       setp.ne.s32     %p8, %r47, 0;
+       or.pred         %p9, %p8, %p7;
+       @!%p9 bra       BB14_9;
+       bra.uni         BB14_8;
 
-BB18_9:
-       fma.rn.f64      %fd134, %fd134, %fd5, %fd134;
+BB14_8:
+       fma.rn.f64      %fd133, %fd133, %fd5, %fd133;
 
-BB18_10:
-       st.param.f64    [func_retval0+0], %fd134;
+BB14_9:
+       st.param.f64    [func_retval0+0], %fd133;
        ret;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java 
b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index 7d97e44..7321d68 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -36,6 +36,7 @@ import org.apache.sysml.lops.CentralMoment;
 import org.apache.sysml.lops.CoVariance;
 import org.apache.sysml.lops.CombineBinary;
 import org.apache.sysml.lops.CombineUnary;
+import org.apache.sysml.lops.ConvolutionTransform;
 import org.apache.sysml.lops.Data;
 import org.apache.sysml.lops.DataPartition;
 import org.apache.sysml.lops.Group;
@@ -593,7 +594,22 @@ public class BinaryOp extends Hop
                                        et = ExecType.GPU;
                                }
                                
-                               Binary binary = new 
Binary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), 
HopsOpOp2LopsB.get(op),
+                               Lop binary = null;
+                               
+                               boolean isLeftXGt = (getInput().get(0) 
instanceof BinaryOp) && ((BinaryOp) getInput().get(0)).getOp() == OpOp2.GREATER;
+                               Hop potentialZero = isLeftXGt ? ((BinaryOp) 
getInput().get(0)).getInput().get(1) : null;
+                               
+                               boolean isLeftXGt0 = isLeftXGt && potentialZero 
!= null
+                                               && potentialZero instanceof 
LiteralOp && ((LiteralOp) potentialZero).getDoubleValue() == 0;
+                                               
+                               if(op == OpOp2.MULT && isLeftXGt0 && 
+                                       !getInput().get(0).isVector() && 
!getInput().get(1).isVector()) {
+                                       binary = new 
ConvolutionTransform(getInput().get(0).getInput().get(0).constructLops(), 
+                                                                       
getInput().get(1).constructLops(),
+                                                                       
ConvolutionTransform.OperationTypes.RELU_BACKWARD, getDataType(), 
getValueType(), et, -1);
+                               }
+                               else
+                                       binary = new 
Binary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), 
HopsOpOp2LopsB.get(op),
                                                getDataType(), getValueType(), 
et);
                                
                                setOutputDimensions(binary);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index d69bd93..6f2a20e 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -30,7 +30,7 @@ public class ConvolutionTransform extends Lop
 
        
        public enum OperationTypes {
-               MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING,
+               MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, 
RELU_BACKWARD,
                DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, 
DIRECT_CONV2D_BACKWARD_DATA,
                BIAS_ADD
        };
@@ -54,6 +54,16 @@ public class ConvolutionTransform extends Lop
                init(input, op, dt, vt, et);
                numThreads = k;
        }
+       
+       public ConvolutionTransform(Lop input1, Lop input2, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k) 
+       {
+               super(Lop.Type.Transform, dt, vt);              
+               init(input1, op, dt, vt, et);
+               numThreads = k;
+               this.addInput(input2);
+               input2.addOutput(this);
+               setLevel();
+       }
 
        private void init (Lop input, ConvolutionTransform.OperationTypes op, 
DataType dt, ValueType vt, ExecType et) 
        {
@@ -102,6 +112,9 @@ public class ConvolutionTransform extends Lop
                case RELU_MAX_POOLING:
                        return "relu_maxpooling";
                        
+               case RELU_BACKWARD:
+                       return "relu_backward";
+                       
                case MAX_POOLING_BACKWARD:
                        return "maxpooling_backward";
                
@@ -124,7 +137,7 @@ public class ConvolutionTransform extends Lop
        }
        
        public String getInstructions(String input, String bias, String output) 
throws LopsException {
-               if(operation == OperationTypes.BIAS_ADD) {
+               if(operation == OperationTypes.BIAS_ADD || operation == 
OperationTypes.RELU_BACKWARD) {
                        StringBuilder sb = new StringBuilder();
                        sb.append( getExecType() );
                        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 3355a6e..11d4661 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -220,6 +220,7 @@ public class CPInstructionParser extends InstructionParser
                String2CPInstructionType.put( "rsort"      , 
CPINSTRUCTION_TYPE.Reorg);
 
                // Opcodes related to convolutions
+               String2CPInstructionType.put( "relu_backward"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "relu_maxpooling"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling_backward"      , 
CPINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 9dce34a..5e3ab62 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -39,6 +39,7 @@ public class GPUInstructionParser  extends InstructionParser
                String2GPUInstructionType = new HashMap<String, 
GPUINSTRUCTION_TYPE>();
 
                // Neural Network Operators
+               String2GPUInstructionType.put( "relu_backward",          
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d",                 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_filter", 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_data",   
GPUINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/afe61b5a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 56f1460..9c115c6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -46,8 +46,8 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
        public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr, int numThreads) throws DMLRuntimeException {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out,
                                opcode, istr);
-               if(!opcode.equals("bias_add")) {
-                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add, but found " + opcode);
+               if( !(opcode.equals("bias_add") || 
opcode.equals("relu_backward")) ) {
+                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or relu_backward, but found " + opcode);
                }
                _in2 = in2;
                _cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -153,7 +153,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        return new ConvolutionCPInstruction(in, in2, out, 
opcode, str, stride,
                                        padding, input_shape, filter_shape, k);
                } 
-               else if (opcode.equalsIgnoreCase("bias_add")) {
+               else if (opcode.equalsIgnoreCase("bias_add") || 
opcode.equals("relu_backward")) {
                        InstructionUtils.checkNumFields(parts, 4);
                        in.split(parts[1]);
                        CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, 
DataType.UNKNOWN);
@@ -174,6 +174,26 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                .getLongValue();
        }
        
+       public void processReluBackwardInstruction(ExecutionContext ec) throws 
DMLRuntimeException {
+               // (X > 0) * dout
+               MatrixBlock outputBlock = null;
+               MatrixBlock input = ec.getMatrixInput(input1.getName());
+               MatrixBlock dout = ec.getMatrixInput(_in2.getName());
+               
+               if(input.isEmptyBlock() || dout.isEmptyBlock()) {
+                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), true, 0);
+               }
+               else {
+                       outputBlock = getDenseOutputBlock(ec, 
input.getNumRows(), input.getNumColumns());
+                       LibMatrixDNN.relu_backward(input, dout, outputBlock, 
_numThreads);
+               }
+               
+               // release inputs/outputs
+               ec.releaseMatrixInput(input1.getName());
+               ec.releaseMatrixInput(_in2.getName());
+               ec.setMatrixOutput(getOutputVariableName(), outputBlock);
+       }
+       
        public void processBiasInstruction(ExecutionContext ec) throws 
DMLRuntimeException {
                MatrixBlock outputBlock = null;
                MatrixBlock input = ec.getMatrixInput(input1.getName());
@@ -210,6 +230,10 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        processBiasInstruction(ec);
                        return;
                }
+               else if (instOpcode.equalsIgnoreCase("relu_backward")) {
+                       processReluBackwardInstruction(ec);
+                       return;
+               }
                
                // acquire inputs
                MatrixBlock outputBlock = null;


Reply via email to