http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca91e68/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
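The hunks below show src/main/cpp/kernels/SystemML.ptx regenerated with a different CUDA 8.0 toolchain (Compiler Build ID CL-21554848 -> CL-21124049, release V8.0.61 -> V8.0.44). The kernel logic is unchanged at the source level; only the compiler's lowering differs. In the double-precision cellwise kernel, the broadcast-index selection that was previously a chain of mov/bra basic blocks (old BB22_1 through BB22_9) becomes a handful of predicated selp.b32 selects, the <= and >= operators are computed directly with setp.le.f64/setp.ge.f64 instead of the negated setp.gtu/setp.ltu forms, and the register and label counts shrink accordingly (%r<66> -> %r<56>, exit label BB22_77 -> BB22_65). The same pattern repeats for the float kernel matrix_matrix_cellwise_op_f and for matrix_scalar_op_d further down.

For orientation, here is a minimal CUDA-level sketch of the control flow that matrix_matrix_cellwise_op_d encodes. This is not the SystemML.cu source; the kernel name, parameter names, and the opcode table are assumptions read off the PTX dispatch (opcodes 0-18; only a few are shown):

    // Hypothetical sketch, NOT the SystemML.cu source: control flow
    // reconstructed from the PTX dispatch of matrix_matrix_cellwise_op_d.
    extern "C" __global__ void cellwise_op_d_sketch(const double *A, const double *B,
                                                    double *C, int maxRows, int cols,
                                                    int vecA, int vecB, int op)
    {
        int ix  = blockIdx.x * blockDim.x + threadIdx.x;
        int row = ix / cols;
        int col = ix % cols;
        if (row >= maxRows || cols < 0)      // matches the %p2/%p3 guard in the PTX
            return;
        int cell = row * cols + col;
        // vec == 1: row-vector broadcast, vec == 2: column-vector broadcast
        int ia = (vecA == 1) ? row : (vecA == 2) ? col : cell;
        int ib = (vecB == 1) ? row : (vecB == 2) ? col : cell;
        double a = A[ia], b = B[ib], v;
        switch (op) {
            case 0:  v = a + b;                break;
            case 1:  v = a - b;                break;
            case 2:  v = a * b;                break;
            case 3:  v = a / b;                break;
            case 4:  v = pow(a, b);            break; // the Callseq 0 call site
            case 5:  v = (a <  b) ? 1.0 : 0.0; break;
            case 6:  v = (a <= b) ? 1.0 : 0.0; break; // new PTX: setp.le.f64 directly
            case 7:  v = (a >  b) ? 1.0 : 0.0; break;
            case 8:  v = (a >= b) ? 1.0 : 0.0; break; // new PTX: setp.ge.f64 directly
            case 9:  v = (a == b) ? 1.0 : 0.0; break;
            default: v = 0.0;                  break; // min, max, and, or, mod, ... elided
        }
        C[cell] = v;
        __syncthreads();                     // mirrors the bar.sync 0 before the exit label
    }

The selp-based form matters because the two vectorization checks are data-dependent per thread: lowering them to predicated selects rather than branches avoids warp divergence on what is otherwise a two-instruction index computation.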
diff --git a/src/main/cpp/kernels/SystemML.ptx 
b/src/main/cpp/kernels/SystemML.ptx
index d382fc5..c990f27 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
@@ -1277,129 +1277,106 @@ BB21_2:
        .param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-       .reg .pred      %p<73>;
-       .reg .b32       %r<66>;
-       .reg .f64       %fd<56>;
+       .reg .pred      %p<77>;
+       .reg .b32       %r<56>;
+       .reg .f64       %fd<55>;
        .reg .b64       %rd<19>;
 
 
        ld.param.u64    %rd2, [matrix_matrix_cellwise_op_d_param_0];
        ld.param.u64    %rd3, [matrix_matrix_cellwise_op_d_param_1];
        ld.param.u64    %rd4, [matrix_matrix_cellwise_op_d_param_2];
-       ld.param.u32    %r14, [matrix_matrix_cellwise_op_d_param_3];
-       ld.param.u32    %r10, [matrix_matrix_cellwise_op_d_param_4];
-       ld.param.u32    %r11, [matrix_matrix_cellwise_op_d_param_5];
-       ld.param.u32    %r12, [matrix_matrix_cellwise_op_d_param_6];
-       ld.param.u32    %r13, [matrix_matrix_cellwise_op_d_param_7];
-       mov.u32         %r15, %ntid.x;
-       mov.u32         %r16, %ctaid.x;
-       mov.u32         %r17, %tid.x;
-       mad.lo.s32      %r18, %r15, %r16, %r17;
-       div.s32         %r1, %r18, %r10;
-       rem.s32         %r2, %r18, %r10;
-       setp.lt.s32     %p2, %r1, %r14;
-       setp.gt.s32     %p3, %r10, -1;
+       ld.param.u32    %r10, [matrix_matrix_cellwise_op_d_param_3];
+       ld.param.u32    %r6, [matrix_matrix_cellwise_op_d_param_4];
+       ld.param.u32    %r7, [matrix_matrix_cellwise_op_d_param_5];
+       ld.param.u32    %r8, [matrix_matrix_cellwise_op_d_param_6];
+       ld.param.u32    %r9, [matrix_matrix_cellwise_op_d_param_7];
+       mov.u32         %r11, %ctaid.x;
+       mov.u32         %r12, %ntid.x;
+       mov.u32         %r13, %tid.x;
+       mad.lo.s32      %r1, %r12, %r11, %r13;
+       div.s32         %r2, %r1, %r6;
+       setp.lt.s32     %p2, %r2, %r10;
+       setp.gt.s32     %p3, %r6, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB22_77;
+       @!%p4 bra       BB22_65;
        bra.uni         BB22_1;
 
 BB22_1:
-       mad.lo.s32      %r3, %r1, %r10, %r2;
-       setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r64, %r1;
-       @%p5 bra        BB22_5;
-
-       setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r65, %r3;
-       @%p6 bra        BB22_4;
-
-       mov.u32         %r65, %r2;
-
-BB22_4:
-       mov.u32         %r59, %r65;
-       mov.u32         %r4, %r59;
-       mov.u32         %r64, %r4;
-
-BB22_5:
-       mov.u32         %r5, %r64;
-       setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r62, %r1;
-       @%p7 bra        BB22_9;
-
-       setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r63, %r3;
-       @%p8 bra        BB22_8;
-
-       mov.u32         %r63, %r2;
-
-BB22_8:
-       mov.u32         %r62, %r63;
-
-BB22_9:
-       cvta.to.global.u64      %rd5, %rd3;
-       cvta.to.global.u64      %rd6, %rd2;
-       mul.wide.s32    %rd7, %r5, 8;
-       add.s64         %rd8, %rd6, %rd7;
-       ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r62, 8;
-       add.s64         %rd10, %rd5, %rd9;
+       rem.s32         %r14, %r1, %r6;
+       cvta.to.global.u64      %rd5, %rd2;
+       mad.lo.s32      %r3, %r2, %r6, %r14;
+       setp.eq.s32     %p5, %r7, 2;
+       selp.b32        %r15, %r14, %r3, %p5;
+       setp.eq.s32     %p6, %r7, 1;
+       selp.b32        %r16, %r2, %r15, %p6;
+       setp.eq.s32     %p7, %r8, 2;
+       selp.b32        %r17, %r14, %r3, %p7;
+       setp.eq.s32     %p8, %r8, 1;
+       selp.b32        %r18, %r2, %r17, %p8;
+       mul.wide.s32    %rd6, %r16, 8;
+       add.s64         %rd7, %rd5, %rd6;
+       ld.global.f64   %fd1, [%rd7];
+       cvta.to.global.u64      %rd8, %rd3;
+       mul.wide.s32    %rd9, %r18, 8;
+       add.s64         %rd10, %rd8, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd55, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p9, %r13, 8;
-       @%p9 bra        BB22_26;
+       mov.f64         %fd54, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p9, %r9, 8;
+       @%p9 bra        BB22_18;
 
-       setp.gt.s32     %p23, %r13, 3;
-       @%p23 bra       BB22_18;
+       setp.gt.s32     %p23, %r9, 3;
+       @%p23 bra       BB22_10;
 
-       setp.gt.s32     %p30, %r13, 1;
-       @%p30 bra       BB22_15;
+       setp.gt.s32     %p30, %r9, 1;
+       @%p30 bra       BB22_7;
 
-       setp.eq.s32     %p33, %r13, 0;
-       @%p33 bra       BB22_75;
-       bra.uni         BB22_13;
+       setp.eq.s32     %p33, %r9, 0;
+       @%p33 bra       BB22_63;
+       bra.uni         BB22_5;
 
-BB22_75:
-       add.f64         %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_63:
+       add.f64         %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_26:
-       setp.gt.s32     %p10, %r13, 13;
-       @%p10 bra       BB22_35;
+BB22_18:
+       setp.gt.s32     %p10, %r9, 13;
+       @%p10 bra       BB22_27;
 
-       setp.gt.s32     %p17, %r13, 10;
-       @%p17 bra       BB22_31;
+       setp.gt.s32     %p17, %r9, 10;
+       @%p17 bra       BB22_23;
 
-       setp.eq.s32     %p21, %r13, 9;
-       @%p21 bra       BB22_55;
-       bra.uni         BB22_29;
+       setp.eq.s32     %p21, %r9, 9;
+       @%p21 bra       BB22_45;
+       bra.uni         BB22_21;
 
-BB22_55:
-       setp.eq.f64     %p48, %fd1, %fd2;
-       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
-       bra.uni         BB22_76;
+BB22_45:
+       setp.eq.f64     %p50, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
+       bra.uni         BB22_64;
 
-BB22_18:
-       setp.gt.s32     %p24, %r13, 5;
-       @%p24 bra       BB22_22;
+BB22_10:
+       setp.gt.s32     %p24, %r9, 5;
+       @%p24 bra       BB22_14;
 
-       setp.eq.s32     %p28, %r13, 4;
-       @%p28 bra       BB22_58;
-       bra.uni         BB22_20;
+       setp.eq.s32     %p28, %r9, 4;
+       @%p28 bra       BB22_48;
+       bra.uni         BB22_12;
 
-BB22_58:
+BB22_48:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r8}, %fd1;
+       mov.b64         {%temp, %r4}, %fd1;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r9}, %fd2;
+       mov.b64         {%temp, %r5}, %fd2;
        }
-       bfe.u32         %r31, %r9, 20, 11;
+       bfe.u32         %r31, %r5, 20, 11;
        add.s32         %r32, %r31, -1012;
        mov.b64          %rd15, %fd2;
        shl.b64         %rd1, %rd15, %r32;
-       setp.eq.s64     %p53, %rd1, -9223372036854775808;
+       setp.eq.s64     %p55, %rd1, -9223372036854775808;
        abs.f64         %fd19, %fd1;
        // Callseq Start 0
        {
@@ -1416,342 +1393,340 @@ BB22_58:
        param0, 
        param1
        );
-       ld.param.f64    %fd54, [retval0+0];
+       ld.param.f64    %fd53, [retval0+0];
        
        //{
        }// Callseq End 0
-       setp.lt.s32     %p54, %r8, 0;
-       and.pred        %p1, %p54, %p53;
-       @!%p1 bra       BB22_60;
-       bra.uni         BB22_59;
+       setp.lt.s32     %p56, %r4, 0;
+       and.pred        %p1, %p56, %p55;
+       @!%p1 bra       BB22_50;
+       bra.uni         BB22_49;
 
-BB22_59:
+BB22_49:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r33}, %fd54;
+       mov.b64         {%temp, %r33}, %fd53;
        }
        xor.b32         %r34, %r33, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd54;
+       mov.b64         {%r35, %temp}, %fd53;
        }
-       mov.b64         %fd54, {%r35, %r34};
+       mov.b64         %fd53, {%r35, %r34};
 
-BB22_60:
-       mov.f64         %fd53, %fd54;
-       setp.eq.f64     %p55, %fd1, 0d0000000000000000;
-       @%p55 bra       BB22_63;
-       bra.uni         BB22_61;
+BB22_50:
+       mov.f64         %fd52, %fd53;
+       setp.eq.f64     %p57, %fd1, 0d0000000000000000;
+       @%p57 bra       BB22_53;
+       bra.uni         BB22_51;
 
-BB22_63:
-       selp.b32        %r36, %r8, 0, %p53;
+BB22_53:
+       selp.b32        %r36, %r4, 0, %p55;
        or.b32          %r37, %r36, 2146435072;
-       setp.lt.s32     %p59, %r9, 0;
-       selp.b32        %r38, %r37, %r36, %p59;
+       setp.lt.s32     %p61, %r5, 0;
+       selp.b32        %r38, %r37, %r36, %p61;
        mov.u32         %r39, 0;
-       mov.b64         %fd53, {%r39, %r38};
-       bra.uni         BB22_64;
+       mov.b64         %fd52, {%r39, %r38};
+       bra.uni         BB22_54;
 
-BB22_35:
-       setp.gt.s32     %p11, %r13, 15;
-       @%p11 bra       BB22_39;
+BB22_27:
+       setp.gt.s32     %p11, %r9, 15;
+       @%p11 bra       BB22_31;
 
-       setp.eq.s32     %p15, %r13, 14;
-       @%p15 bra       BB22_52;
-       bra.uni         BB22_37;
+       setp.eq.s32     %p15, %r9, 14;
+       @%p15 bra       BB22_42;
+       bra.uni         BB22_29;
 
-BB22_52:
+BB22_42:
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.rni.s64.f64 %rd12, %fd2;
        cvt.u32.u64     %r25, %rd11;
        cvt.u32.u64     %r26, %rd12;
        or.b32          %r27, %r26, %r25;
-       setp.eq.s32     %p45, %r27, 0;
-       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
-       bra.uni         BB22_76;
+       setp.eq.s32     %p47, %r27, 0;
+       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
+       bra.uni         BB22_64;
 
-BB22_15:
-       setp.eq.s32     %p31, %r13, 2;
-       @%p31 bra       BB22_74;
-       bra.uni         BB22_16;
+BB22_7:
+       setp.eq.s32     %p31, %r9, 2;
+       @%p31 bra       BB22_62;
+       bra.uni         BB22_8;
 
-BB22_74:
-       mul.f64         %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_62:
+       mul.f64         %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_31:
-       setp.eq.s32     %p18, %r13, 11;
-       @%p18 bra       BB22_54;
+BB22_23:
+       setp.eq.s32     %p18, %r9, 11;
+       @%p18 bra       BB22_44;
 
-       setp.eq.s32     %p19, %r13, 12;
-       @%p19 bra       BB22_53;
-       bra.uni         BB22_33;
+       setp.eq.s32     %p19, %r9, 12;
+       @%p19 bra       BB22_43;
+       bra.uni         BB22_25;
 
-BB22_53:
-       max.f64         %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_43:
+       max.f64         %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_22:
-       setp.eq.s32     %p25, %r13, 6;
-       @%p25 bra       BB22_57;
+BB22_14:
+       setp.eq.s32     %p25, %r9, 6;
+       @%p25 bra       BB22_47;
 
-       setp.eq.s32     %p26, %r13, 7;
-       @%p26 bra       BB22_56;
-       bra.uni         BB22_24;
+       setp.eq.s32     %p26, %r9, 7;
+       @%p26 bra       BB22_46;
+       bra.uni         BB22_16;
 
-BB22_56:
-       setp.gt.f64     %p50, %fd1, %fd2;
-       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB22_76;
+BB22_46:
+       setp.gt.f64     %p52, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
+       bra.uni         BB22_64;
 
-BB22_39:
-       setp.eq.s32     %p12, %r13, 16;
-       @%p12 bra       BB22_51;
+BB22_31:
+       setp.eq.s32     %p12, %r9, 16;
+       @%p12 bra       BB22_41;
 
-       setp.eq.s32     %p13, %r13, 17;
-       @%p13 bra       BB22_46;
-       bra.uni         BB22_41;
+       setp.eq.s32     %p13, %r9, 17;
+       @%p13 bra       BB22_37;
+       bra.uni         BB22_33;
 
-BB22_46:
-       setp.eq.f64     %p38, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p39, %fd2, 0d8000000000000000;
-       or.pred         %p40, %p38, %p39;
-       mov.f64         %fd55, 0d7FF8000000000000;
-       @%p40 bra       BB22_76;
+BB22_37:
+       setp.eq.f64     %p39, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p40, %fd2, 0d8000000000000000;
+       or.pred         %p41, %p39, %p40;
+       mov.f64         %fd54, 0d7FF8000000000000;
+       @%p41 bra       BB22_64;
 
-       div.rn.f64      %fd55, %fd1, %fd2;
-       abs.f64         %fd39, %fd55;
-       setp.gtu.f64    %p41, %fd39, 0d7FF0000000000000;
-       @%p41 bra       BB22_76;
+       div.rn.f64      %fd54, %fd1, %fd2;
+       abs.f64         %fd39, %fd54;
+       setp.gtu.f64    %p42, %fd39, 0d7FF0000000000000;
+       @%p42 bra       BB22_64;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r22}, %fd55;
+       mov.b64         {%r22, %temp}, %fd54;
        }
-       and.b32         %r23, %r22, 2147483647;
-       setp.ne.s32     %p42, %r23, 2146435072;
-       @%p42 bra       BB22_50;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r24, %temp}, %fd55;
+       mov.b64         {%temp, %r23}, %fd54;
        }
-       setp.eq.s32     %p43, %r24, 0;
-       @%p43 bra       BB22_76;
-
-BB22_50:
-       cvt.rmi.f64.f64 %fd40, %fd55;
+       and.b32         %r24, %r23, 2147483647;
+       setp.ne.s32     %p43, %r24, 2146435072;
+       setp.ne.s32     %p44, %r22, 0;
+       or.pred         %p45, %p43, %p44;
+       @!%p45 bra      BB22_64;
+       bra.uni         BB22_40;
+
+BB22_40:
+       cvt.rmi.f64.f64 %fd40, %fd54;
        mul.f64         %fd41, %fd2, %fd40;
-       sub.f64         %fd55, %fd1, %fd41;
-       bra.uni         BB22_76;
+       sub.f64         %fd54, %fd1, %fd41;
+       bra.uni         BB22_64;
 
-BB22_13:
-       setp.eq.s32     %p34, %r13, 1;
-       @%p34 bra       BB22_14;
-       bra.uni         BB22_76;
+BB22_5:
+       setp.eq.s32     %p34, %r9, 1;
+       @%p34 bra       BB22_6;
+       bra.uni         BB22_64;
 
-BB22_14:
-       sub.f64         %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_6:
+       sub.f64         %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_29:
-       setp.eq.s32     %p22, %r13, 10;
-       @%p22 bra       BB22_30;
-       bra.uni         BB22_76;
+BB22_21:
+       setp.eq.s32     %p22, %r9, 10;
+       @%p22 bra       BB22_22;
+       bra.uni         BB22_64;
 
-BB22_30:
-       setp.neu.f64    %p47, %fd1, %fd2;
-       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
-       bra.uni         BB22_76;
+BB22_22:
+       setp.neu.f64    %p49, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
+       bra.uni         BB22_64;
 
-BB22_20:
-       setp.eq.s32     %p29, %r13, 5;
-       @%p29 bra       BB22_21;
-       bra.uni         BB22_76;
+BB22_12:
+       setp.eq.s32     %p29, %r9, 5;
+       @%p29 bra       BB22_13;
+       bra.uni         BB22_64;
 
-BB22_21:
-       setp.lt.f64     %p52, %fd1, %fd2;
-       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
-       bra.uni         BB22_76;
+BB22_13:
+       setp.lt.f64     %p54, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
+       bra.uni         BB22_64;
 
-BB22_37:
-       setp.eq.s32     %p16, %r13, 15;
-       @%p16 bra       BB22_38;
-       bra.uni         BB22_76;
+BB22_29:
+       setp.eq.s32     %p16, %r9, 15;
+       @%p16 bra       BB22_30;
+       bra.uni         BB22_64;
 
-BB22_38:
+BB22_30:
        mul.f64         %fd43, %fd1, %fd2;
        mov.f64         %fd44, 0d3FF0000000000000;
-       sub.f64         %fd55, %fd44, %fd43;
-       bra.uni         BB22_76;
+       sub.f64         %fd54, %fd44, %fd43;
+       bra.uni         BB22_64;
 
-BB22_16:
-       setp.eq.s32     %p32, %r13, 3;
-       @%p32 bra       BB22_17;
-       bra.uni         BB22_76;
+BB22_8:
+       setp.eq.s32     %p32, %r9, 3;
+       @%p32 bra       BB22_9;
+       bra.uni         BB22_64;
 
-BB22_17:
-       div.rn.f64      %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_9:
+       div.rn.f64      %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_54:
-       min.f64         %fd55, %fd1, %fd2;
-       bra.uni         BB22_76;
+BB22_44:
+       min.f64         %fd54, %fd1, %fd2;
+       bra.uni         BB22_64;
 
-BB22_33:
-       setp.eq.s32     %p20, %r13, 13;
-       @%p20 bra       BB22_34;
-       bra.uni         BB22_76;
+BB22_25:
+       setp.eq.s32     %p20, %r9, 13;
+       @%p20 bra       BB22_26;
+       bra.uni         BB22_64;
 
-BB22_34:
+BB22_26:
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.rni.s64.f64 %rd14, %fd2;
        cvt.u32.u64     %r28, %rd13;
        cvt.u32.u64     %r29, %rd14;
        and.b32         %r30, %r29, %r28;
-       setp.eq.s32     %p46, %r30, 0;
-       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
-       bra.uni         BB22_76;
+       setp.eq.s32     %p48, %r30, 0;
+       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
+       bra.uni         BB22_64;
 
-BB22_57:
-       setp.gtu.f64    %p51, %fd1, %fd2;
-       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p51;
-       bra.uni         BB22_76;
+BB22_47:
+       setp.le.f64     %p53, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
+       bra.uni         BB22_64;
 
-BB22_24:
-       setp.eq.s32     %p27, %r13, 8;
-       @%p27 bra       BB22_25;
-       bra.uni         BB22_76;
+BB22_16:
+       setp.eq.s32     %p27, %r9, 8;
+       @%p27 bra       BB22_17;
+       bra.uni         BB22_64;
 
-BB22_25:
-       setp.ltu.f64    %p49, %fd1, %fd2;
-       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p49;
-       bra.uni         BB22_76;
+BB22_17:
+       setp.ge.f64     %p51, %fd1, %fd2;
+       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
+       bra.uni         BB22_64;
 
-BB22_51:
-       setp.neu.f64    %p44, %fd1, 0d0000000000000000;
+BB22_41:
+       setp.neu.f64    %p46, %fd1, 0d0000000000000000;
        sub.f64         %fd42, %fd1, %fd2;
-       selp.f64        %fd55, %fd42, 0d0000000000000000, %p44;
-       bra.uni         BB22_76;
+       selp.f64        %fd54, %fd42, 0d0000000000000000, %p46;
+       bra.uni         BB22_64;
 
-BB22_41:
-       setp.ne.s32     %p14, %r13, 18;
-       @%p14 bra       BB22_76;
+BB22_33:
+       setp.ne.s32     %p14, %r9, 18;
+       @%p14 bra       BB22_64;
 
-       div.rn.f64      %fd55, %fd1, %fd2;
-       abs.f64         %fd37, %fd55;
+       div.rn.f64      %fd54, %fd1, %fd2;
+       abs.f64         %fd37, %fd54;
        setp.gtu.f64    %p35, %fd37, 0d7FF0000000000000;
-       @%p35 bra       BB22_76;
+       @%p35 bra       BB22_64;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r19}, %fd55;
+       mov.b64         {%r19, %temp}, %fd54;
        }
-       and.b32         %r20, %r19, 2147483647;
-       setp.ne.s32     %p36, %r20, 2146435072;
-       @%p36 bra       BB22_45;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r21, %temp}, %fd55;
+       mov.b64         {%temp, %r20}, %fd54;
        }
-       setp.eq.s32     %p37, %r21, 0;
-       @%p37 bra       BB22_76;
+       and.b32         %r21, %r20, 2147483647;
+       setp.ne.s32     %p36, %r21, 2146435072;
+       setp.ne.s32     %p37, %r19, 0;
+       or.pred         %p38, %p36, %p37;
+       @!%p38 bra      BB22_64;
+       bra.uni         BB22_36;
 
-BB22_45:
-       cvt.rmi.f64.f64 %fd55, %fd55;
-       bra.uni         BB22_76;
+BB22_36:
+       cvt.rmi.f64.f64 %fd54, %fd54;
+       bra.uni         BB22_64;
 
-BB22_61:
-       setp.gt.s32     %p56, %r8, -1;
-       @%p56 bra       BB22_64;
+BB22_51:
+       setp.gt.s32     %p58, %r4, -1;
+       @%p58 bra       BB22_54;
 
        cvt.rzi.f64.f64 %fd45, %fd2;
-       setp.neu.f64    %p57, %fd45, %fd2;
-       selp.f64        %fd53, 0dFFF8000000000000, %fd53, %p57;
+       setp.neu.f64    %p59, %fd45, %fd2;
+       selp.f64        %fd52, 0dFFF8000000000000, %fd52, %p59;
 
-BB22_64:
-       mov.f64         %fd25, %fd53;
+BB22_54:
+       mov.f64         %fd25, %fd52;
        add.f64         %fd26, %fd1, %fd2;
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r40}, %fd26;
        }
        and.b32         %r41, %r40, 2146435072;
-       setp.ne.s32     %p60, %r41, 2146435072;
-       mov.f64         %fd52, %fd25;
-       @%p60 bra       BB22_73;
+       setp.ne.s32     %p62, %r41, 2146435072;
+       mov.f64         %fd51, %fd25;
+       @%p62 bra       BB22_61;
 
-       setp.gtu.f64    %p61, %fd19, 0d7FF0000000000000;
-       mov.f64         %fd52, %fd26;
-       @%p61 bra       BB22_73;
-
-       abs.f64         %fd46, %fd2;
-       setp.gtu.f64    %p62, %fd46, 0d7FF0000000000000;
+       setp.gtu.f64    %p63, %fd19, 0d7FF0000000000000;
        mov.f64         %fd51, %fd26;
-       mov.f64         %fd52, %fd51;
-       @%p62 bra       BB22_73;
+       @%p63 bra       BB22_61;
 
-       and.b32         %r42, %r9, 2147483647;
-       setp.ne.s32     %p63, %r42, 2146435072;
-       @%p63 bra       BB22_69;
+       abs.f64         %fd46, %fd2;
+       setp.gtu.f64    %p64, %fd46, 0d7FF0000000000000;
+       mov.f64         %fd50, %fd26;
+       mov.f64         %fd51, %fd50;
+       @%p64 bra       BB22_61;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r43, %temp}, %fd2;
+       mov.b64         {%r42, %temp}, %fd2;
        }
-       setp.eq.s32     %p64, %r43, 0;
-       @%p64 bra       BB22_72;
+       and.b32         %r43, %r5, 2147483647;
+       setp.eq.s32     %p65, %r43, 2146435072;
+       setp.eq.s32     %p66, %r42, 0;
+       and.pred        %p67, %p65, %p66;
+       @%p67 bra       BB22_60;
+       bra.uni         BB22_58;
 
-BB22_69:
-       and.b32         %r44, %r8, 2147483647;
-       setp.ne.s32     %p65, %r44, 2146435072;
-       mov.f64         %fd49, %fd25;
-       mov.f64         %fd52, %fd49;
-       @%p65 bra       BB22_73;
+BB22_60:
+       setp.gt.f64     %p71, %fd19, 0d3FF0000000000000;
+       selp.b32        %r51, 2146435072, 0, %p71;
+       xor.b32         %r52, %r51, 2146435072;
+       setp.lt.s32     %p72, %r5, 0;
+       selp.b32        %r53, %r52, %r51, %p72;
+       setp.eq.f64     %p73, %fd1, 0dBFF0000000000000;
+       selp.b32        %r54, 1072693248, %r53, %p73;
+       mov.u32         %r55, 0;
+       mov.b64         %fd51, {%r55, %r54};
+       bra.uni         BB22_61;
 
+BB22_58:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r45, %temp}, %fd1;
+       mov.b64         {%r44, %temp}, %fd1;
        }
-       setp.ne.s32     %p66, %r45, 0;
-       mov.f64         %fd52, %fd25;
-       @%p66 bra       BB22_73;
+       and.b32         %r45, %r4, 2147483647;
+       setp.eq.s32     %p68, %r45, 2146435072;
+       setp.eq.s32     %p69, %r44, 0;
+       and.pred        %p70, %p68, %p69;
+       mov.f64         %fd51, %fd25;
+       @!%p70 bra      BB22_61;
+       bra.uni         BB22_59;
 
-       shr.s32         %r46, %r9, 31;
+BB22_59:
+       shr.s32         %r46, %r5, 31;
        and.b32         %r47, %r46, -2146435072;
-       add.s32         %r48, %r47, 2146435072;
-       or.b32          %r49, %r48, -2147483648;
-       selp.b32        %r50, %r49, %r48, %p1;
-       mov.u32         %r51, 0;
-       mov.b64         %fd52, {%r51, %r50};
-       bra.uni         BB22_73;
-
-BB22_72:
-       setp.gt.f64     %p67, %fd19, 0d3FF0000000000000;
-       selp.b32        %r52, 2146435072, 0, %p67;
-       xor.b32         %r53, %r52, 2146435072;
-       setp.lt.s32     %p68, %r9, 0;
-       selp.b32        %r54, %r53, %r52, %p68;
-       setp.eq.f64     %p69, %fd1, 0dBFF0000000000000;
-       selp.b32        %r55, 1072693248, %r54, %p69;
-       mov.u32         %r56, 0;
-       mov.b64         %fd52, {%r56, %r55};
-
-BB22_73:
-       setp.eq.f64     %p70, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p71, %fd1, 0d3FF0000000000000;
-       or.pred         %p72, %p71, %p70;
-       selp.f64        %fd55, 0d3FF0000000000000, %fd52, %p72;
+       selp.b32        %r48, -1048576, 2146435072, %p1;
+       add.s32         %r49, %r48, %r47;
+       mov.u32         %r50, 0;
+       mov.b64         %fd51, {%r50, %r49};
 
-BB22_76:
+BB22_61:
+       setp.eq.f64     %p74, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p75, %fd1, 0d3FF0000000000000;
+       or.pred         %p76, %p75, %p74;
+       selp.f64        %fd54, 0d3FF0000000000000, %fd51, %p76;
+
+BB22_64:
        cvta.to.global.u64      %rd16, %rd4;
        mul.wide.s32    %rd17, %r3, 8;
        add.s64         %rd18, %rd16, %rd17;
-       st.global.f64   [%rd18], %fd55;
+       st.global.f64   [%rd18], %fd54;
        bar.sync        0;
 
-BB22_77:
+BB22_65:
        ret;
 }
 
@@ -1769,114 +1744,91 @@ BB22_77:
 {
        .reg .pred      %p<76>;
        .reg .f32       %f<134>;
-       .reg .b32       %r<51>;
+       .reg .b32       %r<42>;
        .reg .b64       %rd<17>;
 
 
        ld.param.u64    %rd1, [matrix_matrix_cellwise_op_f_param_0];
        ld.param.u64    %rd2, [matrix_matrix_cellwise_op_f_param_1];
        ld.param.u64    %rd3, [matrix_matrix_cellwise_op_f_param_2];
-       ld.param.u32    %r12, [matrix_matrix_cellwise_op_f_param_3];
-       ld.param.u32    %r8, [matrix_matrix_cellwise_op_f_param_4];
-       ld.param.u32    %r9, [matrix_matrix_cellwise_op_f_param_5];
-       ld.param.u32    %r10, [matrix_matrix_cellwise_op_f_param_6];
-       ld.param.u32    %r11, [matrix_matrix_cellwise_op_f_param_7];
-       mov.u32         %r13, %ntid.x;
-       mov.u32         %r14, %ctaid.x;
-       mov.u32         %r15, %tid.x;
-       mad.lo.s32      %r16, %r13, %r14, %r15;
-       div.s32         %r1, %r16, %r8;
-       rem.s32         %r2, %r16, %r8;
-       setp.lt.s32     %p2, %r1, %r12;
-       setp.gt.s32     %p3, %r8, -1;
+       ld.param.u32    %r8, [matrix_matrix_cellwise_op_f_param_3];
+       ld.param.u32    %r4, [matrix_matrix_cellwise_op_f_param_4];
+       ld.param.u32    %r5, [matrix_matrix_cellwise_op_f_param_5];
+       ld.param.u32    %r6, [matrix_matrix_cellwise_op_f_param_6];
+       ld.param.u32    %r7, [matrix_matrix_cellwise_op_f_param_7];
+       mov.u32         %r9, %ntid.x;
+       mov.u32         %r10, %ctaid.x;
+       mov.u32         %r11, %tid.x;
+       mad.lo.s32      %r1, %r9, %r10, %r11;
+       div.s32         %r2, %r1, %r4;
+       setp.lt.s32     %p2, %r2, %r8;
+       setp.gt.s32     %p3, %r4, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB23_71;
+       @!%p4 bra       BB23_63;
        bra.uni         BB23_1;
 
 BB23_1:
-       mad.lo.s32      %r3, %r1, %r8, %r2;
-       setp.eq.s32     %p5, %r9, 1;
-       mov.u32         %r49, %r1;
-       @%p5 bra        BB23_5;
-
-       setp.ne.s32     %p6, %r9, 2;
-       mov.u32         %r50, %r3;
-       @%p6 bra        BB23_4;
-
-       mov.u32         %r50, %r2;
-
-BB23_4:
-       mov.u32         %r44, %r50;
-       mov.u32         %r4, %r44;
-       mov.u32         %r49, %r4;
-
-BB23_5:
-       mov.u32         %r5, %r49;
-       setp.eq.s32     %p7, %r10, 1;
-       mov.u32         %r47, %r1;
-       @%p7 bra        BB23_9;
-
-       setp.ne.s32     %p8, %r10, 2;
-       mov.u32         %r48, %r3;
-       @%p8 bra        BB23_8;
-
-       mov.u32         %r48, %r2;
-
-BB23_8:
-       mov.u32         %r47, %r48;
-
-BB23_9:
-       cvta.to.global.u64      %rd4, %rd2;
-       cvta.to.global.u64      %rd5, %rd1;
-       mul.wide.s32    %rd6, %r5, 4;
-       add.s64         %rd7, %rd5, %rd6;
-       ld.global.f32   %f1, [%rd7];
-       mul.wide.s32    %rd8, %r47, 4;
-       add.s64         %rd9, %rd4, %rd8;
+       rem.s32         %r12, %r1, %r4;
+       cvta.to.global.u64      %rd4, %rd1;
+       mad.lo.s32      %r3, %r2, %r4, %r12;
+       setp.eq.s32     %p5, %r5, 2;
+       selp.b32        %r13, %r12, %r3, %p5;
+       setp.eq.s32     %p6, %r5, 1;
+       selp.b32        %r14, %r2, %r13, %p6;
+       setp.eq.s32     %p7, %r6, 2;
+       selp.b32        %r15, %r12, %r3, %p7;
+       setp.eq.s32     %p8, %r6, 1;
+       selp.b32        %r16, %r2, %r15, %p8;
+       mul.wide.s32    %rd5, %r14, 4;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f32   %f1, [%rd6];
+       cvta.to.global.u64      %rd7, %rd2;
+       mul.wide.s32    %rd8, %r16, 4;
+       add.s64         %rd9, %rd7, %rd8;
        ld.global.f32   %f2, [%rd9];
        mov.f32         %f133, 0f7F7FFFFF;
-       setp.gt.s32     %p9, %r11, 8;
-       @%p9 bra        BB23_26;
+       setp.gt.s32     %p9, %r7, 8;
+       @%p9 bra        BB23_18;
 
-       setp.gt.s32     %p23, %r11, 3;
-       @%p23 bra       BB23_18;
+       setp.gt.s32     %p23, %r7, 3;
+       @%p23 bra       BB23_10;
 
-       setp.gt.s32     %p30, %r11, 1;
-       @%p30 bra       BB23_15;
+       setp.gt.s32     %p30, %r7, 1;
+       @%p30 bra       BB23_7;
 
-       setp.eq.s32     %p33, %r11, 0;
-       @%p33 bra       BB23_69;
-       bra.uni         BB23_13;
+       setp.eq.s32     %p33, %r7, 0;
+       @%p33 bra       BB23_61;
+       bra.uni         BB23_5;
 
-BB23_69:
+BB23_61:
        add.f32         %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_26:
-       setp.gt.s32     %p10, %r11, 13;
-       @%p10 bra       BB23_35;
+BB23_18:
+       setp.gt.s32     %p10, %r7, 13;
+       @%p10 bra       BB23_27;
 
-       setp.gt.s32     %p17, %r11, 10;
-       @%p17 bra       BB23_31;
+       setp.gt.s32     %p17, %r7, 10;
+       @%p17 bra       BB23_23;
 
-       setp.eq.s32     %p21, %r11, 9;
-       @%p21 bra       BB23_51;
-       bra.uni         BB23_29;
+       setp.eq.s32     %p21, %r7, 9;
+       @%p21 bra       BB23_43;
+       bra.uni         BB23_21;
 
-BB23_51:
+BB23_43:
        setp.eq.f32     %p44, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p44;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_18:
-       setp.gt.s32     %p24, %r11, 5;
-       @%p24 bra       BB23_22;
+BB23_10:
+       setp.gt.s32     %p24, %r7, 5;
+       @%p24 bra       BB23_14;
 
-       setp.eq.s32     %p28, %r11, 4;
-       @%p28 bra       BB23_54;
-       bra.uni         BB23_20;
+       setp.eq.s32     %p28, %r7, 4;
+       @%p28 bra       BB23_46;
+       bra.uni         BB23_12;
 
-BB23_54:
+BB23_46:
        mul.f32         %f53, %f2, 0f3F000000;
        cvt.rzi.f32.f32 %f54, %f53;
        fma.rn.f32      %f55, %f54, 0fC0000000, %f2;
@@ -1976,11 +1928,11 @@ BB23_54:
        setp.gt.f32     %p54, %f115, 0f42D20000;
        selp.f32        %f131, 0f7F800000, %f125, %p54;
        setp.eq.f32     %p55, %f131, 0f7F800000;
-       @%p55 bra       BB23_56;
+       @%p55 bra       BB23_48;
 
        fma.rn.f32      %f131, %f131, %f22, %f131;
 
-BB23_56:
+BB23_48:
        setp.lt.f32     %p56, %f1, 0f00000000;
        setp.eq.f32     %p57, %f19, 0f3F800000;
        and.pred        %p1, %p56, %p57;
@@ -1989,10 +1941,10 @@ BB23_56:
        mov.b32          %f126, %r30;
        selp.f32        %f132, %f126, %f131, %p1;
        setp.eq.f32     %p58, %f1, 0f00000000;
-       @%p58 bra       BB23_59;
-       bra.uni         BB23_57;
+       @%p58 bra       BB23_51;
+       bra.uni         BB23_49;
 
-BB23_59:
+BB23_51:
        add.f32         %f128, %f1, %f1;
        mov.b32          %r31, %f128;
        selp.b32        %r32, %r31, 0, %p57;
@@ -2000,17 +1952,17 @@ BB23_59:
        setp.lt.f32     %p62, %f2, 0f00000000;
        selp.b32        %r34, %r33, %r32, %p62;
        mov.b32          %f132, %r34;
-       bra.uni         BB23_60;
+       bra.uni         BB23_52;
 
-BB23_35:
-       setp.gt.s32     %p11, %r11, 15;
-       @%p11 bra       BB23_39;
+BB23_27:
+       setp.gt.s32     %p11, %r7, 15;
+       @%p11 bra       BB23_31;
 
-       setp.eq.s32     %p15, %r11, 14;
-       @%p15 bra       BB23_48;
-       bra.uni         BB23_37;
+       setp.eq.s32     %p15, %r7, 14;
+       @%p15 bra       BB23_40;
+       bra.uni         BB23_29;
 
-BB23_48:
+BB23_40:
        cvt.rni.s64.f32 %rd10, %f1;
        cvt.rni.s64.f32 %rd11, %f2;
        cvt.u32.u64     %r17, %rd10;
@@ -2018,126 +1970,126 @@ BB23_48:
        or.b32          %r19, %r18, %r17;
        setp.eq.s32     %p41, %r19, 0;
        selp.f32        %f133, 0f00000000, 0f3F800000, %p41;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_15:
-       setp.eq.s32     %p31, %r11, 2;
-       @%p31 bra       BB23_68;
-       bra.uni         BB23_16;
+BB23_7:
+       setp.eq.s32     %p31, %r7, 2;
+       @%p31 bra       BB23_60;
+       bra.uni         BB23_8;
 
-BB23_68:
+BB23_60:
        mul.f32         %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_31:
-       setp.eq.s32     %p18, %r11, 11;
-       @%p18 bra       BB23_50;
+BB23_23:
+       setp.eq.s32     %p18, %r7, 11;
+       @%p18 bra       BB23_42;
 
-       setp.eq.s32     %p19, %r11, 12;
-       @%p19 bra       BB23_49;
-       bra.uni         BB23_33;
+       setp.eq.s32     %p19, %r7, 12;
+       @%p19 bra       BB23_41;
+       bra.uni         BB23_25;
 
-BB23_49:
+BB23_41:
        max.f32         %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_22:
-       setp.eq.s32     %p25, %r11, 6;
-       @%p25 bra       BB23_53;
+BB23_14:
+       setp.eq.s32     %p25, %r7, 6;
+       @%p25 bra       BB23_45;
 
-       setp.eq.s32     %p26, %r11, 7;
-       @%p26 bra       BB23_52;
-       bra.uni         BB23_24;
+       setp.eq.s32     %p26, %r7, 7;
+       @%p26 bra       BB23_44;
+       bra.uni         BB23_16;
 
-BB23_52:
+BB23_44:
        setp.gt.f32     %p46, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p46;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_39:
-       setp.eq.s32     %p12, %r11, 16;
-       @%p12 bra       BB23_47;
+BB23_31:
+       setp.eq.s32     %p12, %r7, 16;
+       @%p12 bra       BB23_39;
 
-       setp.eq.s32     %p13, %r11, 17;
-       @%p13 bra       BB23_44;
-       bra.uni         BB23_41;
+       setp.eq.s32     %p13, %r7, 17;
+       @%p13 bra       BB23_36;
+       bra.uni         BB23_33;
 
-BB23_44:
+BB23_36:
        setp.eq.f32     %p36, %f2, 0f00000000;
        setp.eq.f32     %p37, %f2, 0f80000000;
        or.pred         %p38, %p36, %p37;
        mov.f32         %f133, 0f7FC00000;
-       @%p38 bra       BB23_70;
+       @%p38 bra       BB23_62;
 
        div.rn.f32      %f133, %f1, %f2;
        abs.f32         %f43, %f133;
        setp.geu.f32    %p39, %f43, 0f7F800000;
-       @%p39 bra       BB23_70;
+       @%p39 bra       BB23_62;
 
        cvt.rmi.f32.f32 %f44, %f133;
        mul.f32         %f45, %f2, %f44;
        sub.f32         %f133, %f1, %f45;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_13:
-       setp.eq.s32     %p34, %r11, 1;
-       @%p34 bra       BB23_14;
-       bra.uni         BB23_70;
+BB23_5:
+       setp.eq.s32     %p34, %r7, 1;
+       @%p34 bra       BB23_6;
+       bra.uni         BB23_62;
 
-BB23_14:
+BB23_6:
        sub.f32         %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_29:
-       setp.eq.s32     %p22, %r11, 10;
-       @%p22 bra       BB23_30;
-       bra.uni         BB23_70;
+BB23_21:
+       setp.eq.s32     %p22, %r7, 10;
+       @%p22 bra       BB23_22;
+       bra.uni         BB23_62;
 
-BB23_30:
+BB23_22:
        setp.neu.f32    %p43, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p43;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_20:
-       setp.eq.s32     %p29, %r11, 5;
-       @%p29 bra       BB23_21;
-       bra.uni         BB23_70;
+BB23_12:
+       setp.eq.s32     %p29, %r7, 5;
+       @%p29 bra       BB23_13;
+       bra.uni         BB23_62;
 
-BB23_21:
+BB23_13:
        setp.lt.f32     %p48, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p48;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_37:
-       setp.eq.s32     %p16, %r11, 15;
-       @%p16 bra       BB23_38;
-       bra.uni         BB23_70;
+BB23_29:
+       setp.eq.s32     %p16, %r7, 15;
+       @%p16 bra       BB23_30;
+       bra.uni         BB23_62;
 
-BB23_38:
+BB23_30:
        mul.f32         %f47, %f1, %f2;
        mov.f32         %f48, 0f3F800000;
        sub.f32         %f133, %f48, %f47;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_16:
-       setp.eq.s32     %p32, %r11, 3;
-       @%p32 bra       BB23_17;
-       bra.uni         BB23_70;
+BB23_8:
+       setp.eq.s32     %p32, %r7, 3;
+       @%p32 bra       BB23_9;
+       bra.uni         BB23_62;
 
-BB23_17:
+BB23_9:
        div.rn.f32      %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_50:
+BB23_42:
        min.f32         %f133, %f1, %f2;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_33:
-       setp.eq.s32     %p20, %r11, 13;
-       @%p20 bra       BB23_34;
-       bra.uni         BB23_70;
+BB23_25:
+       setp.eq.s32     %p20, %r7, 13;
+       @%p20 bra       BB23_26;
+       bra.uni         BB23_62;
 
-BB23_34:
+BB23_26:
        cvt.rni.s64.f32 %rd12, %f1;
        cvt.rni.s64.f32 %rd13, %f2;
        cvt.u32.u64     %r20, %rd12;
@@ -2145,71 +2097,71 @@ BB23_34:
        and.b32         %r22, %r21, %r20;
        setp.eq.s32     %p42, %r22, 0;
        selp.f32        %f133, 0f00000000, 0f3F800000, %p42;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_53:
-       setp.gtu.f32    %p47, %f1, %f2;
-       selp.f32        %f133, 0f00000000, 0f3F800000, %p47;
-       bra.uni         BB23_70;
+BB23_45:
+       setp.le.f32     %p47, %f1, %f2;
+       selp.f32        %f133, 0f3F800000, 0f00000000, %p47;
+       bra.uni         BB23_62;
 
-BB23_24:
-       setp.eq.s32     %p27, %r11, 8;
-       @%p27 bra       BB23_25;
-       bra.uni         BB23_70;
+BB23_16:
+       setp.eq.s32     %p27, %r7, 8;
+       @%p27 bra       BB23_17;
+       bra.uni         BB23_62;
 
-BB23_25:
-       setp.ltu.f32    %p45, %f1, %f2;
-       selp.f32        %f133, 0f00000000, 0f3F800000, %p45;
-       bra.uni         BB23_70;
+BB23_17:
+       setp.ge.f32     %p45, %f1, %f2;
+       selp.f32        %f133, 0f3F800000, 0f00000000, %p45;
+       bra.uni         BB23_62;
 
-BB23_47:
+BB23_39:
        setp.neu.f32    %p40, %f1, 0f00000000;
        sub.f32         %f46, %f1, %f2;
        selp.f32        %f133, %f46, 0f00000000, %p40;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_41:
-       setp.ne.s32     %p14, %r11, 18;
-       @%p14 bra       BB23_70;
+BB23_33:
+       setp.ne.s32     %p14, %r7, 18;
+       @%p14 bra       BB23_62;
 
        div.rn.f32      %f133, %f1, %f2;
        abs.f32         %f41, %f133;
        setp.geu.f32    %p35, %f41, 0f7F800000;
-       @%p35 bra       BB23_70;
+       @%p35 bra       BB23_62;
 
        cvt.rmi.f32.f32 %f133, %f133;
-       bra.uni         BB23_70;
+       bra.uni         BB23_62;
 
-BB23_57:
+BB23_49:
        setp.geu.f32    %p59, %f1, 0f00000000;
-       @%p59 bra       BB23_60;
+       @%p59 bra       BB23_52;
 
        cvt.rzi.f32.f32 %f127, %f2;
        setp.neu.f32    %p60, %f127, %f2;
        selp.f32        %f132, 0f7FFFFFFF, %f132, %p60;
 
-BB23_60:
+BB23_52:
        add.f32         %f129, %f20, %f21;
        mov.b32          %r35, %f129;
        setp.lt.s32     %p63, %r35, 2139095040;
-       @%p63 bra       BB23_67;
+       @%p63 bra       BB23_59;
 
        setp.gtu.f32    %p64, %f20, 0f7F800000;
        setp.gtu.f32    %p65, %f21, 0f7F800000;
        or.pred         %p66, %p64, %p65;
-       @%p66 bra       BB23_66;
-       bra.uni         BB23_62;
+       @%p66 bra       BB23_58;
+       bra.uni         BB23_54;
 
-BB23_66:
+BB23_58:
        add.f32         %f132, %f1, %f2;
-       bra.uni         BB23_67;
+       bra.uni         BB23_59;
 
-BB23_62:
+BB23_54:
        setp.eq.f32     %p67, %f21, 0f7F800000;
-       @%p67 bra       BB23_65;
-       bra.uni         BB23_63;
+       @%p67 bra       BB23_57;
+       bra.uni         BB23_55;
 
-BB23_65:
+BB23_57:
        setp.gt.f32     %p70, %f20, 0f3F800000;
        selp.b32        %r39, 2139095040, 0, %p70;
        xor.b32         %r40, %r39, 2139095040;
@@ -2218,32 +2170,32 @@ BB23_65:
        mov.b32          %f130, %r41;
        setp.eq.f32     %p72, %f1, 0fBF800000;
        selp.f32        %f132, 0f3F800000, %f130, %p72;
-       bra.uni         BB23_67;
+       bra.uni         BB23_59;
 
-BB23_63:
+BB23_55:
        setp.neu.f32    %p68, %f20, 0f7F800000;
-       @%p68 bra       BB23_67;
+       @%p68 bra       BB23_59;
 
-       setp.ltu.f32    %p69, %f2, 0f00000000;
-       selp.b32        %r36, 0, 2139095040, %p69;
+       setp.ge.f32     %p69, %f2, 0f00000000;
+       selp.b32        %r36, 2139095040, 0, %p69;
        or.b32          %r37, %r36, -2147483648;
        selp.b32        %r38, %r37, %r36, %p1;
        mov.b32          %f132, %r38;
 
-BB23_67:
+BB23_59:
        setp.eq.f32     %p73, %f2, 0f00000000;
        setp.eq.f32     %p74, %f1, 0f3F800000;
        or.pred         %p75, %p74, %p73;
        selp.f32        %f133, 0f3F800000, %f132, %p75;
 
-BB23_70:
+BB23_62:
        cvta.to.global.u64      %rd14, %rd3;
        mul.wide.s32    %rd15, %r3, 4;
        add.s64         %rd16, %rd14, %rd15;
        st.global.f32   [%rd16], %f133;
        bar.sync        0;
 
-BB23_71:
+BB23_63:
        ret;
 }
 
@@ -2257,9 +2209,9 @@ BB23_71:
        .param .u32 matrix_scalar_op_d_param_5
 )
 {
-       .reg .pred      %p<133>;
-       .reg .b32       %r<88>;
-       .reg .f64       %fd<109>;
+       .reg .pred      %p<141>;
+       .reg .b32       %r<86>;
+       .reg .f64       %fd<107>;
        .reg .b64       %rd<20>;
 
 
@@ -2274,7 +2226,7 @@ BB23_71:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r9, %r10, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB24_138;
+       @%p3 bra        BB24_130;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -2283,9 +2235,9 @@ BB23_71:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB24_70;
+       @%p4 bra        BB24_66;
 
-       mov.f64         %fd99, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 8;
        @%p5 bra        BB24_19;
 
@@ -2296,31 +2248,31 @@ BB23_71:
        @%p26 bra       BB24_8;
 
        setp.eq.s32     %p29, %r6, 0;
-       @%p29 bra       BB24_68;
+       @%p29 bra       BB24_64;
        bra.uni         BB24_6;
 
-BB24_68:
-       add.f64         %fd99, %fd1, %fd68;
-       bra.uni         BB24_69;
+BB24_64:
+       add.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB24_65;
 
-BB24_70:
-       mov.f64         %fd108, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p69, %r6, 8;
-       @%p69 bra       BB24_87;
+BB24_66:
+       mov.f64         %fd106, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p73, %r6, 8;
+       @%p73 bra       BB24_83;
 
-       setp.gt.s32     %p83, %r6, 3;
-       @%p83 bra       BB24_79;
+       setp.gt.s32     %p87, %r6, 3;
+       @%p87 bra       BB24_75;
 
-       setp.gt.s32     %p90, %r6, 1;
-       @%p90 bra       BB24_76;
+       setp.gt.s32     %p94, %r6, 1;
+       @%p94 bra       BB24_72;
 
-       setp.eq.s32     %p93, %r6, 0;
-       @%p93 bra       BB24_136;
-       bra.uni         BB24_74;
+       setp.eq.s32     %p97, %r6, 0;
+       @%p97 bra       BB24_128;
+       bra.uni         BB24_70;
 
-BB24_136:
-       add.f64         %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_128:
+       add.f64         %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
 BB24_19:
        setp.gt.s32     %p6, %r6, 13;
@@ -2330,39 +2282,39 @@ BB24_19:
        @%p13 bra       BB24_24;
 
        setp.eq.s32     %p17, %r6, 9;
-       @%p17 bra       BB24_48;
+       @%p17 bra       BB24_46;
        bra.uni         BB24_22;
 
-BB24_48:
-       setp.eq.f64     %p44, %fd1, %fd68;
-       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
-       bra.uni         BB24_69;
+BB24_46:
+       setp.eq.f64     %p46, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
+       bra.uni         BB24_65;
 
-BB24_87:
-       setp.gt.s32     %p70, %r6, 13;
-       @%p70 bra       BB24_96;
+BB24_83:
+       setp.gt.s32     %p74, %r6, 13;
+       @%p74 bra       BB24_92;
 
-       setp.gt.s32     %p77, %r6, 10;
-       @%p77 bra       BB24_92;
+       setp.gt.s32     %p81, %r6, 10;
+       @%p81 bra       BB24_88;
 
-       setp.eq.s32     %p81, %r6, 9;
-       @%p81 bra       BB24_116;
-       bra.uni         BB24_90;
+       setp.eq.s32     %p85, %r6, 9;
+       @%p85 bra       BB24_110;
+       bra.uni         BB24_86;
 
-BB24_116:
-       setp.eq.f64     %p108, %fd1, %fd68;
-       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
-       bra.uni         BB24_137;
+BB24_110:
+       setp.eq.f64     %p114, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
+       bra.uni         BB24_129;
 
 BB24_11:
        setp.gt.s32     %p20, %r6, 5;
        @%p20 bra       BB24_15;
 
        setp.eq.s32     %p24, %r6, 4;
-       @%p24 bra       BB24_51;
+       @%p24 bra       BB24_49;
        bra.uni         BB24_13;
 
-BB24_51:
+BB24_49:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd68;
@@ -2375,7 +2327,7 @@ BB24_51:
        add.s32         %r25, %r24, -1012;
        mov.b64          %rd14, %fd1;
        shl.b64         %rd2, %rd14, %r25;
-       setp.eq.s64     %p49, %rd2, -9223372036854775808;
+       setp.eq.s64     %p51, %rd2, -9223372036854775808;
        abs.f64         %fd18, %fd68;
        // Callseq Start 1
        {
@@ -2392,69 +2344,69 @@ BB24_51:
        param0, 
        param1
        );
-       ld.param.f64    %fd98, [retval0+0];
+       ld.param.f64    %fd97, [retval0+0];
        
        //{
        }// Callseq End 1
-       setp.lt.s32     %p50, %r2, 0;
-       and.pred        %p1, %p50, %p49;
-       @!%p1 bra       BB24_53;
-       bra.uni         BB24_52;
+       setp.lt.s32     %p52, %r2, 0;
+       and.pred        %p1, %p52, %p51;
+       @!%p1 bra       BB24_51;
+       bra.uni         BB24_50;
 
-BB24_52:
+BB24_50:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r26}, %fd98;
+       mov.b64         {%temp, %r26}, %fd97;
        }
        xor.b32         %r27, %r26, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r28, %temp}, %fd98;
+       mov.b64         {%r28, %temp}, %fd97;
        }
-       mov.b64         %fd98, {%r28, %r27};
+       mov.b64         %fd97, {%r28, %r27};
 
-BB24_53:
-       mov.f64         %fd97, %fd98;
-       setp.eq.f64     %p51, %fd68, 0d0000000000000000;
-       @%p51 bra       BB24_56;
-       bra.uni         BB24_54;
+BB24_51:
+       mov.f64         %fd96, %fd97;
+       setp.eq.f64     %p53, %fd68, 0d0000000000000000;
+       @%p53 bra       BB24_54;
+       bra.uni         BB24_52;
 
-BB24_56:
-       selp.b32        %r29, %r2, 0, %p49;
+BB24_54:
+       selp.b32        %r29, %r2, 0, %p51;
        or.b32          %r30, %r29, 2146435072;
-       setp.lt.s32     %p55, %r3, 0;
-       selp.b32        %r31, %r30, %r29, %p55;
+       setp.lt.s32     %p57, %r3, 0;
+       selp.b32        %r31, %r30, %r29, %p57;
        mov.u32         %r32, 0;
-       mov.b64         %fd97, {%r32, %r31};
-       bra.uni         BB24_57;
+       mov.b64         %fd96, {%r32, %r31};
+       bra.uni         BB24_55;
 
 BB24_28:
        setp.gt.s32     %p7, %r6, 15;
        @%p7 bra        BB24_32;
 
        setp.eq.s32     %p11, %r6, 14;
-       @%p11 bra       BB24_45;
+       @%p11 bra       BB24_43;
        bra.uni         BB24_30;
 
-BB24_45:
+BB24_43:
        cvt.rni.s64.f64 %rd10, %fd68;
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.u32.u64     %r18, %rd10;
        cvt.u32.u64     %r19, %rd11;
        or.b32          %r20, %r19, %r18;
-       setp.eq.s32     %p41, %r20, 0;
-       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
-       bra.uni         BB24_69;
+       setp.eq.s32     %p43, %r20, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
+       bra.uni         BB24_65;
 
-BB24_79:
-       setp.gt.s32     %p84, %r6, 5;
-       @%p84 bra       BB24_83;
+BB24_75:
+       setp.gt.s32     %p88, %r6, 5;
+       @%p88 bra       BB24_79;
 
-       setp.eq.s32     %p88, %r6, 4;
-       @%p88 bra       BB24_119;
-       bra.uni         BB24_81;
+       setp.eq.s32     %p92, %r6, 4;
+       @%p92 bra       BB24_113;
+       bra.uni         BB24_77;
 
-BB24_119:
+BB24_113:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -2463,11 +2415,11 @@ BB24_119:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd68;
        }
-       bfe.u32         %r62, %r5, 20, 11;
-       add.s32         %r63, %r62, -1012;
+       bfe.u32         %r61, %r5, 20, 11;
+       add.s32         %r62, %r61, -1012;
        mov.b64          %rd19, %fd68;
-       shl.b64         %rd3, %rd19, %r63;
-       setp.eq.s64     %p113, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd19, %r62;
+       setp.eq.s64     %p119, %rd3, -9223372036854775808;
        abs.f64         %fd51, %fd1;
        // Callseq Start 2
        {
@@ -2484,267 +2436,267 @@ BB24_119:
        param0, 
        param1
        );
-       ld.param.f64    %fd107, [retval0+0];
+       ld.param.f64    %fd105, [retval0+0];
        
        //{
        }// Callseq End 2
-       setp.lt.s32     %p114, %r4, 0;
-       and.pred        %p2, %p114, %p113;
-       @!%p2 bra       BB24_121;
-       bra.uni         BB24_120;
+       setp.lt.s32     %p120, %r4, 0;
+       and.pred        %p2, %p120, %p119;
+       @!%p2 bra       BB24_115;
+       bra.uni         BB24_114;
 
-BB24_120:
+BB24_114:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r64}, %fd107;
+       mov.b64         {%temp, %r63}, %fd105;
        }
-       xor.b32         %r65, %r64, -2147483648;
+       xor.b32         %r64, %r63, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r66, %temp}, %fd107;
+       mov.b64         {%r65, %temp}, %fd105;
        }
-       mov.b64         %fd107, {%r66, %r65};
+       mov.b64         %fd105, {%r65, %r64};
 
-BB24_121:
-       mov.f64         %fd106, %fd107;
-       setp.eq.f64     %p115, %fd1, 0d0000000000000000;
-       @%p115 bra      BB24_124;
-       bra.uni         BB24_122;
+BB24_115:
+       mov.f64         %fd104, %fd105;
+       setp.eq.f64     %p121, %fd1, 0d0000000000000000;
+       @%p121 bra      BB24_118;
+       bra.uni         BB24_116;
 
-BB24_124:
-       selp.b32        %r67, %r4, 0, %p113;
-       or.b32          %r68, %r67, 2146435072;
-       setp.lt.s32     %p119, %r5, 0;
-       selp.b32        %r69, %r68, %r67, %p119;
-       mov.u32         %r70, 0;
-       mov.b64         %fd106, {%r70, %r69};
-       bra.uni         BB24_125;
+BB24_118:
+       selp.b32        %r66, %r4, 0, %p119;
+       or.b32          %r67, %r66, 2146435072;
+       setp.lt.s32     %p125, %r5, 0;
+       selp.b32        %r68, %r67, %r66, %p125;
+       mov.u32         %r69, 0;
+       mov.b64         %fd104, {%r69, %r68};
+       bra.uni         BB24_119;
 
-BB24_96:
-       setp.gt.s32     %p71, %r6, 15;
-       @%p71 bra       BB24_100;
+BB24_92:
+       setp.gt.s32     %p75, %r6, 15;
+       @%p75 bra       BB24_96;
 
-       setp.eq.s32     %p75, %r6, 14;
-       @%p75 bra       BB24_113;
-       bra.uni         BB24_98;
+       setp.eq.s32     %p79, %r6, 14;
+       @%p79 bra       BB24_107;
+       bra.uni         BB24_94;
 
-BB24_113:
+BB24_107:
        cvt.rni.s64.f64 %rd15, %fd1;
        cvt.rni.s64.f64 %rd16, %fd68;
-       cvt.u32.u64     %r56, %rd15;
-       cvt.u32.u64     %r57, %rd16;
-       or.b32          %r58, %r57, %r56;
-       setp.eq.s32     %p105, %r58, 0;
-       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
-       bra.uni         BB24_137;
+       cvt.u32.u64     %r55, %rd15;
+       cvt.u32.u64     %r56, %rd16;
+       or.b32          %r57, %r56, %r55;
+       setp.eq.s32     %p111, %r57, 0;
+       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
+       bra.uni         BB24_129;
 
 BB24_8:
        setp.eq.s32     %p27, %r6, 2;
-       @%p27 bra       BB24_67;
+       @%p27 bra       BB24_63;
        bra.uni         BB24_9;
 
-BB24_67:
-       mul.f64         %fd99, %fd1, %fd68;
-       bra.uni         BB24_69;
+BB24_63:
+       mul.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB24_65;
 
 BB24_24:
        setp.eq.s32     %p14, %r6, 11;
-       @%p14 bra       BB24_47;
+       @%p14 bra       BB24_45;
 
        setp.eq.s32     %p15, %r6, 12;
-       @%p15 bra       BB24_46;
+       @%p15 bra       BB24_44;
        bra.uni         BB24_26;
 
-BB24_46:
-       max.f64         %fd99, %fd68, %fd1;
-       bra.uni         BB24_69;
+BB24_44:
+       max.f64         %fd98, %fd68, %fd1;
+       bra.uni         BB24_65;
 
 BB24_15:
        setp.eq.s32     %p21, %r6, 6;
-       @%p21 bra       BB24_50;
+       @%p21 bra       BB24_48;
 
        setp.eq.s32     %p22, %r6, 7;
-       @%p22 bra       BB24_49;
+       @%p22 bra       BB24_47;
        bra.uni         BB24_17;
 
-BB24_49:
-       setp.lt.f64     %p46, %fd1, %fd68;
-       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
-       bra.uni         BB24_69;
+BB24_47:
+       setp.lt.f64     %p48, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB24_65;
 
 BB24_32:
        setp.eq.s32     %p8, %r6, 16;
-       @%p8 bra        BB24_44;
+       @%p8 bra        BB24_42;
 
        setp.eq.s32     %p9, %r6, 17;
-       @%p9 bra        BB24_39;
+       @%p9 bra        BB24_38;
        bra.uni         BB24_34;
 
-BB24_39:
-       setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p35, %fd1, 0d8000000000000000;
-       or.pred         %p36, %p34, %p35;
-       mov.f64         %fd99, 0d7FF8000000000000;
-       @%p36 bra       BB24_69;
+BB24_38:
+       setp.eq.f64     %p35, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p36, %fd1, 0d8000000000000000;
+       or.pred         %p37, %p35, %p36;
+       mov.f64         %fd98, 0d7FF8000000000000;
+       @%p37 bra       BB24_65;
 
-       div.rn.f64      %fd99, %fd68, %fd1;
-       abs.f64         %fd72, %fd99;
-       setp.gtu.f64    %p37, %fd72, 0d7FF0000000000000;
-       @%p37 bra       BB24_69;
+       div.rn.f64      %fd98, %fd68, %fd1;
+       abs.f64         %fd72, %fd98;
+       setp.gtu.f64    %p38, %fd72, 0d7FF0000000000000;
+       @%p38 bra       BB24_65;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r15}, %fd99;
+       mov.b64         {%r15, %temp}, %fd98;
        }
-       and.b32         %r16, %r15, 2147483647;
-       setp.ne.s32     %p38, %r16, 2146435072;
-       @%p38 bra       BB24_43;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r17, %temp}, %fd99;
+       mov.b64         {%temp, %r16}, %fd98;
        }
-       setp.eq.s32     %p39, %r17, 0;
-       @%p39 bra       BB24_69;
-
-BB24_43:
-       cvt.rmi.f64.f64 %fd73, %fd99;
+       and.b32         %r17, %r16, 2147483647;
+       setp.ne.s32     %p39, %r17, 2146435072;
+       setp.ne.s32     %p40, %r15, 0;
+       or.pred         %p41, %p39, %p40;
+       @!%p41 bra      BB24_65;
+       bra.uni         BB24_41;
+
+BB24_41:
+       cvt.rmi.f64.f64 %fd73, %fd98;
        mul.f64         %fd74, %fd1, %fd73;
-       sub.f64         %fd99, %fd68, %fd74;
-       bra.uni         BB24_69;
+       sub.f64         %fd98, %fd68, %fd74;
+       bra.uni         BB24_65;
 
-BB24_76:
-       setp.eq.s32     %p91, %r6, 2;
-       @%p91 bra       BB24_135;
-       bra.uni         BB24_77;
+BB24_72:
+       setp.eq.s32     %p95, %r6, 2;
+       @%p95 bra       BB24_127;
+       bra.uni         BB24_73;
 
-BB24_135:
-       mul.f64         %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_127:
+       mul.f64         %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
-BB24_92:
-       setp.eq.s32     %p78, %r6, 11;
-       @%p78 bra       BB24_115;
+BB24_88:
+       setp.eq.s32     %p82, %r6, 11;
+       @%p82 bra       BB24_109;
 
-       setp.eq.s32     %p79, %r6, 12;
-       @%p79 bra       BB24_114;
-       bra.uni         BB24_94;
+       setp.eq.s32     %p83, %r6, 12;
+       @%p83 bra       BB24_108;
+       bra.uni         BB24_90;
 
-BB24_114:
-       max.f64         %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_108:
+       max.f64         %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
-BB24_83:
-       setp.eq.s32     %p85, %r6, 6;
-       @%p85 bra       BB24_118;
+BB24_79:
+       setp.eq.s32     %p89, %r6, 6;
+       @%p89 bra       BB24_112;
 
-       setp.eq.s32     %p86, %r6, 7;
-       @%p86 bra       BB24_117;
-       bra.uni         BB24_85;
+       setp.eq.s32     %p90, %r6, 7;
+       @%p90 bra       BB24_111;
+       bra.uni         BB24_81;
 
-BB24_117:
-       setp.gt.f64     %p110, %fd1, %fd68;
-       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
-       bra.uni         BB24_137;
+BB24_111:
+       setp.gt.f64     %p116, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
+       bra.uni         BB24_129;
 
-BB24_100:
-       setp.eq.s32     %p72, %r6, 16;
-       @%p72 bra       BB24_112;
+BB24_96:
+       setp.eq.s32     %p76, %r6, 16;
+       @%p76 bra       BB24_106;
 
-       setp.eq.s32     %p73, %r6, 17;
-       @%p73 bra       BB24_107;
-       bra.uni         BB24_102;
+       setp.eq.s32     %p77, %r6, 17;
+       @%p77 bra       BB24_102;
+       bra.uni         BB24_98;
 
-BB24_107:
-       setp.eq.f64     %p98, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p99, %fd68, 0d8000000000000000;
-       or.pred         %p100, %p98, %p99;
-       mov.f64         %fd108, 0d7FF8000000000000;
-       @%p100 bra      BB24_137;
+BB24_102:
+       setp.eq.f64     %p103, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p104, %fd68, 0d8000000000000000;
+       or.pred         %p105, %p103, %p104;
+       mov.f64         %fd106, 0d7FF8000000000000;
+       @%p105 bra      BB24_129;
 
-       div.rn.f64      %fd108, %fd1, %fd68;
-       abs.f64         %fd83, %fd108;
-       setp.gtu.f64    %p101, %fd83, 0d7FF0000000000000;
-       @%p101 bra      BB24_137;
+       div.rn.f64      %fd106, %fd1, %fd68;
+       abs.f64         %fd83, %fd106;
+       setp.gtu.f64    %p106, %fd83, 0d7FF0000000000000;
+       @%p106 bra      BB24_129;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r53}, %fd108;
+       mov.b64         {%r52, %temp}, %fd106;
        }
-       and.b32         %r54, %r53, 2147483647;
-       setp.ne.s32     %p102, %r54, 2146435072;
-       @%p102 bra      BB24_111;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r55, %temp}, %fd108;
+       mov.b64         {%temp, %r53}, %fd106;
        }
-       setp.eq.s32     %p103, %r55, 0;
-       @%p103 bra      BB24_137;
-
-BB24_111:
-       cvt.rmi.f64.f64 %fd84, %fd108;
+       and.b32         %r54, %r53, 2147483647;
+       setp.ne.s32     %p107, %r54, 2146435072;
+       setp.ne.s32     %p108, %r52, 0;
+       or.pred         %p109, %p107, %p108;
+       @!%p109 bra     BB24_129;
+       bra.uni         BB24_105;
+
+BB24_105:
+       cvt.rmi.f64.f64 %fd84, %fd106;
        mul.f64         %fd85, %fd84, %fd68;
-       sub.f64         %fd108, %fd1, %fd85;
-       bra.uni         BB24_137;
+       sub.f64         %fd106, %fd1, %fd85;
+       bra.uni         BB24_129;
 
 BB24_6:
        setp.eq.s32     %p30, %r6, 1;
        @%p30 bra       BB24_7;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_7:
-       sub.f64         %fd99, %fd68, %fd1;
-       bra.uni         BB24_69;
+       sub.f64         %fd98, %fd68, %fd1;
+       bra.uni         BB24_65;
 
 BB24_22:
        setp.eq.s32     %p18, %r6, 10;
        @%p18 bra       BB24_23;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_23:
-       setp.neu.f64    %p43, %fd1, %fd68;
-       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
-       bra.uni         BB24_69;
+       setp.neu.f64    %p45, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
+       bra.uni         BB24_65;
 
 BB24_13:
        setp.eq.s32     %p25, %r6, 5;
        @%p25 bra       BB24_14;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_14:
-       setp.gt.f64     %p48, %fd1, %fd68;
-       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
-       bra.uni         BB24_69;
+       setp.gt.f64     %p50, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
+       bra.uni         BB24_65;
 
 BB24_30:
        setp.eq.s32     %p12, %r6, 15;
        @%p12 bra       BB24_31;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_31:
        mul.f64         %fd76, %fd1, %fd68;
        mov.f64         %fd77, 0d3FF0000000000000;
-       sub.f64         %fd99, %fd77, %fd76;
-       bra.uni         BB24_69;
+       sub.f64         %fd98, %fd77, %fd76;
+       bra.uni         BB24_65;
 
 BB24_9:
        setp.eq.s32     %p28, %r6, 3;
        @%p28 bra       BB24_10;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_10:
-       div.rn.f64      %fd99, %fd68, %fd1;
-       bra.uni         BB24_69;
+       div.rn.f64      %fd98, %fd68, %fd1;
+       bra.uni         BB24_65;
 
-BB24_47:
-       min.f64         %fd99, %fd68, %fd1;
-       bra.uni         BB24_69;
+BB24_45:
+       min.f64         %fd98, %fd68, %fd1;
+       bra.uni         BB24_65;
 
 BB24_26:
        setp.eq.s32     %p16, %r6, 13;
        @%p16 bra       BB24_27;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_27:
        cvt.rni.s64.f64 %rd12, %fd68;
@@ -2752,348 +2704,344 @@ BB24_27:
        cvt.u32.u64     %r21, %rd12;
        cvt.u32.u64     %r22, %rd13;
        and.b32         %r23, %r22, %r21;
-       setp.eq.s32     %p42, %r23, 0;
-       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
-       bra.uni         BB24_69;
+       setp.eq.s32     %p44, %r23, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
+       bra.uni         BB24_65;
 
-BB24_50:
-       setp.ltu.f64    %p47, %fd1, %fd68;
-       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p47;
-       bra.uni         BB24_69;
+BB24_48:
+       setp.ge.f64     %p49, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
+       bra.uni         BB24_65;
 
 BB24_17:
        setp.eq.s32     %p23, %r6, 8;
        @%p23 bra       BB24_18;
-       bra.uni         BB24_69;
+       bra.uni         BB24_65;
 
 BB24_18:
-       setp.gtu.f64    %p45, %fd1, %fd68;
-       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p45;
-       bra.uni         BB24_69;
+       setp.le.f64     %p47, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
+       bra.uni         BB24_65;
 
-BB24_44:
-       setp.neu.f64    %p40, %fd68, 0d0000000000000000;
+BB24_42:
+       setp.neu.f64    %p42, %fd68, 0d0000000000000000;
        sub.f64         %fd75, %fd68, %fd1;
-       selp.f64        %fd99, %fd75, 0d0000000000000000, %p40;
-       bra.uni         BB24_69;
+       selp.f64        %fd98, %fd75, 0d0000000000000000, %p42;
+       bra.uni         BB24_65;
 
 BB24_34:
        setp.ne.s32     %p10, %r6, 18;
-       @%p10 bra       BB24_69;
+       @%p10 bra       BB24_65;
 
-       div.rn.f64      %fd99, %fd68, %fd1;
-       abs.f64         %fd70, %fd99;
+       div.rn.f64      %fd98, %fd68, %fd1;
+       abs.f64         %fd70, %fd98;
        setp.gtu.f64    %p31, %fd70, 0d7FF0000000000000;
-       @%p31 bra       BB24_69;
+       @%p31 bra       BB24_65;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r12}, %fd99;
+       mov.b64         {%r12, %temp}, %fd98;
        }
-       and.b32         %r13, %r12, 2147483647;
-       setp.ne.s32     %p32, %r13, 2146435072;
-       @%p32 bra       BB24_38;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r14, %temp}, %fd99;
+       mov.b64         {%temp, %r13}, %fd98;
        }
-       setp.eq.s32     %p33, %r14, 0;
-       @%p33 bra       BB24_69;
+       and.b32         %r14, %r13, 2147483647;
+       setp.ne.s32     %p32, %r14, 2146435072;
+       setp.ne.s32     %p33, %r12, 0;
+       or.pred         %p34, %p32, %p33;
+       @!%p34 bra      BB24_65;
+       bra.uni         BB24_37;
 
-BB24_38:
-       cvt.rmi.f64.f64 %fd99, %fd99;
-       bra.uni         BB24_69;
+BB24_37:
+       cvt.rmi.f64.f64 %fd98, %fd98;
+       bra.uni         BB24_65;
 
-BB24_74:
-       setp.eq.s32     %p94, %r6, 1;
-       @%p94 bra       BB24_75;
-       bra.uni         BB24_137;
+BB24_70:
+       setp.eq.s32     %p98, %r6, 1;
+       @%p98 bra       BB24_71;
+       bra.uni         BB24_129;
 
-BB24_75:
-       sub.f64         %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_71:
+       sub.f64         %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
-BB24_90:
-       setp.eq.s32     %p82, %r6, 10;
-       @%p82 bra       BB24_91;
-       bra.uni         BB24_137;
+BB24_86:
+       setp.eq.s32     %p86, %r6, 10;
+       @%p86 bra       BB24_87;
+       bra.uni         BB24_129;
 
-BB24_91:
-       setp.neu.f64    %p107, %fd1, %fd68;
-       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
-       bra.uni         BB24_137;
+BB24_87:
+       setp.neu.f64    %p113, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
+       bra.uni         BB24_129;
 
-BB24_81:
-       setp.eq.s32     %p89, %r6, 5;
-       @%p89 bra       BB24_82;
-       bra.uni         BB24_137;
+BB24_77:
+       setp.eq.s32     %p93, %r6, 5;
+       @%p93 bra       BB24_78;
+       bra.uni         BB24_129;
 
-BB24_82:
-       setp.lt.f64     %p112, %fd1, %fd68;
-       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
-       bra.uni         BB24_137;
+BB24_78:
+       setp.lt.f64     %p118, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
+       bra.uni         BB24_129;
 
-BB24_98:
-       setp.eq.s32     %p76, %r6, 15;
-       @%p76 bra       BB24_99;
-       bra.uni         BB24_137;
+BB24_94:
+       setp.eq.s32     %p80, %r6, 15;
+       @%p80 bra       BB24_95;
+       bra.uni         BB24_129;
 
-BB24_99:
+BB24_95:
        mul.f64         %fd87, %fd1, %fd68;
        mov.f64         %fd88, 0d3FF0000000000000;
-       sub.f64         %fd108, %fd88, %fd87;
-       bra.uni         BB24_137;
+       sub.f64         %fd106, %fd88, %fd87;
+       bra.uni         BB24_129;
 
-BB24_77:
-       setp.eq.s32     %p92, %r6, 3;
-       @%p92 bra       BB24_78;
-       bra.uni         BB24_137;
+BB24_73:
+       setp.eq.s32     %p96, %r6, 3;
+       @%p96 bra       BB24_74;
+       bra.uni         BB24_129;
 
-BB24_78:
-       div.rn.f64      %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_74:
+       div.rn.f64      %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
-BB24_115:
-       min.f64         %fd108, %fd1, %fd68;
-       bra.uni         BB24_137;
+BB24_109:
+       min.f64         %fd106, %fd1, %fd68;
+       bra.uni         BB24_129;
 
-BB24_94:
-       setp.eq.s32     %p80, %r6, 13;
-       @%p80 bra       BB24_95;
-       bra.uni         BB24_137;
+BB24_90:
+       setp.eq.s32     %p84, %r6, 13;
+       @%p84 bra       BB24_91;
+       bra.uni         BB24_129;
 
-BB24_95:
+BB24_91:
        cvt.rni.s64.f64 %rd17, %fd1;
        cvt.rni.s64.f64 %rd18, %fd68;
-       cvt.u32.u64     %r59, %rd17;
-       cvt.u32.u64     %r60, %rd18;
-       and.b32         %r61, %r60, %r59;
-       setp.eq.s32     %p106, %r61, 0;
-       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
-       bra.uni         BB24_137;
+       cvt.u32.u64     %r58, %rd17;
+       cvt.u32.u64     %r59, %rd18;
+       and.b32         %r60, %r59, %r58;
+       setp.eq.s32     %p112, %r60, 0;
+       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
+       bra.uni         BB24_129;
 
-BB24_118:
-       setp.gtu.f64    %p111, %fd1, %fd68;
-       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p111;
-       bra.uni         BB24_137;
+BB24_112:
+       setp.le.f64     %p117, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
+       bra.uni         BB24_129;
 
-BB24_85:
-       setp.eq.s32     %p87, %r6, 8;
-       @%p87 bra       BB24_86;
-       bra.uni         BB24_137;
+BB24_81:
+       setp.eq.s32     %p91, %r6, 8;
+       @%p91 bra       BB24_82;
+       bra.uni         BB24_129;
 
-BB24_86:
-       setp.ltu.f64    %p109, %fd1, %fd68;
-       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p109;
-       bra.uni         BB24_137;
+BB24_82:
+       setp.ge.f64     %p115, %fd1, %fd68;
+       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
+       bra.uni         BB24_129;
 
-BB24_112:
-       setp.neu.f64    %p104, %fd1, 0d0000000000000000;
+BB24_106:
+       setp.neu.f64    %p110, %fd1, 0d0000000000000000;
        sub.f64         %fd86, %fd1, %fd68;
-       selp.f64        %fd108, %fd86, 0d0000000000000000, %p104;
-       bra.uni         BB24_137;
+       selp.f64        %fd106, %fd86, 0d0000000000000000, %p110;
+       bra.uni         BB24_129;
 
-BB24_102:
-       setp.ne.s32     %p74, %r6, 18;
-       @%p74 bra       BB24_137;
+BB24_98:
+       setp.ne.s32     %p78, %r6, 18;
+       @%p78 bra       BB24_129;
 
-       div.rn.f64      %fd108, %fd1, %fd68;
-       abs.f64         %fd81, %fd108;
-       setp.gtu.f64    %p95, %fd81, 0d7FF0000000000000;
-       @%p95 bra       BB24_137;
+       div.rn.f64      %fd106, %fd1, %fd68;
+       abs.f64         %fd81, %fd106;
+       setp.gtu.f64    %p99, %fd81, 0d7FF0000000000000;
+       @%p99 bra       BB24_129;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r50}, %fd108;
+       mov.b64         {%r49, %temp}, %fd106;
        }
-       and.b32         %r51, %r50, 2147483647;
-       setp.ne.s32     %p96, %r51, 2146435072;
-       @%p96 bra       BB24_106;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r52, %temp}, %fd108;
+       mov.b64         {%temp, %r50}, %fd106;
        }
-       setp.eq.s32     %p97, %r52, 0;
-       @%p97 bra       BB24_137;
+       and.b32         %r51, %r50, 2147483647;
+       setp.ne.s32     %p100, %r51, 2146435072;
+       setp.ne.s32     %p101, %r49, 0;
+       or.pred         %p102, %p100, %p101;
+       @!%p102 bra     BB24_129;
+       bra.uni         BB24_101;
 
-BB24_106:
-       cvt.rmi.f64.f64 %fd108, %fd108;
-       bra.uni         BB24_137;
+BB24_101:
+       cvt.rmi.f64.f64 %fd106, %fd106;
+       bra.uni         BB24_129;
 
-BB24_54:
-       setp.gt.s32     %p52, %r2, -1;
-       @%p52 bra       BB24_57;
+BB24_52:
+       setp.gt.s32     %p54, %r2, -1;
+       @%p54 bra       BB24_55;
 
        cvt.rzi.f64.f64 %fd78, %fd1;
-       setp.neu.f64    %p53, %fd78, %fd1;
-       selp.f64        %fd97, 0dFFF8000000000000, %fd97, %p53;
+       setp.neu.f64    %p55, %fd78, %fd1;
+       selp.f64        %fd96, 0dFFF8000000000000, %fd96, %p55;
 
-BB24_57:
-       mov.f64         %fd24, %fd97;
+BB24_55:
+       mov.f64         %fd24, %fd96;
        add.f64         %fd25, %fd1, %fd68;
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r33}, %fd25;
        }
        and.b32         %r34, %r33, 2146435072;
-       setp.ne.s32     %p56, %r34, 2146435072;
-       mov.f64         %fd96, %fd24;
-       @%p56 bra       BB24_66;
-
-       setp.gtu.f64    %p57, %fd18, 0d7FF0000000000000;
-       mov.f64         %fd96, %fd25;
-       @%p57 bra       BB24_66;
+       setp.ne.s32     %p58, %r34, 2146435072;
+       mov.f64         %fd95, %fd24;
+       @%p58 bra       BB24_62;
 
-       abs.f64         %fd79, %fd1;
-       setp.gtu.f64    %p58, %fd79, 0d7FF0000000000000;
+       setp.gtu.f64    %p59, %fd18, 0d7FF0000000000000;
        mov.f64         %fd95, %fd25;
-       mov.f64         %fd96, %fd95;
-       @%p58 bra       BB24_66;
-
-       and.b32         %r35, %r3, 2147483647;
-       setp.ne.s32     %p59, %r35, 2146435072;
        @%p59 bra       BB24_62;
 
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r36, %temp}, %fd1;
-       }
-       setp.eq.s32     %p60, %r36, 0;
-       @%p60 bra       BB24_65;
-
-BB24_62:
-       and.b32         %r37, %r2, 2147483647;
-       setp.ne.s32     %p61, %r37, 2146435072;
-       mov.f64         %fd93, %fd24;
-       mov.f64         %fd96, %fd93;
-       @%p61 bra       BB24_66;
+       abs.f64         %fd79, %fd1;
+       setp.gtu.f64    %p60, %fd79, 0d7FF0000000000000;
+       mov.f64         %fd94, %fd25;
+       mov.f64         %fd95, %fd94;
+       @%p60 bra       BB24_62;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r38, %temp}, %fd68;
+       mov.b64         {%r35, %temp}, %fd1;
        }
-       setp.ne.s32     %p62, %r38, 0;
-       mov.f64         %fd96, %fd24;
-       @%p62 bra       BB24_66;
+       and.b32         %r36, %r3, 2147483647;
+       setp.eq.s32     %p61, %r36, 2146435072;
+       setp.eq.s32     %p62, %r35, 0;
+       and.pred        %p63, %p61, %p62;
+       @%p63 bra       BB24_61;
+       bra.uni         BB24_59;
+
+BB24_61:
+       setp.gt.f64     %p67, %fd18, 0d3FF0000000000000;
+       selp.b32        %r44, 2146435072, 0, %p67;
+       xor.b32         %r45, %r44, 2146435072;
+       setp.lt.s32     %p68, %r3, 0;
+       selp.b32        %r46, %r45, %r44, %p68;
+       setp.eq.f64     %p69, %fd68, 0dBFF0000000000000;
+       selp.b32        %r47, 1072693248, %r46, %p69;
+       mov.u32         %r48, 0;
+       mov.b64         %fd95, {%r48, %r47};
+       bra.uni         BB24_62;
 
-       shr.s32         %r39, %r3, 31;
-       and.b32         %r40, %r39, -2146435072;
-       add.s32         %r41, %r40, 2146435072;
-       or.b32          %r42, %r41, -2147483648;
-       selp.b32        %r43, %r42, %r41, %p1;
-       mov.u32         %r44, 0;
-       mov.b64         %fd96, {%r44, %r43};
-       bra.uni         BB24_66;
-
-BB24_122:
-       setp.gt.s32     %p116, %r4, -1;
-       @%p116 bra      BB24_125;
+BB24_116:
+       setp.gt.s32     %p122, %r4, -1;
+       @%p122 bra      BB24_119;
 
        cvt.rzi.f64.f64 %fd89, %fd68;
-       setp.neu.f64    %p117, %fd89, %fd68;
-       selp.f64        %fd106, 0dFFF8000000000000, %fd106, %p117;
+       setp.neu.f64    %p123, %fd89, %fd68;
+       selp.f64        %fd104, 0dFFF8000000000000, %fd104, %p123;
 
-BB24_125:
-       mov.f64         %fd57, %fd106;
+BB24_119:
+       mov.f64         %fd57, %fd104;
        add.f64         %fd58, %fd1, %fd68;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r71}, %fd58;
+       mov.b64         {%temp, %r70}, %fd58;
        }
-       and.b32         %r72, %r71, 2146435072;
-       setp.ne.s32     %p120, %r72, 2146435072;
-       mov.f64         %fd105, %fd57;
-       @%p120 bra      BB24_134;
+       and.b32         %r71, %r70, 2146435072;
+       setp.ne.s32     %p126, %r71, 2146435072;
+       mov.f64         %fd103, %fd57;
+       @%p126 bra      BB24_126;
 
-       setp.gtu.f64    %p121, %fd51, 0d7FF0000000000000;
-       mov.f64         %fd105, %fd58;
-       @%p121 bra      BB24_134;
+       setp.gtu.f64    %p127, %fd51, 0d7FF0000000000000;
+       mov.f64         %fd103, %fd58;
+       @%p127 bra      BB24_126;
 
        abs.f64         %fd90, %fd68;
-       setp.gtu.f64    %p122, %fd90, 0d7FF0000000000000;
-       mov.f64         %fd104, %fd58;
-       mov.f64         %fd105, %fd104;
-       @%p122 bra      BB24_134;
-
-       and.b32         %r73, %r5, 2147483647;
-       setp.ne.s32     %p123, %r73, 2146435072;
-       @%p123 bra      BB24_130;
+       setp.gtu.f64    %p128, %fd90, 0d7FF0000000000000;
+       mov.f64         %fd102, %fd58;
+       mov.f64         %fd103, %fd102;
+       @%p128 bra      BB24_126;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r74, %temp}, %fd68;
+       mov.b64         {%r72, %temp}, %fd68;
        }
-       setp.eq.s32     %p124, %r74, 0;
-       @%p124 bra      BB24_133;
-
-BB24_130:
-       and.b32         %r75, %r4, 2147483647;
-       setp.ne.s32     %p125, %r75, 2146435072;
-       mov.f64         %fd102, %fd57;
-       mov.f64         %fd105, %fd102;
-       @%p125 bra      BB24_134;
+       and.b32         %r73, %r5, 2147483647;
+       setp.eq.s32     %p129, %r73, 2146435072;
+       setp.eq.s32     %p130, %r72, 0;
+       and.pred        %p131, %p129, %p130;
+       @%p131 bra      BB24_125;
+       bra.uni         BB24_123;
 
+BB24_125:
+       setp.gt.f64     %p135, %fd51, 0d3FF0000000000000;
+       selp.b32        %r81, 2146435072, 0, %p135;
+       xor.b32         %r82, %r81, 2146435072;
+       setp.lt.s32     %p136, %r5, 0;
+       selp.b32        %r83, %r82, %r81, %p136;
+       setp.eq.f64     %p137, %fd1, 0dBFF0000000000000;
+       selp.b32        %r84, 1072693248, %r83, %p137;
+       mov.u32         %r85, 0;
+       mov.b64         %fd103, {%r85, %r84};
+       bra.uni         BB24_126;
+
+BB24_59:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r76, %temp}, %fd1;
+       mov.b64         {%r37, %temp}, %fd68;
        }
-       setp.ne.s32     %p126, %r76, 0;
-       mov.f64         %fd105, %fd57;
-       @%p126 bra      BB24_134;
+       and.b32         %r38, %r2, 2147483647;
+       setp.eq.s32     %p64, %r38, 2146435072;
+       setp.eq.s32     %p65, %r37, 0;
+       and.pred        %p66, %p64, %p65;
+       mov.f64         %fd95, %fd24;
+       @!%p66 bra      BB24_62;
+       bra.uni         BB24_60;
+
+BB24_60:
+       shr.s32         %r39, %r3, 31;
+       and.b32         %r40, %r39, -2146435072;
+       selp.b32        %r41, -1048576, 2146435072, %p1;
+       add.s32         %r42, %r41, %r40;
+       mov.u32         %r43, 0;
+       mov.b64         %fd95, {%r43, %r42};
 
-       shr.s32         %r77, %r5, 31;
-       and.b32         %r78, %r77, -2146435072;
-       add.s32         %r79, %r78, 2146435072;
-       or.b32          %r80, %r79, -2147483648;
-       selp.b32        %r81, %r80, %r79, %p2;
-       mov.u32         %r82, 0;
-       mov.b64         %fd105, {%r82, %r81};
-       bra.uni         BB24_134;
+BB24_62:
+       setp.eq.f64     %p70, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p71, %fd68, 0d3FF0000000000000;
+       or.pred         %p72, %p71, %p70;
+       selp.f64        %fd98, 0d3FF0000000000000, %fd95, %p72;
 
 BB24_65:
-       setp.gt.f64     %p63, %fd18, 0d3FF0000000000000;
-       selp.b32        %r45, 2146435072, 0, %p63;
-       xor.b32         %r46, %r45, 2146435072;
-       setp.lt.s32     %p64, %r3, 0;
-       selp.b32        %r47, %r46, %r45, %p64;
-       setp.eq.f64     %p65, %fd68, 0dBFF0000000000000;
-       selp.b32        %r48, 1072693248, %r47, %p65;
-       mov.u32         %r49, 0;
-       mov.b64         %fd96, {%r49, %r48};
-
-BB24_66:
-       setp.eq.f64     %p66, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p67, %fd68, 0d3FF0000000000000;
-       or.pred         %p68, %p67, %p66;
-       selp.f64        %fd99, 0d3FF0000000000000, %fd96, %p68;
-
-BB24_69:
-       st.global.f64   [%rd1], %fd99;
-       bra.uni         BB24_138;
-
-BB24_133:
-       setp.gt.f64     %p127, %fd51, 0d3FF0000000000000;
-       selp.b32        %r83, 2146435072, 0, %p127;
-       xor.b32         %r84, %r83, 2146435072;
-       setp.lt.s32     %p128, %r5, 0;
-       selp.b32        %r85, %r84, %r83, %p128;
-       setp.eq.f64     %p129, %fd1, 0dBFF0000000000000;
-       selp.b32        %r86, 1072693248, %r85, %p129;
-       mov.u32         %r87, 0;
-       mov.b64         %fd105, {%r87, %r86};
+       st.global.f64   [%rd1], %fd98;
+       bra.uni         BB24_130;
 
-BB24_134:
-       setp.eq.f64     %p130, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p131, %fd1, 0d3FF0000000000000;
-       or.pred         %p132, %p131, %p130;
-       selp.f64        %fd108, 0d3FF0000000000000, %fd105, %p132;
+BB24_123:
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r74, %temp}, %fd1;
+       }
+       and.b32         %r75, %r4, 2147483647;
+       setp.eq.s32     %p132, %r75, 2146435072;
+       setp.eq.s32     %p133, %r74, 0;
+       and.pred        %p134, %p132, %p133;
+       mov.f64         %fd103, %fd57;
+       @!%p134 bra     BB24_126;
+       bra.uni         BB24_124;
 
-BB24_137:
-       st.global.f64   [%rd1], %fd108;
+BB24_124:
+       shr.s32         %r76, %r5, 31;
+       and.b32         %r77, %r76, -2146435072;
+       selp.b32        %r78, -1048576, 2146435072, %p2;
+       add.s32         %r79, %r78, %r77;
+       mov.u32         %r80, 0;
+       mov.b64         %fd103, {%r80, %r79};
+
+BB24_126:
+       setp.eq.f64     %p138, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p139, %fd1, 0d3FF0000000000000;
+       or.pred         %p140, %p139, %p138;
+       selp.f64        %fd106, 0d3FF0000000000000, %fd103, %p140;
+
+BB24_129:
+       st.global.f64   [%rd1], %fd106;
 
-BB24_138:
+BB24_130:
        bar.sync        0;
        ret;
 }
@@ -3698,8 +3646,8 @@ BB25_27:
        bra.uni         BB25_63;
 
 BB25_46:
-       setp.gtu.f32    %p43, %f1, %f2;
-       selp.f32        %f261, 0f00000000, 0f3F800000, %p43;
+       setp.le.f32     %p43, %f1, %f2;
+       selp.f32        %f261, 0f3F800000, 0f00000000, %p43;
        bra.uni         BB25_63;
 
 BB25_17:
@@ -3708,8 +3656,8 @@ BB25_17:
        bra.uni         BB25_63;
 
 BB25_18:
-       setp.ltu.f32    %p41, %f1, %f2;
-       selp.f32        %f261, 0f00000000, 0f3F800000, %p41;
+       setp.ge.f32     %p41, %f1, %f2;
+       selp.f32        %f261, 0f3F800000, 0f00000000, %p41;
        bra.uni         BB25_63;
 
 BB25_40:
@@ -3799,8 +3747,8 @@ BB25_89:
        bra.uni         BB25_125;
 
 BB25_108:
-       setp.gtu.f32    %p110, %f2, %f1;
-       selp.f32        %f264, 0f00000000, 0f3F800000, %p110;
+       setp.le.f32     %p110, %f2, %f1;
+       selp.f32        %f264, 0f3F800000, 0f00000000, %p110;
        bra.uni         BB25_125;
 
 BB25_79:
@@ -3809,8 +3757,8 @@ BB25_79:
        bra.uni         BB25_125;
 
 BB25_80:
-       setp.ltu.f32    %p108, %f2, %f1;
-       selp.f32        %f264, 0f00000000, 0f3F800000, %p108;
+       setp.ge.f32     %p108, %f2, %f1;
+       selp.f32        %f264, 0f3F800000, 0f00000000, %p108;
        bra.uni         BB25_125;
 
 BB25_102:
@@ -3915,8 +3863,8 @@ BB25_56:
        setp.neu.f32    %p64, %f20, 0f7F800000;
        @%p64 bra       BB25_60;
 
-       setp.ltu.f32    %p65, %f2, 0f00000000;
-       selp.b32        %r27, 0, 2139095040, %p65;
+       setp.ge.f32     %p65, %f2, 0f00000000;
+       selp.b32        %r27, 2139095040, 0, %p65;
        or.b32          %r28, %r27, -2147483648;
        selp.b32        %r29, %r28, %r27, %p1;
        mov.b32          %f260, %r29;
@@ -3935,8 +3883,8 @@ BB25_118:
        setp.neu.f32    %p131, %f57, 0f7F800000;
        @%p131 bra      BB25_122;
 
-       setp.ltu.f32    %p132, %f1, 0f00000000;
-       selp.b32        %r52, 0, 2139095040, %p132;
+       setp.ge.f32     %p132, %f1, 0f00000000;
+       selp.b32        %r52, 2139095040, 0, %p132;
        or.b32          %r53, %r52, -2147483648;
        selp.b32        %r54, %r53, %r52, %p2;
        mov.b32          %f263, %r54;
@@ -8171,15 +8119,15 @@ BB60_4:
        mov.u64         %rd24, 0;
        @%p5 bra        BB61_6;
 
-       setp.eq.s32     %p6, %r3, 189;
+       setp.ne.s32     %p6, %r3, 189;
        mov.u32         %r14, 64;
        sub.s32         %r15, %r14, %r4;
        shl.b64         %rd14, %rd25, %r15;
        cvt.u64.u32     %rd15, %r4;
-       selp.b64        %rd16, 0, %rd15, %p6;
+       selp.b64        %rd16, %rd15, 0, %p6;
        cvt.u32.u64     %r16, %rd16;
        shr.u64         %rd24, %rd25, %r16;
-       selp.b64        %rd25, 0, %rd14, %p6;
+       selp.b64        %rd25, %rd14, 0, %p6;
 
 BB61_6:
        shr.u64         %rd17, %rd25, 63;
@@ -8455,16 +8403,16 @@ BB64_9:
        setp.lt.f32     %p2, %f5, 0f00800000;
        mul.f32         %f6, %f5, 0f4B000000;
        selp.f32        %f1, %f6, %f5, %p2;
-       selp.f32        %f7, 0fC1B80000, 0f00000000, %p2;
        mov.b32          %r6, %f1;
        add.s32         %r7, %r6, -1059760811;
        and.b32         %r8, %r7, -8388608;
        sub.s32         %r9, %r6, %r8;
-       mov.b32          %f8, %r9;
-       cvt.rn.f32.s32  %f9, %r8;
+       mov.b32          %f7, %r9;
+       cvt.rn.f32.s32  %f8, %r8;
+       selp.f32        %f9, 0fC1B80000, 0f00000000, %p2;
        mov.f32         %f10, 0f34000000;
-       fma.rn.f32      %f11, %f9, %f10, %f7;
-       add.f32         %f12, %f8, 0fBF800000;
+       fma.rn.f32      %f11, %f8, %f10, %f9;
+       add.f32         %f12, %f7, 0fBF800000;
        mov.f32         %f13, 0f3E1039F6;
        mov.f32         %f14, 0fBE055027;
        fma.rn.f32      %f15, %f14, %f12, %f13;
@@ -8658,7 +8606,7 @@ BB69_2:
        .local .align 4 .b8     __local_depot70[4];
        .reg .b64       %SP;
        .reg .b64       %SPL;
-       .reg .pred      %p<7>;
+       .reg .pred      %p<9>;
        .reg .b32       %r<18>;
        .reg .f64       %fd<41>;
        .reg .b64       %rd<17>;
@@ -8676,7 +8624,7 @@ BB69_2:
        mov.u32         %r8, %tid.x;
        mad.lo.s32      %r1, %r6, %r7, %r8;
        setp.ge.u32     %p1, %r1, %r5;
-       @%p1 bra        BB70_11;
+       @%p1 bra        BB70_10;
 
        cvta.to.global.u64      %rd6, %rd3;
        cvt.s64.s32     %rd2, %r1;
@@ -8685,23 +8633,24 @@ BB69_2:
        ld.global.f64   %fd38, [%rd8];
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r9}, %fd38;
+       mov.b64         {%r9, %temp}, %fd38;
        }
-       and.b32         %r10, %r9, 2147483647;
-       setp.ne.s32     %p2, %r10, 2146435072;
-       @%p2 bra        BB70_4;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r11, %temp}, %fd38;
+       mov.b64         {%temp, %r10}, %fd38;
        }
-       setp.ne.s32     %p3, %r11, 0;
-       @%p3 bra        BB70_4;
+       and.b32         %r11, %r10, 2147483647;
+       setp.eq.s32     %p2, %r11, 2146435072;
+       setp.eq.s32     %p3, %r9, 0;
+       and.pred        %p4, %p2, %p3;
+       @!%p4 bra       BB70_3;
+       bra.uni         BB70_2;
 
+BB70_2:
        mov.f64         %fd14, 0d0000000000000000;
        mul.rn.f64      %fd38, %fd38, %fd14;
 
-BB70_4:
+BB70_3:
        mul.f64         %fd15, %fd38, 0d3FE45F306DC9C883;
        cvt.rni.s32.f64 %r17, %fd15;
        st.local.u32    [%rd1], %r17;
@@ -8718,8 +8667,8 @@ BB70_4:
        mov.b64         {%temp, %r12}, %fd38;
        }
        and.b32         %r13, %r12, 2145386496;
-       setp.lt.u32     %p4, %r13, 1105199104;
-       @%p4 bra        BB70_6;
+       setp.lt.u32     %p5, %r13, 1105199104;
+       @%p5 bra        BB70_5;
 
        // Callseq Start 3
        {
@@ -8742,11 +8691,11 @@ BB70_4:
        }// Callseq End 3
        ld.local.u32    %r17, [%rd1];
 
-BB70_6:
+BB70_5:
        and.b32         %r14, %r17, 1;
        shl.b32         %r15, %r14, 3;
-       setp.eq.s32     %p5, %r14, 0;
-       selp.f64        %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+       setp.eq.b32     %p6, %r14, 1;
+       selp.f64        %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
        mul.wide.u32    %rd10, %r15, 8;
        mov.u64         %rd11, __cudart_sin_cos_coeffs;
        add.s64         %rd12, %rd10, %rd11;
@@ -8764,27 +8713,28 @@ BB70_6:
        ld.const.f64    %fd34, [%rd12+48];
        fma.rn.f64      %fd8, %fd33, %fd7, %fd34;
        fma.rn.f64      %fd40, %fd8, %fd39, %fd39;
-       @%p5 bra        BB70_8;
+       setp.eq.s32     %p7, %r14, 0;
+       @%p7 bra        BB70_7;
 
        mov.f64         %fd35, 0d3FF0000000000000;
        fma.rn.f64      %fd40, %fd8, %fd7, %fd35;
 
-BB70_8:
+BB70_7:
        and.b32         %r16, %r17, 2;
-       setp.eq.s32     %p6, %r16, 0;
-       @%p6 bra        BB70_10;
+       setp.eq.s32     %p8, %r16, 0;
+       @%p8 bra        BB70_9;
 
        mov.f64         %fd36, 0d0000000000000000;
        mov.f64         %fd37, 0dBFF0000000000000;
        fma.rn.f64      %fd40, %fd40, %fd37, %fd36;
 
-BB70_10:
+BB70_9:
        cvta.to.global.u64      %rd13, %rd4;
        shl.b64         %rd14, %rd2, 3;
        add.s64         %rd15, %rd13, %rd14;
        st.global.f64   [%rd15], %fd40;
 
-BB70_11:
+BB70_10:
        ret;
 }
 
@@ -8800,7 +8750,7 @@ BB70_11:
        .reg .b64       %SPL;
        .reg .pred      %p<15>;
        .reg .f32       %f<48>;
-       .reg .b32       %r<97>;
+       .reg .b32       %r<95>;
        .reg .b64       %rd<22>;
 
 
@@ -8808,13 +8758,13 @@ BB70_11:
        cvta.local.u64  %SP, %rd21;
        ld.param.u64    %rd8, [matrix_sin_f_param_0];
        ld.param.u64    %rd9, [matrix_sin_f_param_1];
-       ld.param.u32    %r37, [matrix_sin_f_param_2];
-       mov.u32         %r38, %ntid.x;
-       mov.u32         %r39, %ctaid.x;
-       mov.u32         %r40, %tid.x;
-       mad.lo.s32      %r1, %r38, %r39, %r40;
-       setp.ge.u32     %p1, %r1, %r37;
-       @%p1 bra        BB71_24;
+       ld.param.u32    %r30, [matrix_sin_f_param_2];
+       mov.u32         %r31, %ntid.x;
+       mov.u32         %r32, %ctaid.x;
+       mov.u32         %r33, %tid.x;
+       mad.lo.s32      %r1, %r31, %r32, %r33;
+       setp.ge.u32     %p1, %r1, %r30;
+       @%p1 bra        BB71_22;
 
        cvta.to.global.u64      %rd10, %rd8;
        cvt.s64.s32     %rd1, %r1;
@@ -8832,8 +8782,8 @@ BB70_11:
 
 BB71_3:
        mul.f32         %f21, %f43, 0f3F22F983;
-       cvt.rni.s32.f32 %r96, %f21;
-       cvt.rn.f32.s32  %f22, %r96;
+       cvt.rni.s32.f32 %r94, %f21;
+       cvt.rn.f32.s32  %f22, %r94;
        neg.f32         %f23, %f22;
        mov.f32         %f24, 0f3FC90FDA;
        fma.rn.f32      %f25, %f23, %f24, %f43;
@@ -8843,12 +8793,12 @@ BB71_3:
        fma.rn.f32      %f44, %f23, %f28, %f27;
        abs.f32         %f29, %f43;
        setp.leu.f32    %p3, %f29, 0f47CE4780;
-       @%p3 bra        BB71_13;
+       @%p3 bra        BB71_11;
 
        mov.b32          %r3, %f43;
        shr.u32         %r4, %r3, 23;
-       shl.b32         %r43, %r3, 8;
-       or.b32          %r5, %r43, -2147483648;
+       shl.b32         %r36, %r3, 8;
+       or.b32          %r5, %r36, -2147483648;
        mov.u32         %r88, 0;
        mov.u64         %rd19, __cudart_i2opi_f;
        mov.u32         %r87, -6;
@@ -8857,14 +8807,14 @@ BB71_3:
 BB71_5:
        .pragma "nounroll";
        mov.u64         %rd4, %rd20;
-       ld.const.u32    %r46, [%rd19];
+       ld.const.u32    %r39, [%rd19];
        // inline asm
        {
-       mad.lo.cc.u32   %r44, %r46, %r5, %r88;
-       madc.hi.u32     %r88, %r46, %r5,  0;
+       mad.lo.cc.u32   %r37, %r39, %r5, %r88;
+       madc.hi.u32     %r88, %r39, %r5,  0;
        }
        // inline asm
-       st.local.u32    [%rd4], %r44;
+       st.local.u32    [%rd4], %r37;
        add.s64         %rd5, %rd4, 4;
        add.s64         %rd19, %rd19, 4;
        add.s32         %r87, %r87, 1;
@@ -8872,14 +8822,14 @@ BB71_5:
        mov.u64         %rd20, %rd5;
        @%p4 bra        BB71_5;
 
-       and.b32         %r49, %r4, 255;
-       add.s32         %r50, %r49, -128;
-       shr.u32         %r51, %r50, 5;
+       and.b32         %r42, %r4, 255;
+       add.s32         %r43, %r42, -128;
+       shr.u32         %r44, %r43, 5;
        and.b32         %r10, %r3, -2147483648;
        st.local.u32    [%rd2+24], %r88;
-       mov.u32         %r52, 6;
-       sub.s32         %r53, %r52, %r51;
-       mul.wide.s32    %rd15, %r53, 4;
+       mov.u32         %r45, 6;
+       sub.s32         %r46, %r45, %r44;
+       mul.wide.s32    %rd15, %r46, 4;
        add.s64         %rd7, %rd2, %rd15;
        ld.local.u32    %r89, [%rd7];
        ld.local.u32    %r90, [%rd7+-4];
@@ -8887,67 +8837,65 @@ BB71_5:
        setp.eq.s32     %p5, %r13, 0;
        @%p5 bra        BB71_8;
 
-       mov.u32         %r54, 32;
-       sub.s32         %r55, %r54, %r13;
-       shr.u32         %r56, %r90, %r55;
-       shl.b32         %r57, %r89, %r13;
-       add.s32         %r89, %r56, %r57;
-       ld.local.u32    %r58, [%rd7+-8];
-       shr.u32         %r59, %r58, %r55;
-       shl.b32         %r60, %r90, %r13;
-       add.s32         %r90, %r59, %r60;
+       mov.u32         %r47, 32;
+       sub.s32         %r48, %r47, %r13;
+       shr.u32         %r49, %r90, %r48;
+       shl.b32         %r50, %r89, %r13;
+       add.s32         %r89, %r49, %r50;
+       ld.local.u32    %r51, [%rd7+-8];
+       shr.u32         %r52, %r51, %r48;
+       shl.b32         %r53, %r90, %r13;
+       add.s32         %r90, %r52, %r53;
 
 BB71_8:
-       shr.u32         %r61, %r90, 30;
-       shl.b32         %r62, %r89, 2;
-       add.s32         %r91, %r61, %r62;
+       shr.u32         %r54, %r90, 30;
+       shl.b32         %r55, %r89, 2;
+       add.s32         %r91, %r54, %r55;
        shl.b32         %r19, %r90, 2;
-       shr.u32         %r63, %r91, 31;
-       shr.u32         %r64, %r89, 30;
-       add.s32         %r20, %r63, %r64;
-       setp.eq.s32     %p6, %r63, 0;
+       shr.u32         %r56, %r91, 31;
+       shr.u32         %r57, %r89, 30;
+       add.s32         %r20, %r56, %r57;
+       setp.eq.s32     %p6, %r56, 0;
        mov.u32         %r92, %r10;
        mov.u32         %r93, %r19;
        @%p6 bra        BB71_10;
 
-       not.b32         %r65, %r91;
+       not.b32         %r58, %r91;
        neg.s32         %r21, %r19;
        setp.eq.s32     %p7, %r19, 0;
-       selp.u32        %r66, 1, 0, %p7;
-       add.s32         %r91, %r66, %r65;
+       selp.u32        %r59, 1, 0, %p7;
+       add.s32         %r91, %r59, %r58;
        xor.b32         %r23, %r10, -2147483648;
        mov.u32         %r92, %r23;
        mov.u32         %r93, %r21;
 
 BB71_10:
        mov.u32         %r25, %r92;
-       neg.s32         %r67, %r20;
-       setp.eq.s32     %p8, %r10, 0;
-       selp.b32        %r96, %r20, %r67, %p8;
-       clz.b32         %r95, %r91;
-       setp.eq.s32     %p9, %r95, 0;
-       shl.b32         %r68, %r91, %r95;
-       mov.u32         %r69, 32;
-       sub.s32         %r70, %r69, %r95;
-       shr.u32         %r71, %r93, %r70;
-       add.s32         %r72, %r71, %r68;
-       selp.b32        %r29, %r91, %r72, %p9;
-       mov.u32         %r73, -921707870;
-       mul.hi.u32      %r94, %r29, %r73;
-       setp.lt.s32     %p10, %r94, 1;
-       @%p10 bra       BB71_12;
-
-       mul.lo.s32      %r74, %r29, -921707870;
-       shr.u32         %r75, %r74, 31;
-       shl.b32         %r76, %r94, 1;
-       add.s32         %r94, %r75, %r76;
-       add.s32         %r95, %r95, 1;
-
-BB71_12:
-       mov.u32         %r77, 126;
-       sub.s32         %r78, %r77, %r95;
+       neg.s32         %r60, %r20;
+       setp.ne.s32     %p8, %r10, 0;
+       selp.b32        %r94, %r60, %r20, %p8;
+       clz.b32         %r61, %r91;
+       setp.ne.s32     %p9, %r61, 0;
+       shl.b32         %r62, %r91, %r61;
+       mov.u32         %r63, 32;
+       sub.s32         %r64, %r63, %r61;
+       shr.u32         %r65, %r93, %r64;
+       add.s32         %r66, %r65, %r62;
+       selp.b32        %r67, %r66, %r91, %p9;
+       mul.lo.s32      %r68, %r67, -921707870;
+       mov.u32         %r69, -921707870;
+       mul.hi.u32      %r70, %r67, %r69;
+       setp.gt.s32     %p10, %r70, 0;
+       shl.b32         %r71, %r70, 1;
+       shr.u32         %r72, %r68, 31;
+       add.s32         %r73, %r72, %r71;
+       selp.b32        %r74, %r73, %r70, %p10;
+       selp.b32        %r75, -1, 0, %p10;
+       mov.u32         %r76, 126;
+       sub.s32         %r77, %r76, %r61;
+       add.s32         %r78, %r77, %r75;
        shl.b32         %r79, %r78, 23;
-       add.s32         %r80, %r94, 1;
+       add.s32         %r80, %r74, 1;
        shr.u32         %r81, %r80, 7;
        add.s32         %r82, %r81, 1;
        shr.u32         %r83, %r82, 1;
@@ -8955,60 +8903,60 @@ BB71_12:
        or.b32          %r85, %r84, %r25;
        mov.b32          %f44, %r85;
 
-BB71_13:
+BB71_11:
        mul.rn.f32      %f7, %f44, %f44;
-       and.b32         %r36, %r96, 1;
-       setp.eq.s32     %p11, %r36, 0;
-       @%p11 bra       BB71_15;
+       and.b32         %r29, %r94, 1;
+       setp.eq.s32     %p11, %r29, 0;
+       @%p11 bra       BB71_13;
 
        mov.f32         %f30, 0fBAB6061A;
        mov.f32         %f31, 0f37CCF5CE;
        fma.rn.f32      %f45, %f31, %f7, %f30;
-       bra.uni         BB71_16;
+       bra.uni         BB71_14;
 
-BB71_15:
+BB71_13:
        mov.f32         %f32, 0f3C08839E;
        mov.f32         %f33, 0fB94CA1F9;
        fma.rn.f32      %f45, %f33, %f7, %f32;
 
-BB71_16:
-       @%p11 bra       BB71_18;
+BB71_14:
+       @%p11 bra       BB71_16;
 
        mov.f32         %f34, 0f3D2AAAA5;
        fma.rn.f32      %f35, %f45, %f7, %f34;
        mov.f32         %f36, 0fBF000000;
        fma.rn.f32      %f46, %f35, %f7, %f36;
-       bra.uni         BB71_19;
+       bra.uni         BB71_17;
 
-BB71_18:
+BB71_16:
        mov.f32         %f37, 0fBE2AAAA3;
        fma.rn.f32      %f38, %f45, %f7, %f37;
        mov.f32         %f39, 0f00000000;
        fma.rn.f32      %f46, %f38, %f7, %f39;
 
-BB71_19:
+BB71_17:
        fma.rn.f32      %f47, %f46, %f44, %f44;
-       @%p11 bra       BB71_21;
+       @%p11 bra       BB71_19;
 
        mov.f32         %f40, 0f3F800000;
        fma.rn.f32      %f47, %f46, %f7, %f40;
 
-BB71_21:
-       and.b32         %r86, %r96, 2;
+BB71_19:
+       and.b32         %r86, %r94, 2;
        setp.eq.s32     %p14, %r86, 0;
-       @%p14 bra       BB71_23;
+       @%p14 bra       BB71_21;
 
        mov.f32         %f41, 0f00000000;
        mov.f32         %f42, 0fBF800000;
        fma.rn.f32      %f47, %f47, %f42, %f41;
 
-BB71_23:
+BB71_21:
        cvta.to.global.u64      %rd16, %rd9;
        shl.b64         %rd17, %rd1, 2;
        add.s64         %rd18, %rd16, %rd17;
        st.global.f32   [%rd18], %f47;
 
-BB71_24:
+BB71_22:
        ret;
 }
 
@@ -9139,8 +9087,8 @@ BB72_2:
        fma.rn.f64      %fd48, %fd47, %fd45, %fd46;
        div.rn.f64      %fd49, %fd45, %fd48;
        add.f64         %fd50, %fd49, %fd45;
-       setp.ltu.f64    %p6, %fd1, 0d408633CE8FB9F87E;
-       selp.f64        %fd67, %fd50, 0d7FF0000000000000, %p6;
+       setp.ge.f64     %p6, %fd1, 0d408633CE8FB9F87E;
+       selp.f64        %fd67, 0d7FF0000000000000, %fd50, %p6;
 
 BB72_4:
        cvta.to.global.u64      %rd7, %rd3;
@@ -9229,8 +9177,8 @@ BB73_2:
        mov.f32         %f20, 0f40000000;
        fma.rn.f32      %f21, %f20, %f16, %f19;
        mov.b32          %r6, %f21;
-       setp.ltu.f32    %p3, %f2, 0f42B40000;
-       selp.b32        %r7, %r6, 2139095040, %p3;
+       setp.ge.f32     %p3, %f2, 0f42B40000;
+       selp.b32        %r7, 2139095040, %r6, %p3;
        mov.b32          %r8, %f1;
        and.b32         %r9, %r8, -2147483648;
        or.b32          %r10, %r7, %r9;
@@ -9256,7 +9204,7 @@ BB73_5:
        .local .align 4 .b8     __local_depot74[4];
        .reg .b64       %SP;
        .reg .b64       %SPL;
-       .reg .pred      %p<7>;
+       .reg .pred      %p<9>;
        .reg .b32       %r<19>;
        .reg .f64       %fd<41>;
        .reg .b64       %rd<17>;
@@ -9274,7 +9222,7 @@ BB73_5:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB74_11;
+       @%p1 bra        BB74_10;
 
        cvta.to.global.u64      %rd6, %rd3;
        cvt.s64.s32     %rd2, %r1;
@@ -9283,23 +9231,24 @@ BB73_5:
        ld.global.f64   %fd38, [%rd8];
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r10}, %fd38;
+       mov.b64         {%r10, %temp}, %fd38;
        }
-       and.b32         %r11, %r10, 2147483647;
-       setp.ne.s32     %p2, %r11, 2146435072;
-       @%p2 bra        BB74_4;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r12, %temp}, %fd38;
+       mov.b64         {%temp, %r11}, %fd38;
        }
-       setp.ne.s32     %p3, %r12, 0;
-       @%p3 bra        BB74_4;
+       and.b32         %r12, %r11, 2147483647;
+       setp.eq.s32     %p2, %r12, 2146435072;
+       setp.eq.s32     %p3, %r10, 0;
+       and.pred        %p4, %p2, %p3;
+       @!%p4 bra       BB74_3;
+       bra.uni         BB74_2;
 
+BB74_2:
        mov.f64         %fd14, 0d0000000000000000;
        mul.rn.f64      %fd38, %fd38, %fd14;
 
-BB74_4:
+BB74_3:
        mul.f64         %fd15, %fd38, 0d3FE45F306DC9C883;
        cvt.rni.s32.f64 %r18, %fd15;
        st.local.u32    [%rd1], %r18;
@@ -9316,8 +9265,8 @@ BB74_4:
        mov.b64         {%temp, %r13}, %fd38;
        }
        and.b32         %r14, %r13, 2145386496;
-       setp.lt.u32     %p4, %r14, 1105199104;
-       @%p4 bra        BB74_6;
+       setp.lt.u32     %p5, %r14, 1105199104;
+       @%p5 bra        BB74_5;
 
        // Callseq Start 4
        {
@@ -9340,12 +9289,12 @@ BB74_4:
        }// Callseq End 4
        ld.local.u32    %r18, [%rd1];
 
-BB74_6:
+BB74_5:
        add.s32         %r5, %r18, 1;
        and.b32         %r15, %r5, 1;
        shl.b32         %r16, %r15, 3;
-       setp.eq.s32     %p5, %r15, 0;
-       selp.f64        %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+       setp.eq.b32     %p6, %r15, 1;
+       selp.f64        %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
        mul.wide.u32    %rd10, %r16, 8;
        mov.u64         %rd11, __cudart_sin_cos_coeffs;
        add.s64         %rd12, %rd10, %rd11;
@@ -9363,27 +9312,28 @@ BB74_6:
        ld.const.f64    %fd34, [%rd12+48];
        fma.rn.f64      %fd8, %fd33, %fd7, %fd34;
        fma.rn.f64      %fd40, %fd8, %fd39, %fd39;
-       @%p5 bra        BB74_8;
+       setp.eq.s32     %p7, %r15, 0;
+       @%p7 bra        BB74_7;
 
        mov.f64         %fd35, 0d3FF0000000000000;
        fma.rn.f64      %fd40, %fd8, %fd7, %fd35;
 
-BB74_8:
+BB74_7:
        and.b32         %r17, %r5, 2;
-       setp.eq.s32     %p6, %r17, 0;
-       @%p6 bra        BB74_10;
+       setp.eq.s32     %p8, %r17, 0;
+       @%p8 bra        BB74_9;
 
        mov.f64         %fd36, 0d0000000000000000;
        mov.f64         %fd37, 0dBFF0000000000000;
        fma.rn.f64      %fd40, %fd40, %fd37, %fd36;
 
-BB74_10:
+BB74_9:
        cvta.to.global.u64      %rd13, %rd4;
        shl.b64         %rd14, %rd2, 3;
        add.s64         %rd15, %rd13, %rd14;
        st.global.f64   [%rd15], %fd40;
 
-BB74_11:
+BB74_10:
        ret;
 }
 
@@ -9399,7 +9349,7 @@ BB74_11:
        .reg .b64       %SPL;
        .reg .pred      %p<15>;
        .reg .f32       %f<48>;
-       .reg .b32       %r<98>;
+       .reg .b32       %r<96>;
        .reg .b64       %rd<22>;
 
 
@@ -9407,13 +9357,13 @@ BB74_11:
        cvta.local.u64  %SP, %rd21;
        ld.param.u64    %rd8, [matrix_cos_f_param_0];
        ld.param.u64    %rd9, [matrix_cos_f_param_1];
-       ld.param.u32    %r38, [matrix_cos_f_param_2];
-       mov.u32         %r39, %ntid.x;
-       mov.u32         %r40, %ctaid.x;
-       mov.u32         %r41, %tid.x;
-       mad.lo.s32      %r1, %r39, %r40, %r41;
-       setp.ge.u32     %p1, %r1, %r38;
-       @%p1 bra        BB75_24;
+       ld.param.u32    %r31, [matrix_cos_f_param_2];
+       mov.u32         %r32, %ntid.x;
+       mov.u32         %r33, %ctaid.x;
+       mov.u32         %r34, %tid.x;
+       mad.lo.s32      %r1, %r32, %r33, %r34;
+       setp.ge.u32     %p1, %r1, %r31;
+       @%p1 bra        BB75_22;
 
        cvta.to.global.u64      %rd10, %rd8;
        cvt.s64.s32     %rd1, %r1;
@@ -9431,8 +9381,8 @@ BB74_11:
 
 BB75_3:
        mul.f32         %f21, %f43, 0f3F22F983;
-       cvt.rni.s32.f32 %r97, %f21;
-       cvt.rn.f32.s32  %f22, %r97;
+       cvt.rni.s32.f32 %r95, %f21;
+       cvt.rn.f32.s32  %f22, %r95;
        neg.f32         %f23, %f22;
        mov.f32         %f24, 0f3FC90FDA;
        fma.rn.f32      %f25, %f23, %f24, %f43;
@@ -9442,12 +9392,12 @@ BB75_3:
        fma.rn.f32      %f44, %f23, %f28, %f27;
        abs.f32         %f29, %f43;
        setp.leu.f32    %p3, %f29, 0f47CE4780;
-       @%p3 bra        BB75_13;
+       @%p3 bra        BB75_11;
 
        mov.b32          %r3, %f43;
        shr.u32         %r4, %r3, 23;
-       shl.b32         %r44, %r3, 8;
-       or.b32          %r5, %r44, -2147483648;
+       shl.b32         %r37, %r3, 8;
+       or.b32          %r5, %r37, -2147483648;
        mov.u32         %r89, 0;
        mov.u64         %rd19, __cudart_i2opi_f;
        mov.u32         %r88, -6;
@@ -9456,14 +9406,14 @@ BB75_3:
 BB75_5:
        .pragma "nounroll";
        mov.u64         %rd4, %rd20;
-       ld.const.u32    %r47, [%rd19];
+       ld.const.u32    %r40, [%rd19];
        // inline asm
        {
-       mad.lo.cc.u32   %r45, %r47, %r5, %r89;
-       madc.hi.u32     %r89, %r47, %r5,  0;
+       mad.lo.cc.u32   %r38, %r40, %r5, %r89;
+       madc.hi.u32     %r89, %r40, %r5,  0;
        }
        // inline asm
-       st.local.u32    [%rd4], %r45;
+       st.local.u32    [%rd4], %r38;
        add.s64         %rd5, %rd4, 4;
        add.s64         %rd19, %rd19, 4;
        add.s32         %r88, %r88, 1;
@@ -9471,14 +9421,14 @@ BB75_5:
        mov.u64         %rd20, %rd5;
        @%p4 bra        BB75_5;
 
-       and.b32         %r50, %r4, 255;
-       add.s32         %r51, %r50, -128;
-       shr.u32         %r52, %r51, 5;
+       and.b32         %r43, %r4, 255;
+       add.s32         %r44, %r43, -128;
+       shr.u32         %r45, %r44, 5;
        and.b32         %r10, %r3, -2147483648;
        st.local.u32    [%rd2+24], %r89;
-       mov.u32         %r53, 6;
-       sub.s32         %r54, %r53, %r52;
-       mul.wide.s32    %rd15, %r54, 4;
+       mov.u32         %r46, 6;
+       sub.s32         %r47, %r46, %r45;
+       mul.wide.s32    %rd15, %r47, 4;
        add.s64         %rd7, %rd2, %rd15;
        ld.local.u32    %r90, [%rd7];
        ld.local.u32    %r91, [%rd7+-4];
@@ -9486,67 +9436,65 @@ BB75_5:
        setp.eq.s32     %p5, %r13, 0;
        @%p5 bra        BB75_8;
 
-       mov.u32         %r55, 32;
-       sub.s32         %r56, %r55, %r13;
-       shr.u32         %r57, %r91, %r56;
-       shl.b32         %r58, %r90, %r13;
-       add.s32         %r90, %r57, %r58;
-       ld.local.u32    %r59, [%rd7+-8];
-       shr.u32         %r60, %r59, %r56;
-       shl.b32         %r61, %r91, %r13;
-       add.s32         %r91, %r60, %r61;
+       mov.u32         %r48, 32;
+       sub.s32         %r49, %r48, %r13;
+       shr.u32         %r50, %r91, %r49;
+       shl.b32         %r51, %r90, %r13;
+       add.s32         %r90, %r50, %r51;
+       ld.local.u32    %r52, [%rd7+-8];
+       shr.u32         %r53, %r52, %r49;
+       shl.b32         %r54, %r91, %r13;
+       add.s32         %r91, %r53, %r54;
 
 BB75_8:
-       shr.u32         %r62, %r91, 30;
-       shl.b32         %r63, %r90, 2;
-       add.s32         %r92, %r62, %r63;
+       shr.u32         %r55, %r91, 30;
+       shl.b32         %r56, %r90, 2;
+       add.s32         %r92, %r55, %r56;
        shl.b32         %r19, %r91, 2;
-       shr.u32         %r64, %r92, 31;
-       shr.u32         %r65, %r90, 30;
-       add.s32         %r20, %r64, %r65;
-       setp.eq.s32     %p6, %r64, 0;
+       shr.u32         %r57, %r92, 31;
+       shr.u32         %r58, %r90, 30;
+       add.s32         %r20, %r57, %r58;
+       setp.eq.s32     %p6, %r57, 0;
        mov.u32         %r93, %r10;
        mov.u32         %r94, %r19;
        @%p6 bra        BB75_10;
 
-       not.b32         %r66, %r92;
+       not.b32         %r59, %r92;
        neg.s32         %r21, %r19;
        setp.eq.s32     %p7, %r19, 0;
-       selp.u32        %r67, 1, 0, %p7;
-       add.s32         %r92, %r67, %r66;
+       selp.u32        %r60, 1, 0, %p7;
+       add.s32         %r92, %r60, %r59;
        xor.b32         %r23, %r10, -2147483648;
        mov.u32         %r93, %r23;
        mov.u32         %r94, %r21;
 
 BB75_10:
        mov.u32         %r25, %r93;
-       neg.s32         %r68, %r20;
-       setp.eq.s32     %p8, %r10, 0;
-       selp.b32        %r97, %r20, %r68, %p8;
-       clz.b32         %r96, %r92;
-       setp.eq.s32     %p9, %r96, 0;
-       shl.b32         %r69, %r92, %r96;
-       mov.u32         %r70, 32;
-       sub.s32         %r71, %r70, %r96;
-       shr.u32         %r72, %r94, %r71;
-       add.s32         %r73, %r72, %r69;
-       selp.b32        %r29, %r92, %r73, %p9;
-       mov.u32         %r74, -921707870;
-       mul.hi.u32      %r95, %r29, %r74;
-       setp.lt.s32     %p10, %r95, 1;
-       @%p10 bra       BB75_12;
-
-       mul.lo.s32      %r75, %r29, -921707870;
-       shr.u32         %r76, %r75, 31;
-       shl.b32         %r77, %r95, 1;
-       add.s32         %r95, %r76, %r77;
-       add.s32         %r96, %r96, 1;
-
-BB75_12:
-       mov.u32         %r78, 126;
-       sub.s32         %r79, %r78, %r96;
+       neg.s32         %r61, %r20;
+       setp.ne.s32     %p8, %r10, 0;
+       selp.b32        %r95, %r61, %r20, %p8;
+       clz.b32         %r62, %r92;
+       setp.ne.s32     %p9, %r62, 0;
+       shl.b32         %r63, %r92, %r62;
+       mov.u32         %r64, 32;
+       sub.s32         %r65, %r64, %r62;
+       shr.u32         %r66, %r94, %r65;
+       add.s32         %r67, %r66, %r63;
+       selp.b32        %r68, %r67, %r92, %p9;
+       mul.lo.s32      %r69, %r68, -921707870;
+       mov.u32         %r70, -921707870;
+       mul.hi.u32      %r71, %r68, %r70;
+       setp.gt.s32     %p10, %r71, 0;
+       shl.b32         %r72, %r71, 1;
+       shr.u32         %r73, %r69, 31;
+       add.s32         %r74, %r73, %r72;
+       selp.b32        %r75, %r74, %r71, %p10;
+       selp.b32        %r76, -1, 0, %p10;
+       mov.u32         %r77, 126;
+       sub.s32         %r78, %r77, %r62;
+       add.s32         %r79, %r78, %r76;
        shl.b32         %r80, %r79, 23;
-       add.s32         %r81, %r95, 1;
+       add.s32         %r81, %r75, 1;
        shr.u32         %r82, %r81, 7;
        add.s32         %r83, %r82, 1;
        shr.u32         %r84, %r83, 1;
@@ -9554,61 +9502,61 @@ BB75_12:
        or.b32          %r86, %r85, %r25;
        mov.b32          %f44, %r86;
 
-BB75_13:
+BB75_11:
        mul.rn.f32      %f7, %f44, %f44;
-       add.s32         %r36, %r97, 1;
-       and.b32         %r37, %r36, 1;
-       setp.eq.s32     %p11, %r37, 0;
-       @%p11 bra       BB75_15;
+       add.s32         %r29, %r95, 1;
+       and.b32         %r30, %r29, 1;
+       setp.eq.s32     %p11, %r30, 0;
+       @%p11 bra       BB75_13;
 
        mov.f32         %f30, 0fBAB6061A;
        mov.f32         %f31, 0f37CCF5CE;
        fma.rn.f32      %f45, %f31, %f7, %f30;
-       bra.uni         BB75_16;
+       bra.uni         BB75_14;
 
-BB75_15:
+BB75_13:
        mov.f32         %f32, 0f3C08839E;
        mov.f32         %f33, 0fB94CA1F9;
        fma.rn.f32      %f45, %f33, %f7, %f32;
 
-BB75_16:
-       @%p11 bra       BB75_18;
+BB75_14:
+       @%p11 bra       BB75_16;
 
        mov.f32         %f34, 0f3D2AAAA5;
        fma.rn.f32      %f35, %f45, %f7, %f34;
        mov.f32         %f36, 0fBF000000;
        fma.rn.f32      %f46, %f35, %f7, %f36;
-       bra.uni         BB75_19;
+       bra.uni         BB75_17;
 
-BB75_18:
+BB75_16:
        mov.f32         %f37, 0fBE2AAAA3;
        fma.rn.f32      %f38, %f45, %f7, %f37;
        mov.f32         %f39, 0f00000000;
        fma.rn.f32      %f46, %f38, %f7, %f39;
 
-BB75_19:
+BB75_17:
        fma.rn.f32      %f47, %f46, %f44, %f44;
-       @%p11 bra       BB75_21;
+       @%p11 bra       BB75_19;
 
        mov.f32         %f40, 0f3F800000;
        fma.rn.f32      %f47, %f46, %f7, %f40;
 
-BB75_21:
-       and.b32         %r87, %r36, 2;
+BB75_19:
+       and.b32         %r87, %r29, 2;
        setp.eq.s32     %p14, %r87, 0;
-       @%p14 bra       BB75_23;
+       @%p14 bra       BB75_21;
 
        mov.f32         %f41, 0f00000000;
        mov.f32         %f42, 0fBF800000;
        fma.rn.f32      %f47, %f47, %f42, %f41;
 
-BB75_23:
+BB75_21:
        cvta.to.global.u64      %rd16, %rd9;
        shl.b64         %rd17, %rd1, 2;
        add.s64         %rd18, %rd16, %rd17;
        st.global.f32   [%rd18], %f47;
 
-BB75_24:
+BB75_22:
        ret;
 }
 
@@ -9718,8 +9666,8 @@ BB76_3:
        bra.uni         BB76_4;
 
 BB76_2:
-       setp.gtu.f64    %p3, %fd1, 0d7FF0000000000000;
-       selp.f64        %fd45, %fd1, 0d7FF0000000000000, %p3;
+       setp.le.f64     %p3, %fd1, 0d7FF0000000000000;
+       selp.f64        %fd45, 0d7FF0000000000000, %fd1, %p3;
 
 BB76_4:
        cvta.to.global.u64      %rd7, %rd3;
@@ -9777,8 +9725,8 @@ BB76_5:
        div.approx.f32  %f15, %f14, %f13;
        mov.f32         %f16, 0f40000000;
        fma.rn.f32      %f17, %f16, %f13, %f15;
-       setp.ltu.f32    %p2, %f4, 0f42B40000;
-       selp.f32        %f18, %f17, 0f7F800000, %p2;
+       setp.ge.f32     %p2, %f4, 0f42B40000;
+       selp.f32        %f18, 0f7F800000, %f17, %p2;
        cvta.to.global.u64      %rd6, %rd2;
        add.s64         %rd7, %rd6, %rd4;
        st.global.f32   [%rd7], %f18;
@@ -9797,7 +9745,7 @@ BB77_2:
        .local .align 4 .b8     __local_depot78[4];
        .reg .b64       %SP;
        .reg .b64       %SPL;
-       .reg .pred      %p<6>;
+       .reg .pred      %p<7>;
        .reg .b32       %r<16>;
        .reg .f64       %fd<66>;
        .reg .b64       %rd<14>;
@@ -9815,7 +9763,7 @@ BB77_2:
        mov.u32         %r8, %tid.x;
        mad.lo.s32      %r1, %r6, %r7, %r8;
        setp.ge.u32     %p1, %r1, %r5;
-       @%p1 bra        BB78_9;
+       @%p1 bra        BB78_8;
 
        cvta.to.global.u64      %rd6, %rd3;
        cvt.s64.s32     %rd2, %r1;
@@ -9824,23 +9772,24 @@ BB77_2:
        ld.global.f64   %fd63, [%rd8];
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r9}, %fd63;
+       mov.b64         {%r9, %temp}, %fd63;
        }
-       and.b32         %r10, %r9, 2147483647;
-       setp.ne.s32     %p2, %r10, 2146435072;
-       @%p2 bra        BB78_4;
-
        {
        .reg .b32 %temp; 
-       mov.b64         {%r11, %temp}, %fd63;
+       mov.b64         {%temp, %r10}, %fd63;
        }
-       setp.ne.s32     %p3, %r11, 0;
-       @%p3 bra        BB78_4;
+       and.b32         %r11, %r10, 2147483647;
+       setp.eq.s32     %p2, %r11, 2146435072;
+       setp.eq.s32     %p3, %r9, 0;
+       and.pred        %p4, %p2, %p3;
+       @!%p4 bra       BB78_3;
+       bra.uni         BB78_2;
 
+BB78_2:
        mov.f64         %fd11, 0d0000000000000000;
        mul.rn.f64      %fd63, %fd63, %fd11;
 
-BB78_4:
+BB78_3:
        mul.f64         %fd12, %fd63, 0d3FE45F306DC9C883;
        cvt.rni.s32.f64 %r15, %fd12;
        st.local.u32    [%rd1], %r15;
@@ -9857,8 +9806,8 @@ BB78_4:
        mov.b64         {%temp, %r12}, %fd63;
        }
        and.b32         %r13, %r12, 2145386496;
-       setp.lt.u32     %p4, %r13, 1105199104;
-       @%p4 bra        BB78_6;
+       setp.lt.u32     %p5, %r13, 1105199104;
+       @%p5 bra        BB78_5;
 
        // Callseq Start 5
        {
@@ -9881,7 +9830,7 @@ BB78_4:
        }// Callseq End 5
        ld.local.u32    %r15, [%rd1];
 
-BB78_6:
+BB78_5:
        mul.f64         %fd20, %fd64, %fd64;
        mov.f64         %fd21, 0dBEF9757C5B27EBB1;
        mov.f64         %fd22, 0d3EE48DAC2799BCB9;
@@ -9915,11 +9864,11 @@ BB78_6:
        mul.f64         %fd7, %fd20, %fd49;
        fma.rn.f64      %fd65, %fd7, %fd64, %fd64;
        and.b32         %r14, %r15, 1;
-       setp.eq.b32     %p5, %r14, 1;
-       @!%p5 bra       BB78_8;
-       bra.uni         BB78_7;
+       setp.eq.b32     %p6, %r14, 1;
+       @!%p6 bra       BB78_7;
+       bra.uni         BB78_6;
 
-BB78_7:
+BB78_6:
        sub.f64         %fd52, %fd65, %fd64;
        neg.f64         %fd53, %fd52;
        fma.rn.f64      %fd54, %fd7, %fd64, %fd53;
@@ -9936,13 +9885,13 @@ BB78_7:
        fma.rn.f64      %fd62, %fd60, %fd54, %fd61;
        fma.rn.f64      %fd65, %fd62, %fd60, %fd60;
 
-BB78_8:
+BB78_7:
        cvta.to.global.u64      %rd10, %rd4;
        shl.b64         %rd11, %rd2, 3;
        add.s64         %rd12, %rd10, %rd11;
        st.global.f64   [%rd12], %fd65;
 
-BB78_9:
+BB78_8:
        ret;
 }
 
@@ -9958,7 +9907,7 @@ BB78_9:
        .reg .b64       %SPL;
        .reg .pred      %p<12>;
        .reg .f32       %f<33>;
-       .reg .b32       %r<96>;
+       .reg .b32       %r<94>;
        .reg .b64       %rd<22>;
 
 
@@ -9966,13 +9915,13 @@ BB78_9:
        cvta.local.u64  %SP, %rd21;
        ld.param.u64    %rd8, [matrix_tan_f_param_0];
        ld.param.u64    %rd9, [matrix_tan_f_param_1];
-       ld.param.u32    %r36, [matrix_tan_f_param_2];
-       mov.u32         %r37, %ntid.x;
-       mov.u32         %r38, %ctaid.x;
-       mov.u32         %r39, %tid.x;
-       mad.lo.s32      %r1, %r37, %r38, %r39;
-       setp.ge.u32     %p1, %r1, %r36;
-       @%p1 bra        BB79_16;
+       ld.param.u32    %r29, [matrix_tan_f_param_2];
+       mov.u32         %r30, %ntid.x;
+       mov.u32         %r31, %ctaid.x;
+       mov.u32         %r32, %tid.x;
+       mad.lo.s32      %r1, %r30, %r31, %r32;
+       setp.ge.u32     %p1, %r1, %r29;
+       @%p1 bra        BB79_14;
 
        cvta.to.global.u64      %rd10, %rd8;
        cvt.s64.s32     %rd1, %r1;
@@ -9990,8 +9939,8 @@ BB78_9:
 
 BB79_3:
        mul.f32         %f12, %f30, 0f3F22F983;
-       cvt.rni.s32.f32 %r95, %f12;
-       cvt.rn.f32.s32  %f13, %r95;
+       cvt.rni.s32.f32 %r93, %f12;
+       cvt.rn.f32.s32  %f13, %r93;
        neg.f32         %f14, %f13;
        mov.f32         %f15, 0f3FC90FDA;
        fma.rn.f32      %f16, %f14, %f15, %f30;
@@ -10001,12 +9950,12 @@ BB79_3:
        fma.rn.f32      %f31, %f14, %f19, %f18;
        abs.f32         %f20, %f30;
        setp.leu.f32    %p3, %f20, 0f47CE4780;
-       @%p3 bra        BB79_13;
+       @%p3 bra        BB79_11;
 
        mov.b32          %r3, %f30;
        shr.u32         %r4, %r3, 23;
-       shl.b32         %r42, %r3, 8;
-       or.b32          %r5, %r42, -2147483648;
+       shl.b32         %r35, %r3, 8;
+       or.b32          %r5, %r35, -2147483648;
        mov.u32         %r87, 0;
        mov.u64         %rd19, __cudart_i2opi_f;
        mov.u32         %r86, -6;
@@ -10015,14 +9964,14 @@ BB79_3:
 BB79_5:
        .pragma "nounroll";
        mov.u64         %rd4, %rd20;
-       ld.const.u32    %r45, [%rd19];
+       ld.const.u32    %r38, [%rd19];
        // inline asm
        {
-       mad.lo.cc.u32   %r43, %r45, %r5, %r87;
-       madc.hi.u32     %r87, %r45, %r5,  0;
+       mad.lo.cc.u32   %r36, %r38, %r5, %r87;
+       madc.hi.u32     %r87, %r38, %r5,  0;
        }
        // inline asm
-       st.local.u32    [%rd4], %r43;
+       st.local.u32    [%rd4], %r36;
        add.s64         %rd5, %rd4, 4;
        add.s64         %rd19, %rd19, 4;
        add.s32         %r86, %r86, 1;
@@ -10030,14 +9979,14 @@ BB79_5:
        mov.u64         %rd20, %rd5;
        @%p4 bra        BB79_5;
 
-       and.b32         %r48, %r4, 255;
-       add.s32         %r49, %r48, -128;
-       shr.u32         %r50, %r49, 5;
+       and.b32         %r41, %r4, 255;
+       add.s32         %r42, %r41, -128;
+       shr.u32         %r43, %r42, 5;
        and.b32         %r10, %r3, -2147483648;
        st.local.u32    [%rd2+24], %r87;
-       mov.u32         %r51, 6;
-       sub.s32         %r52, %r51, %r50;
-       mul.wide.s32    %rd15, %r52, 4;
+       mov.u32         %r44, 6;
+       sub.s32         %r45, %r44, %r43;
+       mul.wide.s32    %rd15, %r45, 4;
        add.s64         %rd7, %rd2, %rd15;
        ld.local.u32    %r88, [%rd7];
        ld.local.u32    %r89, [%rd7+-4];
@@ -10045,67 +9994,65 @@ BB79_5:
        setp.eq.s32     %p5, %r13, 0;
        @%p5 bra        BB79_8;
 
-       mov.u32         %r53, 32;
-       sub.s32         %r54, %r53, %r13;
-       shr.u32         %r55, %r89, %r54;
-       shl.b32         %r56, %r88, %r13;
-       add.s32         %r88, %r55, %r56;
-       ld.local.u32    %r57, [%rd7+-8];
-       shr.u32         %r58, %r57, %r54;
-       shl.b32         %r59, %r89, %r13;
-       add.s32         %r89, %r58, %r59;
+       mov.u32         %r46, 32;
+       sub.s32         %r47, %r46, %r13;
+       shr.u32         %r48, %r89, %r47;
+       shl.b32 

<TRUNCATED>