http://git-wip-us.apache.org/repos/asf/systemml/blob/34bb3ca8/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 54b53b9..b4a6559 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-       // .globl       slice_sparse_dense
+       // .globl       slice_sparse_dense_row
 .func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
 (
        .param .b64 __internal_trig_reduction_slowpathd_param_0,
@@ -27,16 +27,16 @@
 .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 
107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 
45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 
10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 
139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 
112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 
29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 
97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 
221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 
219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 
199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 
85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 
217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 
93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 
0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
-.visible .entry slice_sparse_dense(
-       .param .u64 slice_sparse_dense_param_0,
-       .param .u64 slice_sparse_dense_param_1,
-       .param .u64 slice_sparse_dense_param_2,
-       .param .u64 slice_sparse_dense_param_3,
-       .param .u32 slice_sparse_dense_param_4,
-       .param .u32 slice_sparse_dense_param_5,
-       .param .u32 slice_sparse_dense_param_6,
-       .param .u32 slice_sparse_dense_param_7,
-       .param .u32 slice_sparse_dense_param_8
+.visible .entry slice_sparse_dense_row(
+       .param .u64 slice_sparse_dense_row_param_0,
+       .param .u64 slice_sparse_dense_row_param_1,
+       .param .u64 slice_sparse_dense_row_param_2,
+       .param .u64 slice_sparse_dense_row_param_3,
+       .param .u32 slice_sparse_dense_row_param_4,
+       .param .u32 slice_sparse_dense_row_param_5,
+       .param .u32 slice_sparse_dense_row_param_6,
+       .param .u32 slice_sparse_dense_row_param_7,
+       .param .u32 slice_sparse_dense_row_param_8
 )
 {
        .reg .pred      %p<7>;
@@ -45,15 +45,15 @@
        .reg .b64       %rd<23>;
 
 
-       ld.param.u64    %rd9, [slice_sparse_dense_param_0];
-       ld.param.u64    %rd10, [slice_sparse_dense_param_1];
-       ld.param.u64    %rd11, [slice_sparse_dense_param_2];
-       ld.param.u64    %rd12, [slice_sparse_dense_param_3];
-       ld.param.u32    %r15, [slice_sparse_dense_param_4];
-       ld.param.u32    %r16, [slice_sparse_dense_param_5];
-       ld.param.u32    %r12, [slice_sparse_dense_param_6];
-       ld.param.u32    %r13, [slice_sparse_dense_param_7];
-       ld.param.u32    %r14, [slice_sparse_dense_param_8];
+       ld.param.u64    %rd9, [slice_sparse_dense_row_param_0];
+       ld.param.u64    %rd10, [slice_sparse_dense_row_param_1];
+       ld.param.u64    %rd11, [slice_sparse_dense_row_param_2];
+       ld.param.u64    %rd12, [slice_sparse_dense_row_param_3];
+       ld.param.u32    %r15, [slice_sparse_dense_row_param_4];
+       ld.param.u32    %r16, [slice_sparse_dense_row_param_5];
+       ld.param.u32    %r12, [slice_sparse_dense_row_param_6];
+       ld.param.u32    %r13, [slice_sparse_dense_row_param_7];
+       ld.param.u32    %r14, [slice_sparse_dense_row_param_8];
        mov.u32         %r17, %ntid.x;
        mov.u32         %r18, %ctaid.x;
        mov.u32         %r19, %tid.x;
@@ -105,6 +105,88 @@ BB0_6:
        ret;
 }
 
+       // .globl       slice_sparse_dense_nnz
+.visible .entry slice_sparse_dense_nnz(        // kernel: each thread copies at most one f64 from param_0[] into param_3[]
+       .param .u64 slice_sparse_dense_nnz_param_0,     // address of the f64 input values, read at index i
+       .param .u64 slice_sparse_dense_nnz_param_1,     // address of a 32-bit offsets array -- presumably CSR row pointers, TODO confirm
+       .param .u64 slice_sparse_dense_nnz_param_2,     // address of a 32-bit array read at index i -- presumably column indices, TODO confirm
+       .param .u64 slice_sparse_dense_nnz_param_3,     // address of the f64 output
+       .param .u32 slice_sparse_dense_nnz_param_4,     // first index into param_1[] (presumably the lower row bound)
+       .param .u32 slice_sparse_dense_nnz_param_5,     // param_1[param_5 + 1] is the exclusive upper bound for i
+       .param .u32 slice_sparse_dense_nnz_param_6,     // inclusive lower bound for param_2[i]
+       .param .u32 slice_sparse_dense_nnz_param_7,     // inclusive upper bound for param_2[i]
+       .param .u32 slice_sparse_dense_nnz_param_8      // multiplier applied to the row offset in the output index
+)
+{
+       .reg .pred      %p<6>;
+       .reg .b32       %r<22>;
+       .reg .f64       %fd<2>;
+       .reg .b64       %rd<22>;
+
+
+       ld.param.u64    %rd5, [slice_sparse_dense_nnz_param_0];
+       ld.param.u64    %rd8, [slice_sparse_dense_nnz_param_1];
+       ld.param.u64    %rd6, [slice_sparse_dense_nnz_param_2];
+       ld.param.u64    %rd7, [slice_sparse_dense_nnz_param_3];
+       ld.param.u32    %r5, [slice_sparse_dense_nnz_param_4];
+       ld.param.u32    %r9, [slice_sparse_dense_nnz_param_5];
+       ld.param.u32    %r6, [slice_sparse_dense_nnz_param_6];
+       ld.param.u32    %r7, [slice_sparse_dense_nnz_param_7];
+       ld.param.u32    %r8, [slice_sparse_dense_nnz_param_8];
+       mov.u32         %r10, %ntid.x;
+       mov.u32         %r11, %ctaid.x;
+       mov.u32         %r12, %tid.x;
+       mad.lo.s32      %r13, %r10, %r11, %r12;    // %r13 = ntid.x * ctaid.x + tid.x
+       cvta.to.global.u64      %rd1, %rd8;    // %rd1 = param_1 as a global-space address
+       mul.wide.s32    %rd9, %r5, 4;    // byte offset of element param_4 (4-byte elements)
+       add.s64         %rd10, %rd1, %rd9;
+       ld.global.u32   %r14, [%rd10];    // %r14 = param_1[param_4]
+       add.s32         %r1, %r13, %r14;    // %r1 = i = thread index + param_1[param_4]
+       mul.wide.s32    %rd11, %r9, 4;
+       add.s64         %rd12, %rd1, %rd11;
+       ld.global.u32   %r15, [%rd12+4];    // %r15 = param_1[param_5 + 1]; the +4 bytes is the "+ 1"
+       setp.ge.s32     %p1, %r1, %r15;    // i at or past the upper bound?
+       @%p1 bra        BB1_5;    // yes: this thread has nothing to copy
+
+       cvta.to.global.u64      %rd2, %rd7;    // %rd2 = output base (param_3)
+       cvta.to.global.u64      %rd3, %rd5;    // %rd3 = input values base (param_0)
+       cvta.to.global.u64      %rd13, %rd6;    // %rd13 = param_2 base
+       cvt.s64.s32     %rd4, %r1;    // %rd4 = i sign-extended to 64 bits
+       mul.wide.s32    %rd14, %r1, 4;
+       add.s64         %rd15, %rd13, %rd14;
+       ld.global.u32   %r2, [%rd15];    // %r2 = param_2[i]
+       setp.lt.s32     %p2, %r2, %r6;    // below param_6?
+       setp.gt.s32     %p3, %r2, %r7;    // above param_7?
+       or.pred         %p4, %p2, %p3;
+       @%p4 bra        BB1_5;    // param_2[i] outside [param_6, param_7]: skip
+
+       mov.u32         %r21, %r5;    // scan starts at row = param_4
+
+BB1_3:    // loop: advance row while param_1[row + 1] <= i. NOTE(review): no explicit bound; presumably relies on param_1 being non-decreasing -- confirm
+       mov.u32         %r3, %r21;    // %r3 = current candidate row
+       add.s32         %r4, %r3, 1;
+       mul.wide.s32    %rd16, %r4, 4;
+       add.s64         %rd17, %rd1, %rd16;
+       ld.global.u32   %r16, [%rd17];    // %r16 = param_1[row + 1]
+       setp.le.s32     %p5, %r16, %r1;    // i not yet inside this row's range?
+       mov.u32         %r21, %r4;    // next candidate = row + 1
+       @%p5 bra        BB1_3;
+
+       shl.b64         %rd18, %rd4, 3;    // byte offset i * 8 (f64 elements)
+       add.s64         %rd19, %rd3, %rd18;
+       ld.global.f64   %fd1, [%rd19];    // %fd1 = param_0[i]
+       sub.s32         %r17, %r3, %r5;    // row - param_4
+       mul.lo.s32      %r18, %r17, %r8;    // (row - param_4) * param_8
+       sub.s32         %r19, %r18, %r6;
+       add.s32         %r20, %r19, %r2;    // output index = (row - param_4) * param_8 + (param_2[i] - param_6)
+       mul.wide.s32    %rd20, %r20, 8;
+       add.s64         %rd21, %rd2, %rd20;
+       st.global.f64   [%rd21], %fd1;    // param_3[output index] = param_0[i]
+
+BB1_5:    // common exit
+       ret;
+}
+
        // .globl       slice_dense_dense
 .visible .entry slice_dense_dense(
        .param .u64 slice_dense_dense_param_0,
@@ -114,58 +196,49 @@ BB0_6:
        .param .u32 slice_dense_dense_param_4,
        .param .u32 slice_dense_dense_param_5,
        .param .u32 slice_dense_dense_param_6,
-       .param .u32 slice_dense_dense_param_7
+       .param .u32 slice_dense_dense_param_7,    // exclusive upper bound on the output row index
+       .param .u32 slice_dense_dense_param_8    // divisor that splits the thread index into (row, column)
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<22>;
+       .reg .b32       %r<15>;
        .reg .f64       %fd<2>;
-       .reg .b64       %rd<15>;
+       .reg .b64       %rd<9>;
 
 
-       ld.param.u64    %rd7, [slice_dense_dense_param_0];
-       ld.param.u64    %rd8, [slice_dense_dense_param_1];
-       ld.param.u32    %r9, [slice_dense_dense_param_2];
-       ld.param.u32    %r13, [slice_dense_dense_param_3];
-       ld.param.u32    %r10, [slice_dense_dense_param_4];
-       ld.param.u32    %r11, [slice_dense_dense_param_6];
-       ld.param.u32    %r12, [slice_dense_dense_param_7];
-       mov.u32         %r1, %ntid.x;
-       mov.u32         %r2, %ctaid.x;
-       mov.u32         %r3, %tid.x;
-       mad.lo.s32      %r4, %r1, %r2, %r3;
-       add.s32         %r14, %r4, %r9;
-       setp.gt.s32     %p1, %r14, %r13;
-       @%p1 bra        BB1_4;
-
-       mul.lo.s32      %r21, %r4, %r12;
-       setp.lt.s32     %p2, %r12, 1;
-       @%p2 bra        BB1_4;
-
-       cvta.to.global.u64      %rd9, %rd8;
-       cvta.to.global.u64      %rd10, %rd7;
-       add.s32         %r6, %r21, %r12;
-       mul.lo.s32      %r15, %r1, %r2;
-       add.s32         %r16, %r3, %r15;
-       mul.lo.s32      %r17, %r12, %r16;
-       mul.wide.s32    %rd11, %r17, 8;
-       add.s64         %rd14, %rd9, %rd11;
-       add.s32         %r18, %r3, %r9;
-       add.s32         %r19, %r18, %r15;
-       mad.lo.s32      %r20, %r11, %r19, %r10;
-       mul.wide.s32    %rd12, %r20, 8;
-       add.s64         %rd13, %rd10, %rd12;
+       ld.param.u64    %rd1, [slice_dense_dense_param_0];    // f64 source base address
+       ld.param.u64    %rd2, [slice_dense_dense_param_1];    // f64 destination base address
+       ld.param.u32    %r3, [slice_dense_dense_param_2];    // offset added to the row index when reading the source
+       ld.param.u32    %r4, [slice_dense_dense_param_4];    // offset added to the column index when reading the source
+       ld.param.u32    %r5, [slice_dense_dense_param_6];    // row multiplier in the source index (presumably source column count)
+       ld.param.u32    %r7, [slice_dense_dense_param_7];
+       ld.param.u32    %r6, [slice_dense_dense_param_8];
+       mov.u32         %r8, %ctaid.x;
+       mov.u32         %r9, %ntid.x;
+       mov.u32         %r10, %tid.x;
+       mad.lo.s32      %r1, %r9, %r8, %r10;    // %r1 = ntid.x * ctaid.x + tid.x (also the destination index)
+       div.s32         %r2, %r1, %r6;    // %r2 = row = %r1 / param_8. NOTE(review): runs before the guard below, which admits param_8 == 0 -- confirm callers never pass 0
+       setp.lt.s32     %p1, %r2, %r7;    // row < param_7
+       setp.gt.s32     %p2, %r6, -1;    // param_8 > -1
+       and.pred        %p3, %p1, %p2;
+       @!%p3 bra       BB2_2;    // exit unless both conditions hold
+       bra.uni         BB2_1;
 
-BB1_3:
-       ld.global.f64   %fd1, [%rd13];
-       st.global.f64   [%rd14], %fd1;
-       add.s64         %rd14, %rd14, 8;
-       add.s64         %rd13, %rd13, 8;
-       add.s32         %r21, %r21, 1;
-       setp.lt.s32     %p3, %r21, %r6;
-       @%p3 bra        BB1_3;
-
-BB1_4:
+BB2_1:
+       rem.s32         %r11, %r1, %r6;    // %r11 = column = %r1 % param_8
+       cvta.to.global.u64      %rd3, %rd1;
+       add.s32         %r12, %r2, %r3;    // row + param_2
+       add.s32         %r13, %r11, %r4;    // column + param_4
+       mad.lo.s32      %r14, %r12, %r5, %r13;    // source index = (row + param_2) * param_6 + (column + param_4)
+       mul.wide.s32    %rd4, %r14, 8;    // byte offset (8-byte f64 elements)
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];    // %fd1 = source[source index]
+       cvta.to.global.u64      %rd6, %rd2;
+       mul.wide.s32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f64   [%rd8], %fd1;    // destination[%r1] = %fd1
+
+BB2_2:    // common exit
        ret;
 }
 
@@ -195,10 +268,10 @@ BB1_4:
        setp.gt.s32     %p1, %r9, %r8;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB2_2;
-       bra.uni         BB2_1;
+       @!%p3 bra       BB3_2;
+       bra.uni         BB3_1;
 
-BB2_1:
+BB3_1:
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
@@ -207,7 +280,7 @@ BB2_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB2_2:
+BB3_2:
        ret;
 }
 
@@ -220,31 +293,29 @@ BB2_2:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<10>;
+       .reg .b32       %r<8>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<8>;
 
 
        ld.param.u64    %rd1, [relu_param_0];
        ld.param.u64    %rd2, [relu_param_1];
-       ld.param.u32    %r4, [relu_param_2];
+       ld.param.u32    %r2, [relu_param_2];
        ld.param.u32    %r3, [relu_param_3];
-       mov.u32         %r5, %ctaid.x;
-       mov.u32         %r6, %ntid.x;
-       mov.u32         %r7, %tid.x;
-       mad.lo.s32      %r1, %r6, %r5, %r7;
-       div.s32         %r2, %r1, %r3;
-       setp.lt.s32     %p1, %r2, %r4;
+       mov.u32         %r4, %ctaid.x;
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %tid.x;
+       mad.lo.s32      %r1, %r5, %r4, %r6;
+       div.s32         %r7, %r1, %r3;
+       setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB3_2;
-       bra.uni         BB3_1;
+       @!%p3 bra       BB4_2;
+       bra.uni         BB4_1;
 
-BB3_1:
-       rem.s32         %r8, %r1, %r3;
+BB4_1:
        cvta.to.global.u64      %rd3, %rd1;
-       mad.lo.s32      %r9, %r2, %r3, %r8;
-       mul.wide.s32    %rd4, %r9, 8;
+       mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
        ld.global.f64   %fd1, [%rd5];
        mov.f64         %fd2, 0d0000000000000000;
@@ -253,7 +324,7 @@ BB3_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB3_2:
+BB4_2:
        ret;
 }
 
@@ -267,7 +338,7 @@ BB3_2:
 )
 {
        .reg .pred      %p<5>;
-       .reg .b32       %r<10>;
+       .reg .b32       %r<8>;
        .reg .f64       %fd<6>;
        .reg .b64       %rd<14>;
 
@@ -275,43 +346,41 @@ BB3_2:
        ld.param.u64    %rd2, [relu_backward_param_0];
        ld.param.u64    %rd3, [relu_backward_param_1];
        ld.param.u64    %rd4, [relu_backward_param_2];
-       ld.param.u32    %r4, [relu_backward_param_3];
+       ld.param.u32    %r2, [relu_backward_param_3];
        ld.param.u32    %r3, [relu_backward_param_4];
-       mov.u32         %r5, %ntid.x;
-       mov.u32         %r6, %ctaid.x;
-       mov.u32         %r7, %tid.x;
-       mad.lo.s32      %r1, %r5, %r6, %r7;
-       div.s32         %r2, %r1, %r3;
-       setp.lt.s32     %p1, %r2, %r4;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %ctaid.x;
+       mov.u32         %r6, %tid.x;
+       mad.lo.s32      %r1, %r4, %r5, %r6;
+       div.s32         %r7, %r1, %r3;
+       setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB4_4;
-       bra.uni         BB4_1;
+       @!%p3 bra       BB5_4;
+       bra.uni         BB5_1;
 
-BB4_1:
-       rem.s32         %r8, %r1, %r3;
+BB5_1:
        cvta.to.global.u64      %rd5, %rd2;
-       mad.lo.s32      %r9, %r2, %r3, %r8;
-       cvt.s64.s32     %rd1, %r9;
-       mul.wide.s32    %rd6, %r9, 8;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd6, %r1, 8;
        add.s64         %rd7, %rd5, %rd6;
        ld.global.f64   %fd4, [%rd7];
        mov.f64         %fd5, 0d0000000000000000;
        setp.leu.f64    %p4, %fd4, 0d0000000000000000;
-       @%p4 bra        BB4_3;
+       @%p4 bra        BB5_3;
 
        cvta.to.global.u64      %rd8, %rd3;
        shl.b64         %rd9, %rd1, 3;
        add.s64         %rd10, %rd8, %rd9;
        ld.global.f64   %fd5, [%rd10];
 
-BB4_3:
+BB5_3:
        cvta.to.global.u64      %rd11, %rd4;
        shl.b64         %rd12, %rd1, 3;
        add.s64         %rd13, %rd11, %rd12;
        st.global.f64   [%rd13], %fd5;
 
-BB4_4:
+BB5_4:
        ret;
 }
 
@@ -324,31 +393,29 @@ BB4_4:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<10>;
+       .reg .b32       %r<8>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<8>;
 
 
        ld.param.u64    %rd1, [inplace_add_param_0];
        ld.param.u64    %rd2, [inplace_add_param_1];
-       ld.param.u32    %r4, [inplace_add_param_2];
+       ld.param.u32    %r2, [inplace_add_param_2];
        ld.param.u32    %r3, [inplace_add_param_3];
-       mov.u32         %r5, %ctaid.x;
-       mov.u32         %r6, %ntid.x;
-       mov.u32         %r7, %tid.x;
-       mad.lo.s32      %r1, %r6, %r5, %r7;
-       div.s32         %r2, %r1, %r3;
-       setp.lt.s32     %p1, %r2, %r4;
+       mov.u32         %r4, %ctaid.x;
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %tid.x;
+       mad.lo.s32      %r1, %r5, %r4, %r6;
+       div.s32         %r7, %r1, %r3;
+       setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB5_2;
-       bra.uni         BB5_1;
+       @!%p3 bra       BB6_2;
+       bra.uni         BB6_1;
 
-BB5_1:
-       rem.s32         %r8, %r1, %r3;
+BB6_1:
        cvta.to.global.u64      %rd3, %rd1;
-       mad.lo.s32      %r9, %r2, %r3, %r8;
-       mul.wide.s32    %rd4, %r9, 8;
+       mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
        cvta.to.global.u64      %rd6, %rd2;
        add.s64         %rd7, %rd6, %rd4;
@@ -357,7 +424,7 @@ BB5_1:
        add.f64         %fd3, %fd2, %fd1;
        st.global.f64   [%rd7], %fd3;
 
-BB5_2:
+BB6_2:
        ret;
 }
 
@@ -372,7 +439,7 @@ BB5_2:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<12>;
+       .reg .b32       %r<11>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<12>;
 
@@ -380,29 +447,28 @@ BB5_2:
        ld.param.u64    %rd1, [bias_add_param_0];
        ld.param.u64    %rd2, [bias_add_param_1];
        ld.param.u64    %rd3, [bias_add_param_2];
-       ld.param.u32    %r5, [bias_add_param_3];
-       ld.param.u32    %r3, [bias_add_param_4];
-       ld.param.u32    %r4, [bias_add_param_5];
-       mov.u32         %r6, %ctaid.x;
-       mov.u32         %r7, %ntid.x;
-       mov.u32         %r8, %tid.x;
-       mad.lo.s32      %r1, %r7, %r6, %r8;
-       div.s32         %r2, %r1, %r3;
-       setp.lt.s32     %p1, %r2, %r5;
-       setp.gt.s32     %p2, %r3, -1;
+       ld.param.u32    %r4, [bias_add_param_3];
+       ld.param.u32    %r2, [bias_add_param_4];
+       ld.param.u32    %r3, [bias_add_param_5];
+       mov.u32         %r5, %ctaid.x;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r6, %r5, %r7;
+       div.s32         %r8, %r1, %r2;
+       setp.lt.s32     %p1, %r8, %r4;
+       setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB6_2;
-       bra.uni         BB6_1;
+       @!%p3 bra       BB7_2;
+       bra.uni         BB7_1;
 
-BB6_1:
-       rem.s32         %r9, %r1, %r3;
+BB7_1:
+       rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r10, %r2, %r3, %r9;
-       mul.wide.s32    %rd5, %r10, 8;
+       mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
-       div.s32         %r11, %r9, %r4;
+       div.s32         %r10, %r9, %r3;
        cvta.to.global.u64      %rd7, %rd2;
-       mul.wide.s32    %rd8, %r11, 8;
+       mul.wide.s32    %rd8, %r10, 8;
        add.s64         %rd9, %rd7, %rd8;
        ld.global.f64   %fd1, [%rd9];
        ld.global.f64   %fd2, [%rd6];
@@ -411,7 +477,7 @@ BB6_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB6_2:
+BB7_2:
        ret;
 }
 
@@ -450,10 +516,10 @@ BB6_2:
        setp.lt.s32     %p1, %r1, %r5;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB7_4;
-       bra.uni         BB7_1;
+       @!%p3 bra       BB8_4;
+       bra.uni         BB8_1;
 
-BB7_1:
+BB8_1:
        cvta.to.global.u64      %rd6, %rd4;
        mad.lo.s32      %r10, %r1, %r3, %r2;
        cvta.to.global.u64      %rd7, %rd3;
@@ -462,25 +528,25 @@ BB7_1:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd2, %rd6, %rd8;
        setp.eq.s32     %p4, %r4, 1;
-       @%p4 bra        BB7_3;
-       bra.uni         BB7_2;
+       @%p4 bra        BB8_3;
+       bra.uni         BB8_2;
 
-BB7_3:
+BB8_3:
        mul.wide.s32    %rd12, %r2, 8;
        add.s64         %rd13, %rd1, %rd12;
        ld.global.f64   %fd5, [%rd13];
        fma.rn.f64      %fd6, %fd5, %fd2, %fd1;
        st.global.f64   [%rd2], %fd6;
-       bra.uni         BB7_4;
+       bra.uni         BB8_4;
 
-BB7_2:
+BB8_2:
        mul.wide.s32    %rd10, %r1, 8;
        add.s64         %rd11, %rd1, %rd10;
        ld.global.f64   %fd3, [%rd11];
        fma.rn.f64      %fd4, %fd3, %fd2, %fd1;
        st.global.f64   [%rd2], %fd4;
 
-BB7_4:
+BB8_4:
        ret;
 }
 
@@ -495,7 +561,7 @@ BB7_4:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<12>;
+       .reg .b32       %r<11>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<12>;
 
@@ -503,29 +569,28 @@ BB7_4:
        ld.param.u64    %rd1, [bias_multiply_param_0];
        ld.param.u64    %rd2, [bias_multiply_param_1];
        ld.param.u64    %rd3, [bias_multiply_param_2];
-       ld.param.u32    %r5, [bias_multiply_param_3];
-       ld.param.u32    %r3, [bias_multiply_param_4];
-       ld.param.u32    %r4, [bias_multiply_param_5];
-       mov.u32         %r6, %ctaid.x;
-       mov.u32         %r7, %ntid.x;
-       mov.u32         %r8, %tid.x;
-       mad.lo.s32      %r1, %r7, %r6, %r8;
-       div.s32         %r2, %r1, %r3;
-       setp.lt.s32     %p1, %r2, %r5;
-       setp.gt.s32     %p2, %r3, -1;
+       ld.param.u32    %r4, [bias_multiply_param_3];
+       ld.param.u32    %r2, [bias_multiply_param_4];
+       ld.param.u32    %r3, [bias_multiply_param_5];
+       mov.u32         %r5, %ctaid.x;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r6, %r5, %r7;
+       div.s32         %r8, %r1, %r2;
+       setp.lt.s32     %p1, %r8, %r4;
+       setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB8_2;
-       bra.uni         BB8_1;
+       @!%p3 bra       BB9_2;
+       bra.uni         BB9_1;
 
-BB8_1:
-       rem.s32         %r9, %r1, %r3;
+BB9_1:
+       rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r10, %r2, %r3, %r9;
-       mul.wide.s32    %rd5, %r10, 8;
+       mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
-       div.s32         %r11, %r9, %r4;
+       div.s32         %r10, %r9, %r3;
        cvta.to.global.u64      %rd7, %rd2;
-       mul.wide.s32    %rd8, %r11, 8;
+       mul.wide.s32    %rd8, %r10, 8;
        add.s64         %rd9, %rd7, %rd8;
        ld.global.f64   %fd1, [%rd9];
        ld.global.f64   %fd2, [%rd6];
@@ -534,7 +599,7 @@ BB8_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB8_2:
+BB9_2:
        ret;
 }
 
@@ -576,10 +641,10 @@ BB8_2:
        setp.lt.s32     %p1, %r8, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB9_6;
-       bra.uni         BB9_1;
+       @!%p3 bra       BB10_6;
+       bra.uni         BB10_1;
 
-BB9_1:
+BB10_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -589,26 +654,26 @@ BB9_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB9_5;
-       bra.uni         BB9_2;
+       @%p4 bra        BB10_5;
+       bra.uni         BB10_2;
 
-BB9_5:
+BB10_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB9_6;
+       bra.uni         BB10_6;
 
-BB9_2:
+BB10_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB9_4;
-       bra.uni         BB9_3;
+       @%p5 bra        BB10_4;
+       bra.uni         BB10_3;
 
-BB9_4:
+BB10_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB9_6;
+       bra.uni         BB10_6;
 
-BB9_3:
+BB10_3:
        st.global.f64   [%rd1], %fd6;
 
-BB9_6:
+BB10_6:
        ret;
 }
 
@@ -624,9 +689,9 @@ BB9_6:
        .param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-       .reg .pred      %p<77>;
-       .reg .b32       %r<65>;
-       .reg .f64       %fd<55>;
+       .reg .pred      %p<73>;
+       .reg .b32       %r<66>;
+       .reg .f64       %fd<56>;
        .reg .b64       %rd<19>;
 
 
@@ -647,93 +712,93 @@ BB9_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.gt.s32     %p3, %r10, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB10_73;
-       bra.uni         BB10_1;
+       @!%p4 bra       BB11_77;
+       bra.uni         BB11_1;
 
-BB10_1:
+BB11_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r63, %r1;
-       @%p5 bra        BB10_5;
+       mov.u32         %r64, %r1;
+       @%p5 bra        BB11_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r64, %r3;
-       @%p6 bra        BB10_4;
+       mov.u32         %r65, %r3;
+       @%p6 bra        BB11_4;
 
-       mov.u32         %r64, %r2;
+       mov.u32         %r65, %r2;
 
-BB10_4:
-       mov.u32         %r58, %r64;
-       mov.u32         %r4, %r58;
-       mov.u32         %r63, %r4;
+BB11_4:
+       mov.u32         %r59, %r65;
+       mov.u32         %r4, %r59;
+       mov.u32         %r64, %r4;
 
-BB10_5:
-       mov.u32         %r5, %r63;
+BB11_5:
+       mov.u32         %r5, %r64;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r61, %r1;
-       @%p7 bra        BB10_9;
+       mov.u32         %r62, %r1;
+       @%p7 bra        BB11_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r62, %r3;
-       @%p8 bra        BB10_8;
+       mov.u32         %r63, %r3;
+       @%p8 bra        BB11_8;
 
-       mov.u32         %r62, %r2;
+       mov.u32         %r63, %r2;
 
-BB10_8:
-       mov.u32         %r61, %r62;
+BB11_8:
+       mov.u32         %r62, %r63;
 
-BB10_9:
+BB11_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r61, 8;
+       mul.wide.s32    %rd9, %r62, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd54, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd55, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p9, %r13, 8;
-       @%p9 bra        BB10_26;
+       @%p9 bra        BB11_26;
 
        setp.gt.s32     %p23, %r13, 3;
-       @%p23 bra       BB10_18;
+       @%p23 bra       BB11_18;
 
        setp.gt.s32     %p30, %r13, 1;
-       @%p30 bra       BB10_15;
+       @%p30 bra       BB11_15;
 
        setp.eq.s32     %p33, %r13, 0;
-       @%p33 bra       BB10_71;
-       bra.uni         BB10_13;
+       @%p33 bra       BB11_75;
+       bra.uni         BB11_13;
 
-BB10_71:
-       add.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_75:
+       add.f64         %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_26:
+BB11_26:
        setp.gt.s32     %p10, %r13, 13;
-       @%p10 bra       BB10_35;
+       @%p10 bra       BB11_35;
 
        setp.gt.s32     %p17, %r13, 10;
-       @%p17 bra       BB10_31;
+       @%p17 bra       BB11_31;
 
        setp.eq.s32     %p21, %r13, 9;
-       @%p21 bra       BB10_53;
-       bra.uni         BB10_29;
+       @%p21 bra       BB11_55;
+       bra.uni         BB11_29;
 
-BB10_53:
-       setp.eq.f64     %p50, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB10_72;
+BB11_55:
+       setp.eq.f64     %p48, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB11_76;
 
-BB10_18:
+BB11_18:
        setp.gt.s32     %p24, %r13, 5;
-       @%p24 bra       BB10_22;
+       @%p24 bra       BB11_22;
 
        setp.eq.s32     %p28, %r13, 4;
-       @%p28 bra       BB10_56;
-       bra.uni         BB10_20;
+       @%p28 bra       BB11_58;
+       bra.uni         BB11_20;
 
-BB10_56:
+BB11_58:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -746,7 +811,7 @@ BB10_56:
        add.s32         %r32, %r31, -1012;
        mov.b64          %rd15, %fd2;
        shl.b64         %rd1, %rd15, %r32;
-       setp.eq.s64     %p55, %rd1, -9223372036854775808;
+       setp.eq.s64     %p53, %rd1, -9223372036854775808;
        abs.f64         %fd19, %fd1;
        // Callseq Start 0
        {
@@ -763,340 +828,342 @@ BB10_56:
        param0, 
        param1
        );
-       ld.param.f64    %fd53, [retval0+0];
+       ld.param.f64    %fd54, [retval0+0];
        
        //{
        }// Callseq End 0
-       setp.lt.s32     %p56, %r8, 0;
-       and.pred        %p1, %p56, %p55;
-       @!%p1 bra       BB10_58;
-       bra.uni         BB10_57;
+       setp.lt.s32     %p54, %r8, 0;
+       and.pred        %p1, %p54, %p53;
+       @!%p1 bra       BB11_60;
+       bra.uni         BB11_59;
 
-BB10_57:
+BB11_59:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r33}, %fd53;
+       mov.b64         {%temp, %r33}, %fd54;
        }
        xor.b32         %r34, %r33, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd53;
+       mov.b64         {%r35, %temp}, %fd54;
        }
-       mov.b64         %fd53, {%r35, %r34};
+       mov.b64         %fd54, {%r35, %r34};
 
-BB10_58:
-       mov.f64         %fd52, %fd53;
-       setp.eq.f64     %p57, %fd1, 0d0000000000000000;
-       @%p57 bra       BB10_61;
-       bra.uni         BB10_59;
+BB11_60:
+       mov.f64         %fd53, %fd54;
+       setp.eq.f64     %p55, %fd1, 0d0000000000000000;
+       @%p55 bra       BB11_63;
+       bra.uni         BB11_61;
 
-BB10_61:
-       selp.b32        %r36, %r8, 0, %p55;
+BB11_63:
+       selp.b32        %r36, %r8, 0, %p53;
        or.b32          %r37, %r36, 2146435072;
-       setp.lt.s32     %p61, %r9, 0;
-       selp.b32        %r38, %r37, %r36, %p61;
+       setp.lt.s32     %p59, %r9, 0;
+       selp.b32        %r38, %r37, %r36, %p59;
        mov.u32         %r39, 0;
-       mov.b64         %fd52, {%r39, %r38};
-       bra.uni         BB10_62;
+       mov.b64         %fd53, {%r39, %r38};
+       bra.uni         BB11_64;
 
-BB10_35:
+BB11_35:
        setp.gt.s32     %p11, %r13, 15;
-       @%p11 bra       BB10_39;
+       @%p11 bra       BB11_39;
 
        setp.eq.s32     %p15, %r13, 14;
-       @%p15 bra       BB10_50;
-       bra.uni         BB10_37;
+       @%p15 bra       BB11_52;
+       bra.uni         BB11_37;
 
-BB10_50:
+BB11_52:
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.rni.s64.f64 %rd12, %fd2;
        cvt.u32.u64     %r25, %rd11;
        cvt.u32.u64     %r26, %rd12;
        or.b32          %r27, %r26, %r25;
-       setp.eq.s32     %p47, %r27, 0;
-       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-       bra.uni         BB10_72;
+       setp.eq.s32     %p45, %r27, 0;
+       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
+       bra.uni         BB11_76;
 
-BB10_15:
+BB11_15:
        setp.eq.s32     %p31, %r13, 2;
-       @%p31 bra       BB10_70;
-       bra.uni         BB10_16;
+       @%p31 bra       BB11_74;
+       bra.uni         BB11_16;
 
-BB10_70:
-       mul.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_74:
+       mul.f64         %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_31:
+BB11_31:
        setp.eq.s32     %p18, %r13, 11;
-       @%p18 bra       BB10_52;
+       @%p18 bra       BB11_54;
 
        setp.eq.s32     %p19, %r13, 12;
-       @%p19 bra       BB10_51;
-       bra.uni         BB10_33;
+       @%p19 bra       BB11_53;
+       bra.uni         BB11_33;
 
-BB10_51:
-       max.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_53:
+       max.f64         %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_22:
+BB11_22:
        setp.eq.s32     %p25, %r13, 6;
-       @%p25 bra       BB10_55;
+       @%p25 bra       BB11_57;
 
        setp.eq.s32     %p26, %r13, 7;
-       @%p26 bra       BB10_54;
-       bra.uni         BB10_24;
+       @%p26 bra       BB11_56;
+       bra.uni         BB11_24;
 
-BB10_54:
-       setp.gt.f64     %p52, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-       bra.uni         BB10_72;
+BB11_56:
+       setp.gt.f64     %p50, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
+       bra.uni         BB11_76;
 
-BB10_39:
+BB11_39:
        setp.eq.s32     %p12, %r13, 16;
-       @%p12 bra       BB10_49;
+       @%p12 bra       BB11_51;
 
        setp.eq.s32     %p13, %r13, 17;
-       @%p13 bra       BB10_45;
-       bra.uni         BB10_41;
+       @%p13 bra       BB11_46;
+       bra.uni         BB11_41;
 
-BB10_45:
-       setp.eq.f64     %p39, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p40, %fd2, 0d8000000000000000;
-       or.pred         %p41, %p39, %p40;
-       mov.f64         %fd54, 0d7FF8000000000000;
-       @%p41 bra       BB10_72;
+BB11_46:
+       setp.eq.f64     %p38, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p39, %fd2, 0d8000000000000000;
+       or.pred         %p40, %p38, %p39;
+       mov.f64         %fd55, 0d7FF8000000000000;
+       @%p40 bra       BB11_76;
 
-       div.rn.f64      %fd54, %fd1, %fd2;
-       abs.f64         %fd39, %fd54;
-       setp.gtu.f64    %p42, %fd39, 0d7FF0000000000000;
-       @%p42 bra       BB10_72;
+       div.rn.f64      %fd55, %fd1, %fd2;
+       abs.f64         %fd39, %fd55;
+       setp.gtu.f64    %p41, %fd39, 0d7FF0000000000000;
+       @%p41 bra       BB11_76;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r22, %temp}, %fd54;
+       mov.b64         {%temp, %r22}, %fd55;
        }
+       and.b32         %r23, %r22, 2147483647;
+       setp.ne.s32     %p42, %r23, 2146435072;
+       @%p42 bra       BB11_50;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd54;
+       mov.b64         {%r24, %temp}, %fd55;
        }
-       and.b32         %r24, %r23, 2147483647;
-       setp.ne.s32     %p43, %r24, 2146435072;
-       setp.ne.s32     %p44, %r22, 0;
-       or.pred         %p45, %p43, %p44;
-       @!%p45 bra      BB10_72;
-       bra.uni         BB10_48;
-
-BB10_48:
-       cvt.rmi.f64.f64 %fd40, %fd54;
+       setp.eq.s32     %p43, %r24, 0;
+       @%p43 bra       BB11_76;
+
+BB11_50:
+       cvt.rmi.f64.f64 %fd40, %fd55;
        mul.f64         %fd41, %fd2, %fd40;
-       sub.f64         %fd54, %fd1, %fd41;
-       bra.uni         BB10_72;
+       sub.f64         %fd55, %fd1, %fd41;
+       bra.uni         BB11_76;
 
-BB10_13:
+BB11_13:
        setp.eq.s32     %p34, %r13, 1;
-       @%p34 bra       BB10_14;
-       bra.uni         BB10_72;
+       @%p34 bra       BB11_14;
+       bra.uni         BB11_76;
 
-BB10_14:
-       sub.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_14:
+       sub.f64         %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_29:
+BB11_29:
        setp.eq.s32     %p22, %r13, 10;
-       @%p22 bra       BB10_30;
-       bra.uni         BB10_72;
+       @%p22 bra       BB11_30;
+       bra.uni         BB11_76;
 
-BB10_30:
-       setp.neu.f64    %p49, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB10_72;
+BB11_30:
+       setp.neu.f64    %p47, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
+       bra.uni         BB11_76;
 
-BB10_20:
+BB11_20:
        setp.eq.s32     %p29, %r13, 5;
-       @%p29 bra       BB10_21;
-       bra.uni         BB10_72;
+       @%p29 bra       BB11_21;
+       bra.uni         BB11_76;
 
-BB10_21:
-       setp.lt.f64     %p54, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-       bra.uni         BB10_72;
+BB11_21:
+       setp.lt.f64     %p52, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
+       bra.uni         BB11_76;
 
-BB10_37:
+BB11_37:
        setp.eq.s32     %p16, %r13, 15;
-       @%p16 bra       BB10_38;
-       bra.uni         BB10_72;
+       @%p16 bra       BB11_38;
+       bra.uni         BB11_76;
 
-BB10_38:
+BB11_38:
        mul.f64         %fd43, %fd1, %fd2;
        mov.f64         %fd44, 0d3FF0000000000000;
-       sub.f64         %fd54, %fd44, %fd43;
-       bra.uni         BB10_72;
+       sub.f64         %fd55, %fd44, %fd43;
+       bra.uni         BB11_76;
 
-BB10_16:
+BB11_16:
        setp.eq.s32     %p32, %r13, 3;
-       @%p32 bra       BB10_17;
-       bra.uni         BB10_72;
+       @%p32 bra       BB11_17;
+       bra.uni         BB11_76;
 
-BB10_17:
-       div.rn.f64      %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_17:
+       div.rn.f64      %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_52:
-       min.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB10_72;
+BB11_54:
+       min.f64         %fd55, %fd1, %fd2;
+       bra.uni         BB11_76;
 
-BB10_33:
+BB11_33:
        setp.eq.s32     %p20, %r13, 13;
-       @%p20 bra       BB10_34;
-       bra.uni         BB10_72;
+       @%p20 bra       BB11_34;
+       bra.uni         BB11_76;
 
-BB10_34:
+BB11_34:
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.rni.s64.f64 %rd14, %fd2;
        cvt.u32.u64     %r28, %rd13;
        cvt.u32.u64     %r29, %rd14;
        and.b32         %r30, %r29, %r28;
-       setp.eq.s32     %p48, %r30, 0;
-       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-       bra.uni         BB10_72;
+       setp.eq.s32     %p46, %r30, 0;
+       selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
+       bra.uni         BB11_76;
 
-BB10_55:
-       setp.le.f64     %p53, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-       bra.uni         BB10_72;
+BB11_57:
+       setp.le.f64     %p51, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p51;
+       bra.uni         BB11_76;
 
-BB10_24:
+BB11_24:
        setp.eq.s32     %p27, %r13, 8;
-       @%p27 bra       BB10_25;
-       bra.uni         BB10_72;
+       @%p27 bra       BB11_25;
+       bra.uni         BB11_76;
 
-BB10_25:
-       setp.ge.f64     %p51, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-       bra.uni         BB10_72;
+BB11_25:
+       setp.ge.f64     %p49, %fd1, %fd2;
+       selp.f64        %fd55, 0d3FF0000000000000, 0d0000000000000000, %p49;
+       bra.uni         BB11_76;
 
-BB10_49:
-       setp.neu.f64    %p46, %fd1, 0d0000000000000000;
+BB11_51:
+       setp.neu.f64    %p44, %fd1, 0d0000000000000000;
        sub.f64         %fd42, %fd1, %fd2;
-       selp.f64        %fd54, %fd42, 0d0000000000000000, %p46;
-       bra.uni         BB10_72;
+       selp.f64        %fd55, %fd42, 0d0000000000000000, %p44;
+       bra.uni         BB11_76;
 
-BB10_41:
+BB11_41:
        setp.ne.s32     %p14, %r13, 18;
-       @%p14 bra       BB10_72;
+       @%p14 bra       BB11_76;
 
-       div.rn.f64      %fd54, %fd1, %fd2;
-       abs.f64         %fd37, %fd54;
+       div.rn.f64      %fd55, %fd1, %fd2;
+       abs.f64         %fd37, %fd55;
        setp.gtu.f64    %p35, %fd37, 0d7FF0000000000000;
-       @%p35 bra       BB10_72;
+       @%p35 bra       BB11_76;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r19, %temp}, %fd54;
+       mov.b64         {%temp, %r19}, %fd55;
        }
+       and.b32         %r20, %r19, 2147483647;
+       setp.ne.s32     %p36, %r20, 2146435072;
+       @%p36 bra       BB11_45;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r20}, %fd54;
+       mov.b64         {%r21, %temp}, %fd55;
        }
-       and.b32         %r21, %r20, 2147483647;
-       setp.ne.s32     %p36, %r21, 2146435072;
-       setp.ne.s32     %p37, %r19, 0;
-       or.pred         %p38, %p36, %p37;
-       @!%p38 bra      BB10_72;
-       bra.uni         BB10_44;
+       setp.eq.s32     %p37, %r21, 0;
+       @%p37 bra       BB11_76;
 
-BB10_44:
-       cvt.rmi.f64.f64 %fd54, %fd54;
-       bra.uni         BB10_72;
+BB11_45:
+       cvt.rmi.f64.f64 %fd55, %fd55;
+       bra.uni         BB11_76;
 
-BB10_59:
-       setp.gt.s32     %p58, %r8, -1;
-       @%p58 bra       BB10_62;
+BB11_61:
+       setp.gt.s32     %p56, %r8, -1;
+       @%p56 bra       BB11_64;
 
        cvt.rzi.f64.f64 %fd45, %fd2;
-       setp.neu.f64    %p59, %fd45, %fd2;
-       selp.f64        %fd52, 0dFFF8000000000000, %fd52, %p59;
+       setp.neu.f64    %p57, %fd45, %fd2;
+       selp.f64        %fd53, 0dFFF8000000000000, %fd53, %p57;
 
-BB10_62:
-       mov.f64         %fd25, %fd52;
+BB11_64:
+       mov.f64         %fd25, %fd53;
        add.f64         %fd26, %fd1, %fd2;
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r40}, %fd26;
        }
        and.b32         %r41, %r40, 2146435072;
-       setp.ne.s32     %p62, %r41, 2146435072;
-       mov.f64         %fd51, %fd25;
-       @%p62 bra       BB10_69;
+       setp.ne.s32     %p60, %r41, 2146435072;
+       mov.f64         %fd52, %fd25;
+       @%p60 bra       BB11_73;
 
-       setp.gtu.f64    %p63, %fd19, 0d7FF0000000000000;
-       mov.f64         %fd51, %fd26;
-       @%p63 bra       BB10_69;
+       setp.gtu.f64    %p61, %fd19, 0d7FF0000000000000;
+       mov.f64         %fd52, %fd26;
+       @%p61 bra       BB11_73;
 
        abs.f64         %fd46, %fd2;
-       setp.gtu.f64    %p64, %fd46, 0d7FF0000000000000;
-       mov.f64         %fd50, %fd26;
-       mov.f64         %fd51, %fd50;
-       @%p64 bra       BB10_69;
+       setp.gtu.f64    %p62, %fd46, 0d7FF0000000000000;
+       mov.f64         %fd51, %fd26;
+       mov.f64         %fd52, %fd51;
+       @%p62 bra       BB11_73;
+
+       and.b32         %r42, %r9, 2147483647;
+       setp.ne.s32     %p63, %r42, 2146435072;
+       @%p63 bra       BB11_69;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r42, %temp}, %fd2;
+       mov.b64         {%r43, %temp}, %fd2;
        }
-       and.b32         %r43, %r9, 2147483647;
-       setp.eq.s32     %p65, %r43, 2146435072;
-       setp.eq.s32     %p66, %r42, 0;
-       and.pred        %p67, %p65, %p66;
-       @%p67 bra       BB10_68;
-       bra.uni         BB10_66;
-
-BB10_68:
-       setp.gt.f64     %p71, %fd19, 0d3FF0000000000000;
-       selp.b32        %r51, 2146435072, 0, %p71;
-       xor.b32         %r52, %r51, 2146435072;
-       setp.lt.s32     %p72, %r9, 0;
-       selp.b32        %r53, %r52, %r51, %p72;
-       setp.eq.f64     %p73, %fd1, 0dBFF0000000000000;
-       selp.b32        %r54, 1072693248, %r53, %p73;
-       mov.u32         %r55, 0;
-       mov.b64         %fd51, {%r55, %r54};
-       bra.uni         BB10_69;
-
-BB10_66:
+       setp.eq.s32     %p64, %r43, 0;
+       @%p64 bra       BB11_72;
+
+BB11_69:
+       and.b32         %r44, %r8, 2147483647;
+       setp.ne.s32     %p65, %r44, 2146435072;
+       mov.f64         %fd49, %fd25;
+       mov.f64         %fd52, %fd49;
+       @%p65 bra       BB11_73;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%r44, %temp}, %fd1;
+       mov.b64         {%r45, %temp}, %fd1;
        }
-       and.b32         %r45, %r8, 2147483647;
-       setp.eq.s32     %p68, %r45, 2146435072;
-       setp.eq.s32     %p69, %r44, 0;
-       and.pred        %p70, %p68, %p69;
-       mov.f64         %fd51, %fd25;
-       @!%p70 bra      BB10_69;
-       bra.uni         BB10_67;
-
-BB10_67:
+       setp.ne.s32     %p66, %r45, 0;
+       mov.f64         %fd52, %fd25;
+       @%p66 bra       BB11_73;
+
        shr.s32         %r46, %r9, 31;
        and.b32         %r47, %r46, -2146435072;
-       selp.b32        %r48, -1048576, 2146435072, %p1;
-       add.s32         %r49, %r48, %r47;
-       mov.u32         %r50, 0;
-       mov.b64         %fd51, {%r50, %r49};
-
-BB10_69:
-       setp.eq.f64     %p74, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p75, %fd1, 0d3FF0000000000000;
-       or.pred         %p76, %p75, %p74;
-       selp.f64        %fd54, 0d3FF0000000000000, %fd51, %p76;
-
-BB10_72:
+       add.s32         %r48, %r47, 2146435072;
+       or.b32          %r49, %r48, -2147483648;
+       selp.b32        %r50, %r49, %r48, %p1;
+       mov.u32         %r51, 0;
+       mov.b64         %fd52, {%r51, %r50};
+       bra.uni         BB11_73;
+
+BB11_72:
+       setp.gt.f64     %p67, %fd19, 0d3FF0000000000000;
+       selp.b32        %r52, 2146435072, 0, %p67;
+       xor.b32         %r53, %r52, 2146435072;
+       setp.lt.s32     %p68, %r9, 0;
+       selp.b32        %r54, %r53, %r52, %p68;
+       setp.eq.f64     %p69, %fd1, 0dBFF0000000000000;
+       selp.b32        %r55, 1072693248, %r54, %p69;
+       mov.u32         %r56, 0;
+       mov.b64         %fd52, {%r56, %r55};
+
+BB11_73:
+       setp.eq.f64     %p70, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p71, %fd1, 0d3FF0000000000000;
+       or.pred         %p72, %p71, %p70;
+       selp.f64        %fd55, 0d3FF0000000000000, %fd52, %p72;
+
+BB11_76:
        cvta.to.global.u64      %rd16, %rd4;
        mul.wide.s32    %rd17, %r3, 8;
        add.s64         %rd18, %rd16, %rd17;
-       st.global.f64   [%rd18], %fd54;
+       st.global.f64   [%rd18], %fd55;
        bar.sync        0;
 
-BB10_73:
+BB11_77:
        ret;
 }
 
@@ -1110,9 +1177,9 @@ BB10_73:
        .param .u32 matrix_scalar_op_param_5
 )
 {
-       .reg .pred      %p<141>;
-       .reg .b32       %r<86>;
-       .reg .f64       %fd<107>;
+       .reg .pred      %p<133>;
+       .reg .b32       %r<88>;
+       .reg .f64       %fd<109>;
        .reg .b64       %rd<20>;
 
 
@@ -1127,7 +1194,7 @@ BB10_73:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r9, %r10, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB11_130;
+       @%p3 bra        BB12_138;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -1136,86 +1203,86 @@ BB10_73:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB11_66;
+       @%p4 bra        BB12_70;
 
-       mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd99, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 8;
-       @%p5 bra        BB11_19;
+       @%p5 bra        BB12_19;
 
        setp.gt.s32     %p19, %r6, 3;
-       @%p19 bra       BB11_11;
+       @%p19 bra       BB12_11;
 
        setp.gt.s32     %p26, %r6, 1;
-       @%p26 bra       BB11_8;
+       @%p26 bra       BB12_8;
 
        setp.eq.s32     %p29, %r6, 0;
-       @%p29 bra       BB11_64;
-       bra.uni         BB11_6;
+       @%p29 bra       BB12_68;
+       bra.uni         BB12_6;
 
-BB11_64:
-       add.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB11_65;
+BB12_68:
+       add.f64         %fd99, %fd1, %fd68;
+       bra.uni         BB12_69;
 
-BB11_66:
-       mov.f64         %fd106, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p73, %r6, 8;
-       @%p73 bra       BB11_83;
+BB12_70:
+       mov.f64         %fd108, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p69, %r6, 8;
+       @%p69 bra       BB12_87;
 
-       setp.gt.s32     %p87, %r6, 3;
-       @%p87 bra       BB11_75;
+       setp.gt.s32     %p83, %r6, 3;
+       @%p83 bra       BB12_79;
 
-       setp.gt.s32     %p94, %r6, 1;
-       @%p94 bra       BB11_72;
+       setp.gt.s32     %p90, %r6, 1;
+       @%p90 bra       BB12_76;
 
-       setp.eq.s32     %p97, %r6, 0;
-       @%p97 bra       BB11_128;
-       bra.uni         BB11_70;
+       setp.eq.s32     %p93, %r6, 0;
+       @%p93 bra       BB12_136;
+       bra.uni         BB12_74;
 
-BB11_128:
-       add.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
+BB12_136:
+       add.f64         %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
 
-BB11_19:
+BB12_19:
        setp.gt.s32     %p6, %r6, 13;
-       @%p6 bra        BB11_28;
+       @%p6 bra        BB12_28;
 
        setp.gt.s32     %p13, %r6, 10;
-       @%p13 bra       BB11_24;
+       @%p13 bra       BB12_24;
 
        setp.eq.s32     %p17, %r6, 9;
-       @%p17 bra       BB11_46;
-       bra.uni         BB11_22;
+       @%p17 bra       BB12_48;
+       bra.uni         BB12_22;
 
-BB11_46:
-       setp.eq.f64     %p46, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-       bra.uni         BB11_65;
+BB12_48:
+       setp.eq.f64     %p44, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
+       bra.uni         BB12_69;
 
-BB11_83:
-       setp.gt.s32     %p74, %r6, 13;
-       @%p74 bra       BB11_92;
+BB12_87:
+       setp.gt.s32     %p70, %r6, 13;
+       @%p70 bra       BB12_96;
 
-       setp.gt.s32     %p81, %r6, 10;
-       @%p81 bra       BB11_88;
+       setp.gt.s32     %p77, %r6, 10;
+       @%p77 bra       BB12_92;
 
-       setp.eq.s32     %p85, %r6, 9;
-       @%p85 bra       BB11_110;
-       bra.uni         BB11_86;
+       setp.eq.s32     %p81, %r6, 9;
+       @%p81 bra       BB12_116;
+       bra.uni         BB12_90;
 
-BB11_110:
-       setp.eq.f64     %p114, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-       bra.uni         BB11_129;
+BB12_116:
+       setp.eq.f64     %p108, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
+       bra.uni         BB12_137;
 
-BB11_11:
+BB12_11:
        setp.gt.s32     %p20, %r6, 5;
-       @%p20 bra       BB11_15;
+       @%p20 bra       BB12_15;
 
        setp.eq.s32     %p24, %r6, 4;
-       @%p24 bra       BB11_49;
-       bra.uni         BB11_13;
+       @%p24 bra       BB12_51;
+       bra.uni         BB12_13;
 
-BB11_49:
+BB12_51:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd68;
@@ -1228,7 +1295,7 @@ BB11_49:
        add.s32         %r25, %r24, -1012;
        mov.b64          %rd14, %fd1;
        shl.b64         %rd2, %rd14, %r25;
-       setp.eq.s64     %p51, %rd2, -9223372036854775808;
+       setp.eq.s64     %p49, %rd2, -9223372036854775808;
        abs.f64         %fd18, %fd68;
        // Callseq Start 1
        {
@@ -1245,69 +1312,69 @@ BB11_49:
        param0, 
        param1
        );
-       ld.param.f64    %fd97, [retval0+0];
+       ld.param.f64    %fd98, [retval0+0];
        
        //{
        }// Callseq End 1
-       setp.lt.s32     %p52, %r2, 0;
-       and.pred        %p1, %p52, %p51;
-       @!%p1 bra       BB11_51;
-       bra.uni         BB11_50;
+       setp.lt.s32     %p50, %r2, 0;
+       and.pred        %p1, %p50, %p49;
+       @!%p1 bra       BB12_53;
+       bra.uni         BB12_52;
 
-BB11_50:
+BB12_52:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r26}, %fd97;
+       mov.b64         {%temp, %r26}, %fd98;
        }
        xor.b32         %r27, %r26, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r28, %temp}, %fd97;
+       mov.b64         {%r28, %temp}, %fd98;
        }
-       mov.b64         %fd97, {%r28, %r27};
+       mov.b64         %fd98, {%r28, %r27};
 
-BB11_51:
-       mov.f64         %fd96, %fd97;
-       setp.eq.f64     %p53, %fd68, 0d0000000000000000;
-       @%p53 bra       BB11_54;
-       bra.uni         BB11_52;
+BB12_53:
+       mov.f64         %fd97, %fd98;
+       setp.eq.f64     %p51, %fd68, 0d0000000000000000;
+       @%p51 bra       BB12_56;
+       bra.uni         BB12_54;
 
-BB11_54:
-       selp.b32        %r29, %r2, 0, %p51;
+BB12_56:
+       selp.b32        %r29, %r2, 0, %p49;
        or.b32          %r30, %r29, 2146435072;
-       setp.lt.s32     %p57, %r3, 0;
-       selp.b32        %r31, %r30, %r29, %p57;
+       setp.lt.s32     %p55, %r3, 0;
+       selp.b32        %r31, %r30, %r29, %p55;
        mov.u32         %r32, 0;
-       mov.b64         %fd96, {%r32, %r31};
-       bra.uni         BB11_55;
+       mov.b64         %fd97, {%r32, %r31};
+       bra.uni         BB12_57;
 
-BB11_28:
+BB12_28:
        setp.gt.s32     %p7, %r6, 15;
-       @%p7 bra        BB11_32;
+       @%p7 bra        BB12_32;
 
        setp.eq.s32     %p11, %r6, 14;
-       @%p11 bra       BB11_43;
-       bra.uni         BB11_30;
+       @%p11 bra       BB12_45;
+       bra.uni         BB12_30;
 
-BB11_43:
+BB12_45:
        cvt.rni.s64.f64 %rd10, %fd68;
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.u32.u64     %r18, %rd10;
        cvt.u32.u64     %r19, %rd11;
        or.b32          %r20, %r19, %r18;
-       setp.eq.s32     %p43, %r20, 0;
-       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-       bra.uni         BB11_65;
+       setp.eq.s32     %p41, %r20, 0;
+       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
+       bra.uni         BB12_69;
 
-BB11_75:
-       setp.gt.s32     %p88, %r6, 5;
-       @%p88 bra       BB11_79;
+BB12_79:
+       setp.gt.s32     %p84, %r6, 5;
+       @%p84 bra       BB12_83;
 
-       setp.eq.s32     %p92, %r6, 4;
-       @%p92 bra       BB11_113;
-       bra.uni         BB11_77;
+       setp.eq.s32     %p88, %r6, 4;
+       @%p88 bra       BB12_119;
+       bra.uni         BB12_81;
 
-BB11_113:
+BB12_119:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -1316,11 +1383,11 @@ BB11_113:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd68;
        }
-       bfe.u32         %r61, %r5, 20, 11;
-       add.s32         %r62, %r61, -1012;
+       bfe.u32         %r62, %r5, 20, 11;
+       add.s32         %r63, %r62, -1012;
        mov.b64          %rd19, %fd68;
-       shl.b64         %rd3, %rd19, %r62;
-       setp.eq.s64     %p119, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd19, %r63;
+       setp.eq.s64     %p113, %rd3, -9223372036854775808;
        abs.f64         %fd51, %fd1;
        // Callseq Start 2
        {
@@ -1337,612 +1404,616 @@ BB11_113:
        param0, 
        param1
        );
-       ld.param.f64    %fd105, [retval0+0];
+       ld.param.f64    %fd107, [retval0+0];
        
        //{
        }// Callseq End 2
-       setp.lt.s32     %p120, %r4, 0;
-       and.pred        %p2, %p120, %p119;
-       @!%p2 bra       BB11_115;
-       bra.uni         BB11_114;
+       setp.lt.s32     %p114, %r4, 0;
+       and.pred        %p2, %p114, %p113;
+       @!%p2 bra       BB12_121;
+       bra.uni         BB12_120;
 
-BB11_114:
+BB12_120:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r63}, %fd105;
+       mov.b64         {%temp, %r64}, %fd107;
        }
-       xor.b32         %r64, %r63, -2147483648;
+       xor.b32         %r65, %r64, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r65, %temp}, %fd105;
+       mov.b64         {%r66, %temp}, %fd107;
        }
-       mov.b64         %fd105, {%r65, %r64};
-
-BB11_115:
-       mov.f64         %fd104, %fd105;
-       setp.eq.f64     %p121, %fd1, 0d0000000000000000;
-       @%p121 bra      BB11_118;
-       bra.uni         BB11_116;
-
-BB11_118:
-       selp.b32        %r66, %r4, 0, %p119;
-       or.b32          %r67, %r66, 2146435072;
-       setp.lt.s32     %p125, %r5, 0;
-       selp.b32        %r68, %r67, %r66, %p125;
-       mov.u32         %r69, 0;
-       mov.b64         %fd104, {%r69, %r68};
-       bra.uni         BB11_119;
-
-BB11_92:
-       setp.gt.s32     %p75, %r6, 15;
-       @%p75 bra       BB11_96;
-
-       setp.eq.s32     %p79, %r6, 14;
-       @%p79 bra       BB11_107;
-       bra.uni         BB11_94;
-
-BB11_107:
+       mov.b64         %fd107, {%r66, %r65};
+
+BB12_121:
+       mov.f64         %fd106, %fd107;
+       setp.eq.f64     %p115, %fd1, 0d0000000000000000;
+       @%p115 bra      BB12_124;
+       bra.uni         BB12_122;
+
+BB12_124:
+       selp.b32        %r67, %r4, 0, %p113;
+       or.b32          %r68, %r67, 2146435072;
+       setp.lt.s32     %p119, %r5, 0;
+       selp.b32        %r69, %r68, %r67, %p119;
+       mov.u32         %r70, 0;
+       mov.b64         %fd106, {%r70, %r69};
+       bra.uni         BB12_125;
+
+BB12_96:
+       setp.gt.s32     %p71, %r6, 15;
+       @%p71 bra       BB12_100;
+
+       setp.eq.s32     %p75, %r6, 14;
+       @%p75 bra       BB12_113;
+       bra.uni         BB12_98;
+
+BB12_113:
        cvt.rni.s64.f64 %rd15, %fd1;
        cvt.rni.s64.f64 %rd16, %fd68;
-       cvt.u32.u64     %r55, %rd15;
-       cvt.u32.u64     %r56, %rd16;
-       or.b32          %r57, %r56, %r55;
-       setp.eq.s32     %p111, %r57, 0;
-       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-       bra.uni         BB11_129;
-
-BB11_8:
+       cvt.u32.u64     %r56, %rd15;
+       cvt.u32.u64     %r57, %rd16;
+       or.b32          %r58, %r57, %r56;
+       setp.eq.s32     %p105, %r58, 0;
+       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
+       bra.uni         BB12_137;
+
+BB12_8:
        setp.eq.s32     %p27, %r6, 2;
-       @%p27 bra       BB11_63;
-       bra.uni         BB11_9;
+       @%p27 bra       BB12_67;
+       bra.uni         BB12_9;
 
-BB11_63:
-       mul.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB11_65;
+BB12_67:
+       mul.f64         %fd99, %fd1, %fd68;
+       bra.uni         BB12_69;
 
-BB11_24:
+BB12_24:
        setp.eq.s32     %p14, %r6, 11;
-       @%p14 bra       BB11_45;
+       @%p14 bra       BB12_47;
 
        setp.eq.s32     %p15, %r6, 12;
-       @%p15 bra       BB11_44;
-       bra.uni         BB11_26;
+       @%p15 bra       BB12_46;
+       bra.uni         BB12_26;
 
-BB11_44:
-       max.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB11_65;
+BB12_46:
+       max.f64         %fd99, %fd68, %fd1;
+       bra.uni         BB12_69;
 
-BB11_15:
+BB12_15:
        setp.eq.s32     %p21, %r6, 6;
-       @%p21 bra       BB11_48;
+       @%p21 bra       BB12_50;
 
        setp.eq.s32     %p22, %r6, 7;
-       @%p22 bra       BB11_47;
-       bra.uni         BB11_17;
+       @%p22 bra       BB12_49;
+       bra.uni         BB12_17;
 
-BB11_47:
-       setp.lt.f64     %p48, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-       bra.uni         BB11_65;
+BB12_49:
+       setp.lt.f64     %p46, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
+       bra.uni         BB12_69;
 
-BB11_32:
+BB12_32:
        setp.eq.s32     %p8, %r6, 16;
-       @%p8 bra        BB11_42;
+       @%p8 bra        BB12_44;
 
        setp.eq.s32     %p9, %r6, 17;
-       @%p9 bra        BB11_38;
-       bra.uni         BB11_34;
+       @%p9 bra        BB12_39;
+       bra.uni         BB12_34;
 
-BB11_38:
-       setp.eq.f64     %p35, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p36, %fd1, 0d8000000000000000;
-       or.pred         %p37, %p35, %p36;
-       mov.f64         %fd98, 0d7FF8000000000000;
-       @%p37 bra       BB11_65;
+BB12_39:
+       setp.eq.f64     %p34, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p35, %fd1, 0d8000000000000000;
+       or.pred         %p36, %p34, %p35;
+       mov.f64         %fd99, 0d7FF8000000000000;
+       @%p36 bra       BB12_69;
 
-       div.rn.f64      %fd98, %fd68, %fd1;
-       abs.f64         %fd72, %fd98;
-       setp.gtu.f64    %p38, %fd72, 0d7FF0000000000000;
-       @%p38 bra       BB11_65;
+       div.rn.f64      %fd99, %fd68, %fd1;
+       abs.f64         %fd72, %fd99;
+       setp.gtu.f64    %p37, %fd72, 0d7FF0000000000000;
+       @%p37 bra       BB12_69;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r15, %temp}, %fd98;
+       mov.b64         {%temp, %r15}, %fd99;
        }
+       and.b32         %r16, %r15, 2147483647;
+       setp.ne.s32     %p38, %r16, 2146435072;
+       @%p38 bra       BB12_43;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r16}, %fd98;
+       mov.b64         {%r17, %temp}, %fd99;
        }
-       and.b32         %r17, %r16, 2147483647;
-       setp.ne.s32     %p39, %r17, 2146435072;
-       setp.ne.s32     %p40, %r15, 0;
-       or.pred         %p41, %p39, %p40;
-       @!%p41 bra      BB11_65;
-       bra.uni         BB11_41;
+       setp.eq.s32     %p39, %r17, 0;
+       @%p39 bra       BB12_69;
 
-BB11_41:
-       cvt.rmi.f64.f64 %fd73, %fd98;
+BB12_43:
+       cvt.rmi.f64.f64 %fd73, %fd99;
        mul.f64         %fd74, %fd1, %fd73;
-       sub.f64         %fd98, %fd68, %fd74;
-       bra.uni         BB11_65;
-
-BB11_72:
-       setp.eq.s32     %p95, %r6, 2;
-       @%p95 bra       BB11_127;
-       bra.uni         BB11_73;
-
-BB11_127:
-       mul.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
-
-BB11_88:
-       setp.eq.s32     %p82, %r6, 11;
-       @%p82 bra       BB11_109;
-
-       setp.eq.s32     %p83, %r6, 12;
-       @%p83 bra       BB11_108;
-       bra.uni         BB11_90;
-
-BB11_108:
-       max.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
-
-BB11_79:
-       setp.eq.s32     %p89, %r6, 6;
-       @%p89 bra       BB11_112;
-
-       setp.eq.s32     %p90, %r6, 7;
-       @%p90 bra       BB11_111;
-       bra.uni         BB11_81;
-
-BB11_111:
-       setp.gt.f64     %p116, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-       bra.uni         BB11_129;
-
-BB11_96:
-       setp.eq.s32     %p76, %r6, 16;
-       @%p76 bra       BB11_106;
-
-       setp.eq.s32     %p77, %r6, 17;
-       @%p77 bra       BB11_102;
-       bra.uni         BB11_98;
-
-BB11_102:
-       setp.eq.f64     %p103, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p104, %fd68, 0d8000000000000000;
-       or.pred         %p105, %p103, %p104;
-       mov.f64         %fd106, 0d7FF8000000000000;
-       @%p105 bra      BB11_129;
-
-       div.rn.f64      %fd106, %fd1, %fd68;
-       abs.f64         %fd83, %fd106;
-       setp.gtu.f64    %p106, %fd83, 0d7FF0000000000000;
-       @%p106 bra      BB11_129;
+       sub.f64         %fd99, %fd68, %fd74;
+       bra.uni         BB12_69;
+
+BB12_76:
+       setp.eq.s32     %p91, %r6, 2;
+       @%p91 bra       BB12_135;
+       bra.uni         BB12_77;
+
+BB12_135:
+       mul.f64         %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
+
+BB12_92:
+       setp.eq.s32     %p78, %r6, 11;
+       @%p78 bra       BB12_115;
+
+       setp.eq.s32     %p79, %r6, 12;
+       @%p79 bra       BB12_114;
+       bra.uni         BB12_94;
+
+BB12_114:
+       max.f64         %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
+
+BB12_83:
+       setp.eq.s32     %p85, %r6, 6;
+       @%p85 bra       BB12_118;
+
+       setp.eq.s32     %p86, %r6, 7;
+       @%p86 bra       BB12_117;
+       bra.uni         BB12_85;
+
+BB12_117:
+       setp.gt.f64     %p110, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
+       bra.uni         BB12_137;
+
+BB12_100:
+       setp.eq.s32     %p72, %r6, 16;
+       @%p72 bra       BB12_112;
+
+       setp.eq.s32     %p73, %r6, 17;
+       @%p73 bra       BB12_107;
+       bra.uni         BB12_102;
+
+BB12_107:
+       setp.eq.f64     %p98, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p99, %fd68, 0d8000000000000000;
+       or.pred         %p100, %p98, %p99;
+       mov.f64         %fd108, 0d7FF8000000000000;
+       @%p100 bra      BB12_137;
+
+       div.rn.f64      %fd108, %fd1, %fd68;
+       abs.f64         %fd83, %fd108;
+       setp.gtu.f64    %p101, %fd83, 0d7FF0000000000000;
+       @%p101 bra      BB12_137;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r52, %temp}, %fd106;
+       mov.b64         {%temp, %r53}, %fd108;
        }
+       and.b32         %r54, %r53, 2147483647;
+       setp.ne.s32     %p102, %r54, 2146435072;
+       @%p102 bra      BB12_111;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r53}, %fd106;
+       mov.b64         {%r55, %temp}, %fd108;
        }
-       and.b32         %r54, %r53, 2147483647;
-       setp.ne.s32     %p107, %r54, 2146435072;
-       setp.ne.s32     %p108, %r52, 0;
-       or.pred         %p109, %p107, %p108;
-       @!%p109 bra     BB11_129;
-       bra.uni         BB11_105;
-
-BB11_105:
-       cvt.rmi.f64.f64 %fd84, %fd106;
+       setp.eq.s32     %p103, %r55, 0;
+       @%p103 bra      BB12_137;
+
+BB12_111:
+       cvt.rmi.f64.f64 %fd84, %fd108;
        mul.f64         %fd85, %fd84, %fd68;
-       sub.f64         %fd106, %fd1, %fd85;
-       bra.uni         BB11_129;
+       sub.f64         %fd108, %fd1, %fd85;
+       bra.uni         BB12_137;
 
-BB11_6:
+BB12_6:
        setp.eq.s32     %p30, %r6, 1;
-       @%p30 bra       BB11_7;
-       bra.uni         BB11_65;
+       @%p30 bra       BB12_7;
+       bra.uni         BB12_69;
 
-BB11_7:
-       sub.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB11_65;
+BB12_7:
+       sub.f64         %fd99, %fd68, %fd1;
+       bra.uni         BB12_69;
 
-BB11_22:
+BB12_22:
        setp.eq.s32     %p18, %r6, 10;
-       @%p18 bra       BB11_23;
-       bra.uni         BB11_65;
+       @%p18 bra       BB12_23;
+       bra.uni         BB12_69;
 
-BB11_23:
-       setp.neu.f64    %p45, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-       bra.uni         BB11_65;
+BB12_23:
+       setp.neu.f64    %p43, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
+       bra.uni         BB12_69;
 
-BB11_13:
+BB12_13:
        setp.eq.s32     %p25, %r6, 5;
-       @%p25 bra       BB11_14;
-       bra.uni         BB11_65;
+       @%p25 bra       BB12_14;
+       bra.uni         BB12_69;
 
-BB11_14:
-       setp.gt.f64     %p50, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB11_65;
+BB12_14:
+       setp.gt.f64     %p48, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB12_69;
 
-BB11_30:
+BB12_30:
        setp.eq.s32     %p12, %r6, 15;
-       @%p12 bra       BB11_31;
-       bra.uni         BB11_65;
+       @%p12 bra       BB12_31;
+       bra.uni         BB12_69;
 
-BB11_31:
+BB12_31:
        mul.f64         %fd76, %fd1, %fd68;
        mov.f64         %fd77, 0d3FF0000000000000;
-       sub.f64         %fd98, %fd77, %fd76;
-       bra.uni         BB11_65;
+       sub.f64         %fd99, %fd77, %fd76;
+       bra.uni         BB12_69;
 
-BB11_9:
+BB12_9:
        setp.eq.s32     %p28, %r6, 3;
-       @%p28 bra       BB11_10;
-       bra.uni         BB11_65;
+       @%p28 bra       BB12_10;
+       bra.uni         BB12_69;
 
-BB11_10:
-       div.rn.f64      %fd98, %fd68, %fd1;
-       bra.uni         BB11_65;
+BB12_10:
+       div.rn.f64      %fd99, %fd68, %fd1;
+       bra.uni         BB12_69;
 
-BB11_45:
-       min.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB11_65;
+BB12_47:
+       min.f64         %fd99, %fd68, %fd1;
+       bra.uni         BB12_69;
 
-BB11_26:
+BB12_26:
        setp.eq.s32     %p16, %r6, 13;
-       @%p16 bra       BB11_27;
-       bra.uni         BB11_65;
+       @%p16 bra       BB12_27;
+       bra.uni         BB12_69;
 
-BB11_27:
+BB12_27:
        cvt.rni.s64.f64 %rd12, %fd68;
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.u32.u64     %r21, %rd12;
        cvt.u32.u64     %r22, %rd13;
        and.b32         %r23, %r22, %r21;
-       setp.eq.s32     %p44, %r23, 0;
-       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-       bra.uni         BB11_65;
+       setp.eq.s32     %p42, %r23, 0;
+       selp.f64        %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
+       bra.uni         BB12_69;
 
-BB11_48:
-       setp.ge.f64     %p49, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB11_65;
+BB12_50:
+       setp.ge.f64     %p47, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p47;
+       bra.uni         BB12_69;
 
-BB11_17:
+BB12_17:
        setp.eq.s32     %p23, %r6, 8;
-       @%p23 bra       BB11_18;
-       bra.uni         BB11_65;
+       @%p23 bra       BB12_18;
+       bra.uni         BB12_69;
 
-BB11_18:
-       setp.le.f64     %p47, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
-       bra.uni         BB11_65;
+BB12_18:
+       setp.le.f64     %p45, %fd1, %fd68;
+       selp.f64        %fd99, 0d3FF0000000000000, 0d0000000000000000, %p45;
+       bra.uni         BB12_69;
 
-BB11_42:
-       setp.neu.f64    %p42, %fd68, 0d0000000000000000;
+BB12_44:
+       setp.neu.f64    %p40, %fd68, 0d0000000000000000;
        sub.f64         %fd75, %fd68, %fd1;
-       selp.f64        %fd98, %fd75, 0d0000000000000000, %p42;
-       bra.uni         BB11_65;
+       selp.f64        %fd99, %fd75, 0d0000000000000000, %p40;
+       bra.uni         BB12_69;
 
-BB11_34:
+BB12_34:
        setp.ne.s32     %p10, %r6, 18;
-       @%p10 bra       BB11_65;
+       @%p10 bra       BB12_69;
 
-       div.rn.f64      %fd98, %fd68, %fd1;
-       abs.f64         %fd70, %fd98;
+       div.rn.f64      %fd99, %fd68, %fd1;
+       abs.f64         %fd70, %fd99;
        setp.gtu.f64    %p31, %fd70, 0d7FF0000000000000;
-       @%p31 bra       BB11_65;
+       @%p31 bra       BB12_69;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r12, %temp}, %fd98;
+       mov.b64         {%temp, %r12}, %fd99;
        }
+       and.b32         %r13, %r12, 2147483647;
+       setp.ne.s32     %p32, %r13, 2146435072;
+       @%p32 bra       BB12_38;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r13}, %fd98;
+       mov.b64         {%r14, %temp}, %fd99;
        }
-       and.b32         %r14, %r13, 2147483647;
-       setp.ne.s32     %p32, %r14, 2146435072;
-       setp.ne.s32     %p33, %r12, 0;
-       or.pred         %p34, %p32, %p33;
-       @!%p34 bra      BB11_65;
-       bra.uni         BB11_37;
-
-BB11_37:
-       cvt.rmi.f64.f64 %fd98, %fd98;
-       bra.uni         BB11_65;
-
-BB11_70:
-       setp.eq.s32     %p98, %r6, 1;
-       @%p98 bra       BB11_71;
-       bra.uni         BB11_129;
-
-BB11_71:
-       sub.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
-
-BB11_86:
-       setp.eq.s32     %p86, %r6, 10;
-       @%p86 bra       BB11_87;
-       bra.uni         BB11_129;
-
-BB11_87:
-       setp.neu.f64    %p113, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
-       bra.uni         BB11_129;
-
-BB11_77:
-       setp.eq.s32     %p93, %r6, 5;
-       @%p93 bra       BB11_78;
-       bra.uni         BB11_129;
-
-BB11_78:
-       setp.lt.f64     %p118, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
-       bra.uni         BB11_129;
-
-BB11_94:
-       setp.eq.s32     %p80, %r6, 15;
-       @%p80 bra       BB11_95;
-       bra.uni         BB11_129;
-
-BB11_95:
+       setp.eq.s32     %p33, %r14, 0;
+       @%p33 bra       BB12_69;
+
+BB12_38:
+       cvt.rmi.f64.f64 %fd99, %fd99;
+       bra.uni         BB12_69;
+
+BB12_74:
+       setp.eq.s32     %p94, %r6, 1;
+       @%p94 bra       BB12_75;
+       bra.uni         BB12_137;
+
+BB12_75:
+       sub.f64         %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
+
+BB12_90:
+       setp.eq.s32     %p82, %r6, 10;
+       @%p82 bra       BB12_91;
+       bra.uni         BB12_137;
+
+BB12_91:
+       setp.neu.f64    %p107, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107;
+       bra.uni         BB12_137;
+
+BB12_81:
+       setp.eq.s32     %p89, %r6, 5;
+       @%p89 bra       BB12_82;
+       bra.uni         BB12_137;
+
+BB12_82:
+       setp.lt.f64     %p112, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112;
+       bra.uni         BB12_137;
+
+BB12_98:
+       setp.eq.s32     %p76, %r6, 15;
+       @%p76 bra       BB12_99;
+       bra.uni         BB12_137;
+
+BB12_99:
        mul.f64         %fd87, %fd1, %fd68;
        mov.f64         %fd88, 0d3FF0000000000000;
-       sub.f64         %fd106, %fd88, %fd87;
-       bra.uni         BB11_129;
+       sub.f64         %fd108, %fd88, %fd87;
+       bra.uni         BB12_137;
 
-BB11_73:
-       setp.eq.s32     %p96, %r6, 3;
-       @%p96 bra       BB11_74;
-       bra.uni         BB11_129;
+BB12_77:
+       setp.eq.s32     %p92, %r6, 3;
+       @%p92 bra       BB12_78;
+       bra.uni         BB12_137;
 
-BB11_74:
-       div.rn.f64      %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
+BB12_78:
+       div.rn.f64      %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
 
-BB11_109:
-       min.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB11_129;
+BB12_115:
+       min.f64         %fd108, %fd1, %fd68;
+       bra.uni         BB12_137;
 
-BB11_90:
-       setp.eq.s32     %p84, %r6, 13;
-       @%p84 bra       BB11_91;
-       bra.uni         BB11_129;
+BB12_94:
+       setp.eq.s32     %p80, %r6, 13;
+       @%p80 bra       BB12_95;
+       bra.uni         BB12_137;
 
-BB11_91:
+BB12_95:
        cvt.rni.s64.f64 %rd17, %fd1;
        cvt.rni.s64.f64 %rd18, %fd68;
-       cvt.u32.u64     %r58, %rd17;
-       cvt.u32.u64     %r59, %rd18;
-       and.b32         %r60, %r59, %r58;
-       setp.eq.s32     %p112, %r60, 0;
-       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
-       bra.uni         BB11_129;
-
-BB11_112:
-       setp.le.f64     %p117, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
-       bra.uni         BB11_129;
-
-BB11_81:
-       setp.eq.s32     %p91, %r6, 8;
-       @%p91 bra       BB11_82;
-       bra.uni         BB11_129;
-
-BB11_82:
-       setp.ge.f64     %p115, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
-       bra.uni         BB11_129;
-
-BB11_106:
-       setp.neu.f64    %p110, %fd1, 0d0000000000000000;
+       cvt.u32.u64     %r59, %rd17;
+       cvt.u32.u64     %r60, %rd18;
+       and.b32         %r61, %r60, %r59;
+       setp.eq.s32     %p106, %r61, 0;
+       selp.f64        %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106;
+       bra.uni         BB12_137;
+
+BB12_118:
+       setp.le.f64     %p111, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p111;
+       bra.uni         BB12_137;
+
+BB12_85:
+       setp.eq.s32     %p87, %r6, 8;
+       @%p87 bra       BB12_86;
+       bra.uni         BB12_137;
+
+BB12_86:
+       setp.ge.f64     %p109, %fd1, %fd68;
+       selp.f64        %fd108, 0d3FF0000000000000, 0d0000000000000000, %p109;
+       bra.uni         BB12_137;
+
+BB12_112:
+       setp.neu.f64    %p104, %fd1, 0d0000000000000000;
        sub.f64         %fd86, %fd1, %fd68;
-       selp.f64        %fd106, %fd86, 0d0000000000000000, %p110;
-       bra.uni         BB11_129;
+       selp.f64        %fd108, %fd86, 0d0000000000000000, %p104;
+       bra.uni         BB12_137;
 
-BB11_98:
-       setp.ne.s32     %p78, %r6, 18;
-       @%p78 bra       BB11_129;
+BB12_102:
+       setp.ne.s32     %p74, %r6, 18;
+       @%p74 bra       BB12_137;
 
-       div.rn.f64      %fd106, %fd1, %fd68;
-       abs.f64         %fd81, %fd106;
-       setp.gtu.f64    %p99, %fd81, 0d7FF0000000000000;
-       @%p99 bra       BB11_129;
+       div.rn.f64      %fd108, %fd1, %fd68;
+       abs.f64         %fd81, %fd108;
+       setp.gtu.f64    %p95, %fd81, 0d7FF0000000000000;
+       @%p95 bra       BB12_137;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r49, %temp}, %fd106;
+       mov.b64         {%temp, %r50}, %fd108;
        }
+       and.b32         %r51, %r50, 2147483647;
+       setp.ne.s32     %p96, %r51, 2146435072;
+       @%p96 bra       BB12_106;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r50}, %fd106;
+       mov.b64         {%r52, %temp}, %fd108;
        }
-       and.b32         %r51, %r50, 2147483647;
-       setp.ne.s32     %p100, %r51, 2146435072;
-       setp.ne.s32     %p101, %r49, 0;
-       or.pred         %p102, %p100, %p101;
-       @!%p102 bra     BB11_129;
-       bra.uni         BB11_101;
+       setp.eq.s32     %p97, %r52, 0;
+       @%p97 bra       BB12_137;
 
-BB11_101:
-       cvt.rmi.f64.f64 %fd106, %fd106;
-       bra.uni         BB11_129;
+BB12_106:
+       cvt.rmi.f64.f64 %fd108, %fd108;
+       bra.uni         BB12_137;
 
-BB11_52:
-       setp.gt.s32     %p54, %r2, -1;
-       @%p54 bra       BB11_55;
+BB12_54:
+       setp.gt.s32     %p52, %r2, -1;
+       @%p52 bra       BB12_57;
 
        cvt.rzi.f64.f64 %fd78, %fd1;
-       setp.neu.f64    %p55, %fd78, %fd1;
-       selp.f64        %fd96, 0dFFF8000000000000, %fd96, %p55;
+       setp.neu.f64    %p53, %fd78, %fd1;
+       selp.f64        %fd97, 0dFFF8000000000000, %fd97, %p53;
 
-BB11_55:
-       mov.f64         %fd24, %fd96;
+BB12_57:
+       mov.f64         %fd24, %fd97;
        add.f64         %fd25, %fd1, %fd68;
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r33}, %fd25;
        }
        and.b32         %r34, %r33, 2146435072;
-       setp.ne.s32     %p58, %r34, 2146435072;
-       mov.f64         %fd95, %fd24;
-       @%p58 bra       BB11_62;
+       setp.ne.s32     %p56, %r34, 2146435072;
+       mov.f64         %fd96, %fd24;
+       @%p56 bra       BB12_66;
 
-       setp.gtu.f64    %p59, %fd18, 0d7FF0000000000000;
-       mov.f64         %fd95, %fd25;
-       @%p59 bra       BB11_62;
+       setp.gtu.f64    %p57, %fd18, 0d7FF0000000000000;
+       mov.f64         %fd96, %fd25;
+       @%p57 bra       BB12_66;
 
        abs.f64         %fd79, %fd1;
-       setp.gtu.f64    %p60, %fd79, 0d7FF0000000000000;
-       mov.f64         %fd94, %fd25;
-       mov.f64         %fd95, %fd94;
-       @%p60 bra       BB11_62;
+       setp.gtu.f64    %p58, %fd79, 0d7FF0000000000000;
+       mov.f64         %fd95, %fd25;
+       mov.f64         %fd96, %fd95;
+       @%p58 bra       BB12_66;
+
+       and.b32         %r35, %r3, 2147483647;
+       setp.ne.s32     %p59, %r35, 2146435072;
+       @%p59 bra       BB12_62;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd1;
+       mov.b64         {%r36, %temp}, %fd1;
        }
-       and.b32         %r36, %r3, 2147483647;
-       setp.eq.s32     %p61, %r36, 2146435072;
-       setp.eq.s32     %p62, %r35, 0;
-       and.pred        %p63, %p61, %p62;
-       @%p63 bra       BB11_61;
-       bra.uni         BB11_59;
+       setp.eq.s32     %p60, %r36, 0;
+       @%p60 bra       BB12_65;
 
-BB11_61:
-       setp.gt.f64     %p67, %fd18, 0d3FF0000000000000;
-       selp.b32        %r44, 2146435072, 0, %p67;
-       xor.b32         %r45, %r44, 2146435072;
-       setp.lt.s32     %p68, %r3, 0;
-       selp.b32        %r46, %r45, %r44, %p68;
-       setp.eq.f64     %p69, %fd68, 0dBFF0000000000000;
-       selp.b32        %r47, 1072693248, %r46, %p69;
-       mov.u32         %r48, 0;
-       mov.b64         %fd95, {%r48, %r47};
-       bra.uni         BB11_62;
-
-BB11_116:
-       setp.gt.s32     %p122, %r4, -1;
-       @%p122 bra      BB11_119;
+BB12_62:
+       and.b32         %r37, %r2, 2147483647;
+       setp.ne.s32     %p61, %r37, 2146435072;
+       mov.f64         %fd93, %fd24;
+       mov.f64         %fd96, %fd93;
+       @%p61 bra       BB12_66;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r38, %temp}, %fd68;
+       }
+       setp.ne.s32     %p62, %r38, 0;
+       mov.f64         %fd96, %fd24;
+       @%p62 bra       BB12_66;
+
+       shr.s32         %r39, %r3, 31;
+       and.b32         %r40, %r39, -2146435072;
+       add.s32         %r41, %r40, 2146435072;
+       or.b32          %r42, %r41, -2147483648;
+       selp.b32        %r43, %r42, %r41, %p1;
+       mov.u32         %r44, 0;
+       mov.b64         %fd96, {%r44, %r43};
+       bra.uni         BB12_66;
+
+BB12_122:
+       setp.gt.s32     %p116, %r4, -1;
+       @%p116 bra      BB12_125;
 
        cvt.rzi.f64.f64 %fd89, %fd68;
-       setp.neu.f64    %p123, %fd89, %fd68;
-       selp.f64        %fd104, 0dFFF8000000000000, %fd104, %p123;
+       setp.neu.f64    %p117, %fd89, %fd68;
+       selp.f64        %fd106, 0dFFF8000000000000, %fd106, %p117;
 
-BB11_119:
-       mov.f64         %fd57, %fd104;
+BB12_125:
+       mov.f64         %fd57, %fd106;
        add.f64         %fd58, %fd1, %fd68;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r70}, %fd58;
+       mov.b64         {%temp, %r71}, %fd58;
        }
-       and.b32         %r71, %r70, 2146435072;
-       setp.ne.s32     %p126, %r71, 2146435072;
-       mov.f64         %fd103, %fd57;
-       @%p126 bra      BB11_126;
+       and.b32         %r72, %r71, 2146435072;
+       setp.ne.s32     %p120, %r72, 2146435072;
+       mov.f64         %fd105, %fd57;
+       @%p120 bra      BB12_134;
 
-       setp.gtu.f64    %p127, %fd51, 0d7FF0000000000000;
-       mov.f64         %fd103, %fd58;
-       @%p127 bra      BB11_126;
+       setp.gtu.f64    %p121, %fd51, 0d7FF0000000000000;
+       mov.f64         %fd105, %fd58;
+       @%p121 bra      BB12_134;
 
        abs.f64         %fd90, %fd68;
-       setp.gtu.f64    %p128, %fd90, 0d7FF0000000000000;
-       mov.f64         %fd102, %fd58;
-       mov.f64         %fd103, %fd102;
-       @%p128 bra      BB11_126;
+       setp.gtu.f64    %p122, %fd90, 0d7FF0000000000000;
+       mov.f64         %fd104, %fd58;
+       mov.f64         %fd105, %fd104;
+       @%p122 bra      BB12_134;
 
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r72, %temp}, %fd68;
-       }
        and.b32         %r73, %r5, 2147483647;
-       setp.eq.s32     %p129, %r73, 2146435072;
-       setp.eq.s32     %p130, %r72, 0;
-       and.pred        %p131, %p129, %p130;
-       @%p131 bra      BB11_125;
-       bra.uni         BB11_123;
-
-BB11_125:
-       setp.gt.f64     %p135, %fd51, 0d3FF0000000000000;
-       selp.b32        %r81, 2146435072, 0, %p135;
-       xor.b32         %r82, %r81, 2146435072;
-       setp.lt.s32     %p136, %r5, 0;
-       selp.b32        %r83, %r82, %r81, %p136;
-       setp.eq.f64     %p137, %fd1, 0dBFF0000000000000;
-       selp.b32        %r84, 1072693248, %r83, %p137;
-       mov.u32         %r85, 0;
-       mov.b64         %fd103, {%r85, %r84};
-       bra.uni         BB11_126;
+       setp.ne.s32     %p123, %r73, 2146435072;
+       @%p123 bra      BB12_130;
 
-BB11_59:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r37, %temp}, %fd68;
+       mov.b64         {%r74, %temp}, %fd68;
        }
-       and.b32         %r38, %r2, 2147483647;
-       setp.eq.s32     %p64, %r38, 2146435072;
-       setp.eq.s32     %p65, %r37, 0;
-       and.pred        %p66, %p64, %p65;
-       mov.f64         %fd95, %fd24;
-       @!%p66 bra      BB11_62;
-       bra.uni         BB11_60;
-
-BB11_60:
-       shr.s32         %r39, %r3, 31;
-       and.b32         %r40, %r39, -2146435072;
-       selp.b32        %r41, -1048576, 2146435072, %p1;
-       add.s32         %r42, %r41, %r40;
-       mov.u32         %r43, 0;
-       mov.b64         %fd95, {%r43, %r42};
-
-BB11_62:
-       setp.eq.f64     %p70, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p71, %fd68, 0d3FF0000000000000;
-       or.pred         %p72, %p71, %p70;
-       selp.f64        %fd98, 0d3FF0000000000000, %fd95, %p72;
+       setp.eq.s32     %p124, %r74, 0;
+       @%p124 bra      BB12_133;
 
-BB11_65:
-       st.global.f64   [%rd1], %fd98;
-       bra.uni         BB11_130;
+BB12_130:
+       and.b32         %r75, %r4, 2147483647;
+       setp.ne.s32     %p125, %r75, 2146435072;
+       mov.f64         %fd102, %fd57;
+       mov.f64         %fd105, %fd102;
+       @%p125 bra      BB12_134;
 
-BB11_123:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r74, %temp}, %fd1;
+       mov.b64         {%r76, %temp}, %fd1;
        }
-       and.b32         %r75, %r4, 2147483647;
-       setp.eq.s32     %p132, %r75, 2146435072;
-       setp.eq.s32     %p133, %r74, 0;
-       and.pred        %p134, %p132, %p133;
-       mov.f64         %fd103, %fd57;
-       @!%p134 bra     BB11_126;
-       bra.uni         BB11_124;
-
-BB11_124:
-       shr.s32         %r76, %r5, 31;
-       and.b32         %r77, %r76, -2146435072;
-       selp.b32        %r78, -1048576, 2146435072, %p2;
-       add.s32         %r79, %r78, %r77;
-       mov.u32         %r80, 0;
-       mov.b64         %fd103, {%r80, %r79};
-
-BB11_126:
-       setp.eq.f64     %p138, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p139, %fd1, 0d3FF0000000000000;
-       or.pred         %p140, %p139, %p138;
-       selp.f64        %fd106, 0d3FF0000000000000, %fd103, %p140;
-
-BB11_129:
-       st.global.f64   [%rd1], %fd106;
-
-BB11_130:
+       setp.ne.s32     %p126, %r76, 0;
+       mov.f64         %fd105, %fd57;
+       @%p126 bra      BB12_134;
+
+       shr.s32         %r77, %r5, 31;
+       and.b32         %r78, %r77, -2146435072;
+       add.s32         %r79, %r78, 2146435072;
+       or.b32          %r80, %r79, -2147483648;
+       selp.b32        %r81, %r80, %r79, %p2;
+       mov.u32         %r82, 0;
+       mov.b64         %fd105, {%r82, %r81};
+       bra.uni         BB12_134;
+
+BB12_65:
+       setp.gt.f64     %p63, %fd18, 0d3FF0000000000000;
+       selp.b32        %r45, 2146435072, 0, %p63;
+       xor.b32         %r46, %r45, 2146435072;
+       setp.lt.s32     %p64, %r3, 0;
+       selp.b32        %r47, %r46, %r45, %p64;
+       setp.eq.f64     %p65, %fd68, 0dBFF0000000000000;
+       selp.b32        %r48, 1072693248, %r47, %p65;
+       mov.u32         %r49, 0;
+       mov.b64         %fd96, {%r49, %r48};
+
+BB12_66:
+       setp.eq.f64     %p66, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p67, %fd68, 0d3FF0000000000000;
+       or.pred         %p68, %p67, %p66;
+       selp.f64        %fd99, 0d3FF0000000000000, %fd96, %p68;
+
+BB12_69:
+       st.global.f64   [%rd1], %fd99;
+       bra.uni         BB12_138;
+
+BB12_133:
+       setp.gt.f64     %p127, %fd51, 0d3FF0000000000000;
+       selp.b32        %r83, 2146435072, 0, %p127;
+       xor.b32         %r84, %r83, 2146435072;
+       setp.lt.s32     %p128, %r5, 0;
+       selp.b32        %r85, %r84, %r83, %p128;
+       setp.eq.f64     %p129, %fd1, 0dBFF0000000000000;
+       selp.b32        %r86, 1072693248, %r85, %p129;
+       mov.u32         %r87, 0;
+       mov.b64         %fd105, {%r87, %r86};
+
+BB12_134:
+       setp.eq.f64     %p130, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p131, %fd1, 0d3FF0000000000000;
+       or.pred         %p132, %p131, %p130;
+       selp.f64        %fd108, 0d3FF0000000000000, %fd105, %p132;
+
+BB12_137:
+       st.global.f64   [%rd1], %fd108;
+
+BB12_138:
        bar.sync        0;
        ret;
 }
@@ -1968,14 +2039,14 @@ BB11_130:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB12_2;
+       @%p1 bra        BB13_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB12_2:
+BB13_2:
        ret;
 }
 
@@ -2015,10 +2086,10 @@ BB12_2:
        setp.lt.s32     %p1, %r1, %r7;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB13_2;
-       bra.uni         BB13_1;
+       @!%p3 bra       BB14_2;
+       bra.uni         BB14_1;
 
-BB13_1:
+BB14_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r13, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r13, 8;
@@ -2029,14 +2100,14 @@ BB13_1:
        add.s64         %rd9, %rd1, %rd8;
        st.global.f64   [%rd9], %fd1;
 
-BB13_2:
+BB14_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB13_4;
-       bra.uni         BB13_3;
+       @!%p6 bra       BB14_4;
+       bra.uni         BB14_3;
 
-BB13_3:
+BB14_3:
        cvta.to.global.u64      %rd10, %rd3;
        mad.lo.s32      %r15, %r1, %r6, %r2;
        mul.wide.s32    %rd11, %r15, 8;
@@ -2048,7 +2119,7 @@ BB13_3:
        add.s64         %rd14, %rd1, %rd13;
        st.global.f64   [%rd14], %fd2;
 
-BB13_4:
+BB14_4:
        ret;
 }
 
@@ -2087,10 +2158,10 @@ BB13_4:
        setp.lt.s32     %p1, %r1, %r3;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB14_2;
-       bra.uni         BB14_1;
+       @!%p3 bra       BB15_2;
+       bra.uni         BB15_1;
 
-BB14_1:
+BB15_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r12, 8;
@@ -2099,14 +2170,14 @@ BB14_1:
        add.s64         %rd8, %rd1, %rd6;
        st.global.f64   [%rd8], %fd1;
 
-BB14_2:
+BB15_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB14_4;
-       bra.uni         BB14_3;
+       @!%p6 bra       BB15_4;
+       bra.uni         BB15_3;
 
-BB14_3:
+BB15_3:
        cvta.to.global.u64      %rd9, %rd3;
        mad.lo.s32      %r13, %r1, %r6, %r2;
        mul.wide.s32    %rd10, %r13, 8;
@@ -2118,7 +2189,7 @@ BB14_3:
        add.s64         %rd13, %rd1, %rd12;
        st.global.f64   [%rd13], %fd2;
 
-BB14_4:
+BB15_4:
        ret;
 }
 
@@ -2146,9 +2217,9 @@ BB14_4:
        mov.f64         %fd76, 0d0000000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB15_4;
+       @%p1 bra        BB16_4;
 
-BB15_1:
+BB16_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -2157,23 +2228,23 @@ BB15_1:
        add.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB15_3;
+       @%p2 bra        BB16_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        add.f64         %fd78, %fd78, %fd31;
 
-BB15_3:
+BB16_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB15_1;
+       @%p3 bra        BB16_1;
 
-BB15_4:
+BB16_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2181,130 +2252,130 @@ BB15_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB15_8;
+       @%p4 bra        BB16_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB15_7;
+       @%p5 bra        BB16_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        add.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB15_7:
+BB16_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB15_8:
+BB16_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB15_12;
+       @%p6 bra        BB16_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB15_11;
+       @%p7 bra        BB16_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        add.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB15_11:
+BB16_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB15_12:
+BB16_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB15_16;
+       @%p8 bra        BB16_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB15_15;
+       @%p9 bra        BB16_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        add.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB15_15:
+BB16_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB15_16:
+BB16_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB15_20;
+       @%p10 bra       BB16_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB15_19;
+       @%p11 bra       BB16_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        add.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB15_19:
+BB16_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB15_20:
+BB16_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB15_33;
+       @%p12 bra       BB16_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB15_23;
+       @%p13 bra       BB16_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        add.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB15_23:
+BB16_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB15_25;
+       @%p14 bra       BB16_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        add.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB15_25:
+BB16_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB15_27;
+       @%p15 bra       BB16_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        add.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB15_27:
+BB16_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB15_29;
+       @%p16 bra       BB16_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        add.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB15_29:
+BB16_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB15_31;
+       @%p17 bra       BB16_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        add.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB15_31:
+BB16_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB15_33;
+       @%p18 bra       BB16_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        add.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB15_33:
+BB16_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB15_35;
+       @%p19 bra       BB16_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2312,7 +2383,7 @@ BB15_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB15_35:
+BB16_35:
        ret;
 }
 
@@ -2336,17 +2407,17 @@ BB15_35:
        ld.param.u32    %r4, [reduce_row_sum_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB16_35;
+       @%p1 bra        BB17_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0d0000000000000000;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB16_4;
+       @%p2 bra        BB17_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB16_3:
+BB17_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -2356,9 +2427,9 @@ BB16_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB16_3;
+       @%p3 bra        BB17_3;
 
-BB16_4:
+BB17_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -2368,130 +2439,130 @@ BB16_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB16_8;
+       @%p4 bra        BB17_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB16_7;
+       @%p5 bra        BB17_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        add.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB16_7:
+BB17_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB16_8:
+BB17_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB16_12;
+       @%p6 bra        BB17_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB16_11;
+       @%p7 bra        BB17_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        add.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB16_11:
+BB17_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB16_12:
+BB17_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB16_16;
+       @%p8 bra        BB17_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB16_15;
+       @%p9 bra        BB17_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        add.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB16_15:
+BB17_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB16_16:
+BB17_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB16_20;
+       @%p10 bra       BB17_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB16_19;
+       @%p11 bra       BB17_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        add.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB16_19:
+BB17_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB16_20:
+BB17_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB16_33;
+       @%p12 bra       BB17_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB16_23;
+       @%p13 bra       BB17_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        add.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB16_23:
+BB17_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB16_25;
+       @%p14 bra       BB17_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        add.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB16_25:
+BB17_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB16_27;
+       @%p15 bra       BB17_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        add.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB16_27:
+BB17_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB16_29;
+       @%p16 bra       BB17_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        add.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB16_29:
+BB17_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB16_31;
+       @%p17 bra       BB17_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        add.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB16_31:
+BB17_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB16_33;
+       @%p18 bra       BB17_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        add.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB16_33:
+BB17_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB16_35;
+       @%p19 bra       BB17_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -2499,7 +2570,7 @@ BB16_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB16_35:
+BB17_35:
        ret;
 }
 
@@ -2526,18 +2597,18 @@ BB16_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB17_5;
+       @%p1 bra        BB18_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0d0000000000000000;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB17_4;
+       @%p2 bra        BB18_4;
 
        mov.u32         %r10, %r1;
 
-BB17_3:
+BB18_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -2547,15 +2618,15 @@ BB17_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB17_3;
+       @%p3 bra        BB18_3;
 
-BB17_4:
+BB18_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB17_5:
+BB18_5:
        ret;
 }
 
@@ -2583,9 +2654,9 @@ BB17_5:
        mov.f64         %fd76, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB18_4;
+       @%p1 bra        BB19_4;
 
-BB18_1:
+BB19_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -2594,23 +2665,23 @@ BB18_1:
        max.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB18_3;
+       @%p2 bra        BB19_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        max.f64         %fd78, %fd78, %fd31;
 
-BB18_3:
+BB19_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB18_1;
+       @%p3 bra        BB19_1;
 
-BB18_4:
+BB19_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2618,130 +2689,130 @@ BB18_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB18_8;
+       @%p4 bra        BB19_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB18_7;
+       @%p5 bra        BB19_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        max.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB18_7:
+BB19_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB18_8:
+BB19_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB18_12;
+       @%p6 bra        BB19_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB18_11;
+       @%p7 bra        BB19_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        max.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB18_11:
+BB19_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB18_12:
+BB19_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB18_16;
+       @%p8 bra        BB19_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB18_15;
+       @%p9 bra        BB19_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        max.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB18_15:
+BB19_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB18_16:
+BB19_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB18_20;
+       @%p10 bra       BB19_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB18_19;
+       @%p11 bra       BB19_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        max.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB18_19:
+BB19_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB18_20:
+BB19_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB18_33;
+       @%p12 bra       BB19_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB18_23;
+       @%p13 bra       BB19_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        max.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB18_23:
+BB19_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB18_25;
+       @%p14 bra       BB19_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        max.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB18_25:
+BB19_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB18_27;
+       @%p15 bra       BB19_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        max.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB18_27:
+BB19_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB18_29;
+       @%p16 bra       BB19_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        max.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB18_29:
+BB19_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB18_31;
+       @%p17 bra       BB19_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        max.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB18_31:
+BB19_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB18_33;
+       @%p18 bra       BB19_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        max.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB18_33:
+BB19_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB18_35;
+       @%p19 bra       BB19_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2749,7 +2820,7 @@ BB18_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB18_35:
+BB19_35:
        ret;
 }
 
@@ -2773,17 +2844,17 @@ BB18_35:
        ld.param.u32    %r4, [reduce_row_max_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB19_35;
+       @%p1 bra        BB20_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB19_4;
+       @%p2 bra        BB20_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB19_3:
+BB20_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -2793,9 +2864,9 @@ BB19_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB19_3;
+       @%p3 bra        BB20_3;
 
-BB19_4:
+BB20_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -2805,130 +2876,130 @@ BB19_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB19_8;
+       @%p4 bra        BB20_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB19_7;
+       @%p5 bra        BB20_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        max.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB19_7:
+BB20_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB19_8:
+BB20_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB19_12;
+       @%p6 bra        BB20_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB19_11;
+       @%p7 bra        BB20_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        max.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB19_11:
+BB20_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB19_12:
+BB20_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB19_16;
+       @%p8 bra        BB20_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB19_15;
+       @%p9 bra        BB20_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        max.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB19_15:
+BB20_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB19_16:
+BB20_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB19_20;
+       @%p10 bra       BB20_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB19_19;
+       @%p11 bra       BB20_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        max.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB19_19:
+BB20_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB19_20:
+BB20_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB19_33;
+       @%p12 bra       BB20_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB19_23;
+       @%p13 bra       BB20_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        max.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB19_23:
+BB20_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB19_25;
+       @%p14 bra       BB20_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        max.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB19_25:
+BB20_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB19_27;
+       @%p15 bra       BB20_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        max.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB19_27:
+BB20_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB19_29;
+       @%p16 bra       BB20_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        max.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB19_29:
+BB20_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB19_31;
+       @%p17 bra       BB20_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        max.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB19_31:
+BB20_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB19_33;
+       @%p18 bra       BB20_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        max.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB19_33:
+BB20_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB19_35;
+       @%p19 bra       BB20_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -2936,7 +3007,7 @@ BB19_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB19_35:
+BB20_35:
        ret;
 }
 
@@ -2963,18 +3034,18 @@ BB19_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB20_5;
+       @%p1 bra        BB21_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB20_4;
+       @%p2 bra        BB21_4;
 
        mov.u32         %r10, %r1;
 
-BB20_3:
+BB21_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -2984,15 +3055,15 @@ BB20_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB20_3;
+       @%p3 bra        BB21_3;
 
-BB20_4:
+BB21_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB20_5:
+BB21_5:
        ret;
 }
 
@@ -3020,9 +3091,9 @@ BB20_5:
        mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB21_4;
+       @%p1 bra        BB22_4;
 
-BB21_1:
+BB22_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -3031,23 +3102,23 @@ BB21_1:
        min.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB21_3;
+       @%p2 bra        BB22_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        min.f64         %fd78, %fd78, %fd31;
 
-BB21_3:
+BB22_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB21_1;
+       @%p3 bra        BB22_1;
 
-BB21_4:
+BB22_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -3055,130 +3126,130 @@ BB21_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB21_8;
+       @%p4 bra        BB22_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB21_7;
+       @%p5 bra        BB22_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        min.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB21_7:
+BB22_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB21_8:
+BB22_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB21_12;
+       @%p6 bra        BB22_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB21_11;
+       @%p7 bra        BB22_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        min.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB21_11:
+BB22_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB21_12:
+BB22_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB21_16;
+       @%p8 bra        BB22_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB21_15;
+       @%p9 bra        BB22_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        min.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB21_15:
+BB22_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB21_16:
+BB22_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB21_20;
+       @%p10 bra       BB22_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB21_19;
+       @%p11 bra       BB22_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        min.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB21_19:
+BB22_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB21_20:
+BB22_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB21_33;
+       @%p12 bra       BB22_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB21_23;
+       @%p13 bra       BB22_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        min.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB21_23:
+BB22_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB21_25;
+       @%p14 bra       BB22_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        min.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB21_25:
+BB22_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB21_27;
+       @%p15 bra       BB22_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        min.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB21_27:
+BB22_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB21_29;
+       @%p16 bra       BB22_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        min.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB21_29:
+BB22_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB21_31;
+       @%p17 bra       BB22_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        min.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB21_31:
+BB22_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB21_33;
+       @%p18 bra       BB22_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        min.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB21_33:
+BB22_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB21_35;
+       @%p19 bra       BB22_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ 

<TRUNCATED>

Reply via email to