This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 7dcf6fe  [MINOR] CUDA 10.2 PTX and spoof cuda helper binaries (Win/Lin/64)
7dcf6fe is described below

commit 7dcf6fe424e80901852785a4a0bf7065c15973bb
Author: Mark Dokter <[email protected]>
AuthorDate: Thu Jun 17 01:42:42 2021 +0200

    [MINOR] CUDA 10.2 PTX and spoof cuda helper binaries (Win/Lin/64)
    
    Build command for your reference:
    rm -rf target/build-cuda ; cmake -S src/main/cuda -B target/build-cuda -G "Ninja Multi-Config" ; cmake --build target/build-cuda/ --target install --config Release
---
 .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so |  Bin 303688 -> 302880 bytes
 .../lib/libsystemds_spoof_cuda-Windows-AMD64.dll   |  Bin 244736 -> 244736 bytes
 src/main/cuda/kernels/SystemDS.ptx                 |  449 +--
 src/main/cuda/kernels/reduction.ptx                | 3737 ++++++--------------
 4 files changed, 1220 insertions(+), 2966 deletions(-)

diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index 5bb044f..ec5be11 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll
index bdf0a4f..b005c1b 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll differ
diff --git a/src/main/cuda/kernels/SystemDS.ptx b/src/main/cuda/kernels/SystemDS.ptx
index b5ca8de..ee355bf 100644
--- a/src/main/cuda/kernels/SystemDS.ptx
+++ b/src/main/cuda/kernels/SystemDS.ptx
@@ -9190,7 +9190,7 @@ BB75_35:
        .reg .pred      %p<20>;
        .reg .b32       %r<72>;
        .reg .f64       %fd<58>;
-       .reg .b64       %rd<9>;
+       .reg .b64       %rd<10>;
 
 
        ld.param.u64    %rd1, [reduce_row_mean_d_param_0];
@@ -9338,12 +9338,13 @@ BB76_33:
        @%p19 bra       BB76_35;
 
        ld.shared.f64   %fd40, [memory];
-       cvt.rn.f64.s32  %fd41, %r4;
+       cvt.u64.u32     %rd6, %r4;
+       cvt.rn.f64.s64  %fd41, %rd6;
        div.rn.f64      %fd42, %fd40, %fd41;
-       cvta.to.global.u64      %rd6, %rd2;
-       mul.wide.u32    %rd7, %r6, 8;
-       add.s64         %rd8, %rd6, %rd7;
-       st.global.f64   [%rd8], %fd42;
+       cvta.to.global.u64      %rd7, %rd2;
+       mul.wide.u32    %rd8, %r6, 8;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd42;
 
 BB76_35:
        ret;
@@ -9360,7 +9361,7 @@ BB76_35:
        .reg .pred      %p<20>;
        .reg .f32       %f<58>;
        .reg .b32       %r<72>;
-       .reg .b64       %rd<9>;
+       .reg .b64       %rd<10>;
 
 
        ld.param.u64    %rd1, [reduce_row_mean_f_param_0];
@@ -9508,12 +9509,13 @@ BB77_33:
        @%p19 bra       BB77_35;
 
        ld.shared.f32   %f40, [memory];
-       cvt.rn.f32.s32  %f41, %r4;
+       cvt.u64.u32     %rd6, %r4;
+       cvt.rn.f32.s64  %f41, %rd6;
        div.rn.f32      %f42, %f40, %f41;
-       cvta.to.global.u64      %rd6, %rd2;
-       mul.wide.u32    %rd7, %r6, 4;
-       add.s64         %rd8, %rd6, %rd7;
-       st.global.f32   [%rd8], %f42;
+       cvta.to.global.u64      %rd7, %rd2;
+       mul.wide.u32    %rd8, %r6, 4;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f32   [%rd9], %f42;
 
 BB77_35:
        ret;
@@ -9530,7 +9532,7 @@ BB77_35:
        .reg .pred      %p<4>;
        .reg .b32       %r<11>;
        .reg .f64       %fd<11>;
-       .reg .b64       %rd<9>;
+       .reg .b64       %rd<10>;
 
 
        ld.param.u64    %rd2, [reduce_col_mean_d_param_0];
@@ -9562,12 +9564,13 @@ BB78_3:
        @%p3 bra        BB78_3;
 
 BB78_4:
-       cvt.rn.f64.s32  %fd7, %r5;
+       cvt.u64.u32     %rd6, %r5;
+       cvt.rn.f64.s64  %fd7, %rd6;
        div.rn.f64      %fd8, %fd10, %fd7;
-       cvta.to.global.u64      %rd6, %rd3;
-       mul.wide.u32    %rd7, %r1, 8;
-       add.s64         %rd8, %rd6, %rd7;
-       st.global.f64   [%rd8], %fd8;
+       cvta.to.global.u64      %rd7, %rd3;
+       mul.wide.u32    %rd8, %r1, 8;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd8;
 
 BB78_5:
        ret;
@@ -9584,7 +9587,7 @@ BB78_5:
        .reg .pred      %p<4>;
        .reg .f32       %f<11>;
        .reg .b32       %r<11>;
-       .reg .b64       %rd<9>;
+       .reg .b64       %rd<10>;
 
 
        ld.param.u64    %rd2, [reduce_col_mean_f_param_0];
@@ -9616,12 +9619,13 @@ BB79_3:
        @%p3 bra        BB79_3;
 
 BB79_4:
-       cvt.rn.f32.s32  %f7, %r5;
+       cvt.u64.u32     %rd6, %r5;
+       cvt.rn.f32.s64  %f7, %rd6;
        div.rn.f32      %f8, %f10, %f7;
-       cvta.to.global.u64      %rd6, %rd3;
-       mul.wide.u32    %rd7, %r1, 4;
-       add.s64         %rd8, %rd6, %rd7;
-       st.global.f32   [%rd8], %f8;
+       cvta.to.global.u64      %rd7, %rd3;
+       mul.wide.u32    %rd8, %r1, 4;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f32   [%rd9], %f8;
 
 BB79_5:
        ret;
@@ -10594,7 +10598,7 @@ BB94_11:
        .reg .b64       %SPL;
        .reg .pred      %p<13>;
        .reg .f32       %f<38>;
-       .reg .b32       %r<70>;
+       .reg .b32       %r<69>;
        .reg .f64       %fd<3>;
        .reg .b64       %rd<24>;
 
@@ -10602,12 +10606,12 @@ BB94_11:
        mov.u64         %SPL, __local_depot95;
        ld.param.u64    %rd7, [matrix_sin_f_param_0];
        ld.param.u64    %rd8, [matrix_sin_f_param_1];
-       ld.param.u32    %r30, [matrix_sin_f_param_2];
-       mov.u32         %r31, %ntid.x;
-       mov.u32         %r32, %ctaid.x;
-       mov.u32         %r33, %tid.x;
-       mad.lo.s32      %r1, %r31, %r32, %r33;
-       setp.ge.u32     %p1, %r1, %r30;
+       ld.param.u32    %r29, [matrix_sin_f_param_2];
+       mov.u32         %r30, %ntid.x;
+       mov.u32         %r31, %ctaid.x;
+       mov.u32         %r32, %tid.x;
+       mad.lo.s32      %r1, %r30, %r31, %r32;
+       setp.ge.u32     %p1, %r1, %r29;
        @%p1 bra        BB95_17;
 
        cvta.to.global.u64      %rd9, %rd7;
@@ -10616,8 +10620,8 @@ BB94_11:
        add.u64         %rd1, %SPL, 0;
        ld.global.f32   %f1, [%rd11];
        mul.f32         %f15, %f1, 0f3F22F983;
-       cvt.rni.s32.f32 %r69, %f15;
-       cvt.rn.f32.s32  %f16, %r69;
+       cvt.rni.s32.f32 %r68, %f15;
+       cvt.rn.f32.s32  %f16, %r68;
        mov.f32         %f17, 0fBFC90FDA;
        fma.rn.f32      %f18, %f16, %f17, %f1;
        mov.f32         %f19, 0fB3A22168;
@@ -10639,96 +10643,95 @@ BB95_11:
 
 BB95_3:
        mov.b32          %r3, %f1;
-       shr.u32         %r4, %r3, 23;
-       shl.b32         %r36, %r3, 8;
-       or.b32          %r5, %r36, -2147483648;
-       mov.u32         %r63, 0;
+       shl.b32         %r35, %r3, 8;
+       or.b32          %r4, %r35, -2147483648;
+       mov.u32         %r62, 0;
        mov.u64         %rd22, __cudart_i2opi_f;
-       mov.u32         %r62, -6;
+       mov.u32         %r61, -6;
        mov.u64         %rd23, %rd1;
 
 BB95_4:
        .pragma "nounroll";
-       ld.const.u32    %r39, [%rd22];
+       ld.const.u32    %r38, [%rd22];
        // inline asm
        {
-       mad.lo.cc.u32   %r37, %r39, %r5, %r63;
-       madc.hi.u32     %r63, %r39, %r5,  0;
+       mad.lo.cc.u32   %r36, %r38, %r4, %r62;
+       madc.hi.u32     %r62, %r38, %r4,  0;
        }
        // inline asm
-       st.local.u32    [%rd23], %r37;
+       st.local.u32    [%rd23], %r36;
        add.s64         %rd23, %rd23, 4;
        add.s64         %rd22, %rd22, 4;
-       add.s32         %r62, %r62, 1;
-       setp.ne.s32     %p4, %r62, 0;
+       add.s32         %r61, %r61, 1;
+       setp.ne.s32     %p4, %r61, 0;
        @%p4 bra        BB95_4;
 
-       and.b32         %r42, %r4, 255;
-       add.s32         %r43, %r42, -128;
-       shr.u32         %r44, %r43, 5;
-       and.b32         %r10, %r3, -2147483648;
-       st.local.u32    [%rd1+24], %r63;
-       mov.u32         %r45, 6;
-       sub.s32         %r46, %r45, %r44;
-       mul.wide.s32    %rd14, %r46, 4;
+       bfe.u32         %r41, %r3, 23, 8;
+       add.s32         %r42, %r41, -128;
+       shr.u32         %r43, %r42, 5;
+       and.b32         %r9, %r3, -2147483648;
+       st.local.u32    [%rd1+24], %r62;
+       bfe.u32         %r10, %r3, 23, 5;
+       mov.u32         %r44, 6;
+       sub.s32         %r45, %r44, %r43;
+       mul.wide.s32    %rd14, %r45, 4;
        add.s64         %rd6, %rd1, %rd14;
-       ld.local.u32    %r65, [%rd6];
-       ld.local.u32    %r64, [%rd6+-4];
-       and.b32         %r13, %r4, 31;
-       setp.eq.s32     %p5, %r13, 0;
+       ld.local.u32    %r64, [%rd6];
+       ld.local.u32    %r63, [%rd6+-4];
+       setp.eq.s32     %p5, %r10, 0;
        @%p5 bra        BB95_7;
 
-       mov.u32         %r47, 32;
-       sub.s32         %r48, %r47, %r13;
-       shr.u32         %r49, %r64, %r48;
-       shl.b32         %r50, %r65, %r13;
-       add.s32         %r65, %r49, %r50;
-       ld.local.u32    %r51, [%rd6+-8];
-       shr.u32         %r52, %r51, %r48;
-       shl.b32         %r53, %r64, %r13;
-       add.s32         %r64, %r52, %r53;
+       mov.u32         %r46, 32;
+       sub.s32         %r47, %r46, %r10;
+       shr.u32         %r48, %r63, %r47;
+       shl.b32         %r49, %r64, %r10;
+       add.s32         %r64, %r48, %r49;
+       ld.local.u32    %r50, [%rd6+-8];
+       shr.u32         %r51, %r50, %r47;
+       shl.b32         %r52, %r63, %r10;
+       add.s32         %r63, %r51, %r52;
 
 BB95_7:
-       shr.u32         %r54, %r64, 30;
-       shl.b32         %r55, %r65, 2;
-       add.s32         %r67, %r55, %r54;
-       shl.b32         %r19, %r64, 2;
-       shr.u32         %r56, %r67, 31;
-       shr.u32         %r57, %r65, 30;
-       add.s32         %r20, %r56, %r57;
-       setp.eq.s32     %p6, %r56, 0;
+       shr.u32         %r53, %r63, 30;
+       shl.b32         %r54, %r64, 2;
+       add.s32         %r66, %r54, %r53;
+       shl.b32         %r18, %r63, 2;
+       shr.u32         %r55, %r66, 31;
+       shr.u32         %r56, %r64, 30;
+       add.s32         %r19, %r55, %r56;
+       setp.eq.s32     %p6, %r55, 0;
        @%p6 bra        BB95_8;
 
-       not.b32         %r58, %r67;
-       neg.s32         %r66, %r19;
-       setp.eq.s32     %p7, %r19, 0;
-       selp.u32        %r59, 1, 0, %p7;
-       add.s32         %r67, %r59, %r58;
-       xor.b32         %r68, %r10, -2147483648;
+       not.b32         %r57, %r66;
+       neg.s32         %r65, %r18;
+       setp.eq.s32     %p7, %r18, 0;
+       selp.u32        %r58, 1, 0, %p7;
+       add.s32         %r66, %r58, %r57;
+       xor.b32         %r67, %r9, -2147483648;
        bra.uni         BB95_10;
 
 BB95_8:
-       mov.u32         %r66, %r19;
-       mov.u32         %r68, %r10;
+       mov.u32         %r65, %r18;
+       mov.u32         %r67, %r9;
 
 BB95_10:
-       cvt.u64.u32     %rd15, %r67;
+       cvt.u64.u32     %rd15, %r66;
        shl.b64         %rd16, %rd15, 32;
-       cvt.u64.u32     %rd17, %r66;
+       cvt.u64.u32     %rd17, %r65;
        or.b64          %rd18, %rd16, %rd17;
        cvt.rn.f64.s64  %fd1, %rd18;
        mul.f64         %fd2, %fd1, 0d3BF921FB54442D19;
        cvt.rn.f32.f64  %f22, %fd2;
        neg.f32         %f23, %f22;
-       setp.eq.s32     %p8, %r68, 0;
+       setp.eq.s32     %p8, %r67, 0;
        selp.f32        %f35, %f22, %f23, %p8;
-       setp.eq.s32     %p9, %r10, 0;
-       neg.s32         %r60, %r20;
-       selp.b32        %r69, %r20, %r60, %p9;
+       setp.eq.s32     %p9, %r9, 0;
+       neg.s32         %r59, %r19;
+       selp.b32        %r68, %r19, %r59, %p9;
 
 BB95_12:
-       and.b32         %r29, %r69, 1;
-       setp.eq.s32     %p10, %r29, 0;
+       and.b32         %r28, %r68, 1;
+       setp.eq.s32     %p10, %r28, 0;
        selp.f32        %f7, %f35, 0f3F800000, %p10;
        mul.rn.f32      %f8, %f35, %f35;
        mov.f32         %f26, 0f00000000;
@@ -10746,8 +10749,8 @@ BB95_14:
        selp.f32        %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
        fma.rn.f32      %f32, %f30, %f8, %f31;
        fma.rn.f32      %f37, %f32, %f9, %f7;
-       and.b32         %r61, %r69, 2;
-       setp.eq.s32     %p12, %r61, 0;
+       and.b32         %r60, %r68, 2;
+       setp.eq.s32     %p12, %r60, 0;
        @%p12 bra       BB95_16;
 
        mov.f32         %f34, 0fBF800000;
@@ -11142,7 +11145,7 @@ BB98_11:
        .reg .b64       %SPL;
        .reg .pred      %p<13>;
        .reg .f32       %f<38>;
-       .reg .b32       %r<71>;
+       .reg .b32       %r<70>;
        .reg .f64       %fd<3>;
        .reg .b64       %rd<24>;
 
@@ -11150,12 +11153,12 @@ BB98_11:
        mov.u64         %SPL, __local_depot99;
        ld.param.u64    %rd7, [matrix_cos_f_param_0];
        ld.param.u64    %rd8, [matrix_cos_f_param_1];
-       ld.param.u32    %r31, [matrix_cos_f_param_2];
-       mov.u32         %r32, %ntid.x;
-       mov.u32         %r33, %ctaid.x;
-       mov.u32         %r34, %tid.x;
-       mad.lo.s32      %r1, %r32, %r33, %r34;
-       setp.ge.u32     %p1, %r1, %r31;
+       ld.param.u32    %r30, [matrix_cos_f_param_2];
+       mov.u32         %r31, %ntid.x;
+       mov.u32         %r32, %ctaid.x;
+       mov.u32         %r33, %tid.x;
+       mad.lo.s32      %r1, %r31, %r32, %r33;
+       setp.ge.u32     %p1, %r1, %r30;
        @%p1 bra        BB99_17;
 
        cvta.to.global.u64      %rd9, %rd7;
@@ -11164,8 +11167,8 @@ BB98_11:
        add.u64         %rd1, %SPL, 0;
        ld.global.f32   %f1, [%rd11];
        mul.f32         %f15, %f1, 0f3F22F983;
-       cvt.rni.s32.f32 %r70, %f15;
-       cvt.rn.f32.s32  %f16, %r70;
+       cvt.rni.s32.f32 %r69, %f15;
+       cvt.rn.f32.s32  %f16, %r69;
        mov.f32         %f17, 0fBFC90FDA;
        fma.rn.f32      %f18, %f16, %f17, %f1;
        mov.f32         %f19, 0fB3A22168;
@@ -11187,97 +11190,96 @@ BB99_11:
 
 BB99_3:
        mov.b32          %r3, %f1;
-       shr.u32         %r4, %r3, 23;
-       shl.b32         %r37, %r3, 8;
-       or.b32          %r5, %r37, -2147483648;
-       mov.u32         %r64, 0;
+       shl.b32         %r36, %r3, 8;
+       or.b32          %r4, %r36, -2147483648;
+       mov.u32         %r63, 0;
        mov.u64         %rd22, __cudart_i2opi_f;
-       mov.u32         %r63, -6;
+       mov.u32         %r62, -6;
        mov.u64         %rd23, %rd1;
 
 BB99_4:
        .pragma "nounroll";
-       ld.const.u32    %r40, [%rd22];
+       ld.const.u32    %r39, [%rd22];
        // inline asm
        {
-       mad.lo.cc.u32   %r38, %r40, %r5, %r64;
-       madc.hi.u32     %r64, %r40, %r5,  0;
+       mad.lo.cc.u32   %r37, %r39, %r4, %r63;
+       madc.hi.u32     %r63, %r39, %r4,  0;
        }
        // inline asm
-       st.local.u32    [%rd23], %r38;
+       st.local.u32    [%rd23], %r37;
        add.s64         %rd23, %rd23, 4;
        add.s64         %rd22, %rd22, 4;
-       add.s32         %r63, %r63, 1;
-       setp.ne.s32     %p4, %r63, 0;
+       add.s32         %r62, %r62, 1;
+       setp.ne.s32     %p4, %r62, 0;
        @%p4 bra        BB99_4;
 
-       and.b32         %r43, %r4, 255;
-       add.s32         %r44, %r43, -128;
-       shr.u32         %r45, %r44, 5;
-       and.b32         %r10, %r3, -2147483648;
-       st.local.u32    [%rd1+24], %r64;
-       mov.u32         %r46, 6;
-       sub.s32         %r47, %r46, %r45;
-       mul.wide.s32    %rd14, %r47, 4;
+       bfe.u32         %r42, %r3, 23, 8;
+       add.s32         %r43, %r42, -128;
+       shr.u32         %r44, %r43, 5;
+       and.b32         %r9, %r3, -2147483648;
+       st.local.u32    [%rd1+24], %r63;
+       bfe.u32         %r10, %r3, 23, 5;
+       mov.u32         %r45, 6;
+       sub.s32         %r46, %r45, %r44;
+       mul.wide.s32    %rd14, %r46, 4;
        add.s64         %rd6, %rd1, %rd14;
-       ld.local.u32    %r66, [%rd6];
-       ld.local.u32    %r65, [%rd6+-4];
-       and.b32         %r13, %r4, 31;
-       setp.eq.s32     %p5, %r13, 0;
+       ld.local.u32    %r65, [%rd6];
+       ld.local.u32    %r64, [%rd6+-4];
+       setp.eq.s32     %p5, %r10, 0;
        @%p5 bra        BB99_7;
 
-       mov.u32         %r48, 32;
-       sub.s32         %r49, %r48, %r13;
-       shr.u32         %r50, %r65, %r49;
-       shl.b32         %r51, %r66, %r13;
-       add.s32         %r66, %r50, %r51;
-       ld.local.u32    %r52, [%rd6+-8];
-       shr.u32         %r53, %r52, %r49;
-       shl.b32         %r54, %r65, %r13;
-       add.s32         %r65, %r53, %r54;
+       mov.u32         %r47, 32;
+       sub.s32         %r48, %r47, %r10;
+       shr.u32         %r49, %r64, %r48;
+       shl.b32         %r50, %r65, %r10;
+       add.s32         %r65, %r49, %r50;
+       ld.local.u32    %r51, [%rd6+-8];
+       shr.u32         %r52, %r51, %r48;
+       shl.b32         %r53, %r64, %r10;
+       add.s32         %r64, %r52, %r53;
 
 BB99_7:
-       shr.u32         %r55, %r65, 30;
-       shl.b32         %r56, %r66, 2;
-       add.s32         %r68, %r56, %r55;
-       shl.b32         %r19, %r65, 2;
-       shr.u32         %r57, %r68, 31;
-       shr.u32         %r58, %r66, 30;
-       add.s32         %r20, %r57, %r58;
-       setp.eq.s32     %p6, %r57, 0;
+       shr.u32         %r54, %r64, 30;
+       shl.b32         %r55, %r65, 2;
+       add.s32         %r67, %r55, %r54;
+       shl.b32         %r18, %r64, 2;
+       shr.u32         %r56, %r67, 31;
+       shr.u32         %r57, %r65, 30;
+       add.s32         %r19, %r56, %r57;
+       setp.eq.s32     %p6, %r56, 0;
        @%p6 bra        BB99_8;
 
-       not.b32         %r59, %r68;
-       neg.s32         %r67, %r19;
-       setp.eq.s32     %p7, %r19, 0;
-       selp.u32        %r60, 1, 0, %p7;
-       add.s32         %r68, %r60, %r59;
-       xor.b32         %r69, %r10, -2147483648;
+       not.b32         %r58, %r67;
+       neg.s32         %r66, %r18;
+       setp.eq.s32     %p7, %r18, 0;
+       selp.u32        %r59, 1, 0, %p7;
+       add.s32         %r67, %r59, %r58;
+       xor.b32         %r68, %r9, -2147483648;
        bra.uni         BB99_10;
 
 BB99_8:
-       mov.u32         %r67, %r19;
-       mov.u32         %r69, %r10;
+       mov.u32         %r66, %r18;
+       mov.u32         %r68, %r9;
 
 BB99_10:
-       cvt.u64.u32     %rd15, %r68;
+       cvt.u64.u32     %rd15, %r67;
        shl.b64         %rd16, %rd15, 32;
-       cvt.u64.u32     %rd17, %r67;
+       cvt.u64.u32     %rd17, %r66;
        or.b64          %rd18, %rd16, %rd17;
        cvt.rn.f64.s64  %fd1, %rd18;
        mul.f64         %fd2, %fd1, 0d3BF921FB54442D19;
        cvt.rn.f32.f64  %f22, %fd2;
        neg.f32         %f23, %f22;
-       setp.eq.s32     %p8, %r69, 0;
+       setp.eq.s32     %p8, %r68, 0;
        selp.f32        %f35, %f22, %f23, %p8;
-       setp.eq.s32     %p9, %r10, 0;
-       neg.s32         %r61, %r20;
-       selp.b32        %r70, %r20, %r61, %p9;
+       setp.eq.s32     %p9, %r9, 0;
+       neg.s32         %r60, %r19;
+       selp.b32        %r69, %r19, %r60, %p9;
 
 BB99_12:
-       add.s32         %r29, %r70, 1;
-       and.b32         %r30, %r29, 1;
-       setp.eq.s32     %p10, %r30, 0;
+       add.s32         %r28, %r69, 1;
+       and.b32         %r29, %r28, 1;
+       setp.eq.s32     %p10, %r29, 0;
        selp.f32        %f7, %f35, 0f3F800000, %p10;
        mul.rn.f32      %f8, %f35, %f35;
        mov.f32         %f26, 0f00000000;
@@ -11295,8 +11297,8 @@ BB99_14:
        selp.f32        %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
        fma.rn.f32      %f32, %f30, %f8, %f31;
        fma.rn.f32      %f37, %f32, %f9, %f7;
-       and.b32         %r62, %r29, 2;
-       setp.eq.s32     %p12, %r62, 0;
+       and.b32         %r61, %r28, 2;
+       setp.eq.s32     %p12, %r61, 0;
        @%p12 bra       BB99_16;
 
        mov.f32         %f34, 0fBF800000;
@@ -11647,7 +11649,7 @@ BB102_9:
        .reg .b64       %SPL;
        .reg .pred      %p<12>;
        .reg .f32       %f<39>;
-       .reg .b32       %r<69>;
+       .reg .b32       %r<68>;
        .reg .f64       %fd<3>;
        .reg .b64       %rd<24>;
 
@@ -11655,12 +11657,12 @@ BB102_9:
        mov.u64         %SPL, __local_depot103;
        ld.param.u64    %rd7, [matrix_tan_f_param_0];
        ld.param.u64    %rd8, [matrix_tan_f_param_1];
-       ld.param.u32    %r29, [matrix_tan_f_param_2];
-       mov.u32         %r30, %ntid.x;
-       mov.u32         %r31, %ctaid.x;
-       mov.u32         %r32, %tid.x;
-       mad.lo.s32      %r1, %r30, %r31, %r32;
-       setp.ge.u32     %p1, %r1, %r29;
+       ld.param.u32    %r28, [matrix_tan_f_param_2];
+       mov.u32         %r29, %ntid.x;
+       mov.u32         %r30, %ctaid.x;
+       mov.u32         %r31, %tid.x;
+       mad.lo.s32      %r1, %r29, %r30, %r31;
+       setp.ge.u32     %p1, %r1, %r28;
        @%p1 bra        BB103_15;
 
        cvta.to.global.u64      %rd9, %rd7;
@@ -11669,8 +11671,8 @@ BB102_9:
        add.u64         %rd1, %SPL, 0;
        ld.global.f32   %f1, [%rd11];
        mul.f32         %f10, %f1, 0f3F22F983;
-       cvt.rni.s32.f32 %r68, %f10;
-       cvt.rn.f32.s32  %f11, %r68;
+       cvt.rni.s32.f32 %r67, %f10;
+       cvt.rn.f32.s32  %f11, %r67;
        mov.f32         %f12, 0fBFC90FDA;
        fma.rn.f32      %f13, %f11, %f12, %f1;
        mov.f32         %f14, 0fB3A22168;
@@ -11692,92 +11694,91 @@ BB103_11:
 
 BB103_3:
        mov.b32          %r3, %f1;
-       shr.u32         %r4, %r3, 23;
-       shl.b32         %r35, %r3, 8;
-       or.b32          %r5, %r35, -2147483648;
-       mov.u32         %r62, 0;
+       shl.b32         %r34, %r3, 8;
+       or.b32          %r4, %r34, -2147483648;
+       mov.u32         %r61, 0;
        mov.u64         %rd22, __cudart_i2opi_f;
-       mov.u32         %r61, -6;
+       mov.u32         %r60, -6;
        mov.u64         %rd23, %rd1;
 
 BB103_4:
        .pragma "nounroll";
-       ld.const.u32    %r38, [%rd22];
+       ld.const.u32    %r37, [%rd22];
        // inline asm
        {
-       mad.lo.cc.u32   %r36, %r38, %r5, %r62;
-       madc.hi.u32     %r62, %r38, %r5,  0;
+       mad.lo.cc.u32   %r35, %r37, %r4, %r61;
+       madc.hi.u32     %r61, %r37, %r4,  0;
        }
        // inline asm
-       st.local.u32    [%rd23], %r36;
+       st.local.u32    [%rd23], %r35;
        add.s64         %rd23, %rd23, 4;
        add.s64         %rd22, %rd22, 4;
-       add.s32         %r61, %r61, 1;
-       setp.ne.s32     %p4, %r61, 0;
+       add.s32         %r60, %r60, 1;
+       setp.ne.s32     %p4, %r60, 0;
        @%p4 bra        BB103_4;
 
-       and.b32         %r41, %r4, 255;
-       add.s32         %r42, %r41, -128;
-       shr.u32         %r43, %r42, 5;
-       and.b32         %r10, %r3, -2147483648;
-       st.local.u32    [%rd1+24], %r62;
-       mov.u32         %r44, 6;
-       sub.s32         %r45, %r44, %r43;
-       mul.wide.s32    %rd14, %r45, 4;
+       bfe.u32         %r40, %r3, 23, 8;
+       add.s32         %r41, %r40, -128;
+       shr.u32         %r42, %r41, 5;
+       and.b32         %r9, %r3, -2147483648;
+       st.local.u32    [%rd1+24], %r61;
+       bfe.u32         %r10, %r3, 23, 5;
+       mov.u32         %r43, 6;
+       sub.s32         %r44, %r43, %r42;
+       mul.wide.s32    %rd14, %r44, 4;
        add.s64         %rd6, %rd1, %rd14;
-       ld.local.u32    %r64, [%rd6];
-       ld.local.u32    %r63, [%rd6+-4];
-       and.b32         %r13, %r4, 31;
-       setp.eq.s32     %p5, %r13, 0;
+       ld.local.u32    %r63, [%rd6];
+       ld.local.u32    %r62, [%rd6+-4];
+       setp.eq.s32     %p5, %r10, 0;
        @%p5 bra        BB103_7;
 
-       mov.u32         %r46, 32;
-       sub.s32         %r47, %r46, %r13;
-       shr.u32         %r48, %r63, %r47;
-       shl.b32         %r49, %r64, %r13;
-       add.s32         %r64, %r48, %r49;
-       ld.local.u32    %r50, [%rd6+-8];
-       shr.u32         %r51, %r50, %r47;
-       shl.b32         %r52, %r63, %r13;
-       add.s32         %r63, %r51, %r52;
+       mov.u32         %r45, 32;
+       sub.s32         %r46, %r45, %r10;
+       shr.u32         %r47, %r62, %r46;
+       shl.b32         %r48, %r63, %r10;
+       add.s32         %r63, %r47, %r48;
+       ld.local.u32    %r49, [%rd6+-8];
+       shr.u32         %r50, %r49, %r46;
+       shl.b32         %r51, %r62, %r10;
+       add.s32         %r62, %r50, %r51;
 
 BB103_7:
-       shr.u32         %r53, %r63, 30;
-       shl.b32         %r54, %r64, 2;
-       add.s32         %r66, %r54, %r53;
-       shl.b32         %r19, %r63, 2;
-       shr.u32         %r55, %r66, 31;
-       shr.u32         %r56, %r64, 30;
-       add.s32         %r20, %r55, %r56;
-       setp.eq.s32     %p6, %r55, 0;
+       shr.u32         %r52, %r62, 30;
+       shl.b32         %r53, %r63, 2;
+       add.s32         %r65, %r53, %r52;
+       shl.b32         %r18, %r62, 2;
+       shr.u32         %r54, %r65, 31;
+       shr.u32         %r55, %r63, 30;
+       add.s32         %r19, %r54, %r55;
+       setp.eq.s32     %p6, %r54, 0;
        @%p6 bra        BB103_8;
 
-       not.b32         %r57, %r66;
-       neg.s32         %r65, %r19;
-       setp.eq.s32     %p7, %r19, 0;
-       selp.u32        %r58, 1, 0, %p7;
-       add.s32         %r66, %r58, %r57;
-       xor.b32         %r67, %r10, -2147483648;
+       not.b32         %r56, %r65;
+       neg.s32         %r64, %r18;
+       setp.eq.s32     %p7, %r18, 0;
+       selp.u32        %r57, 1, 0, %p7;
+       add.s32         %r65, %r57, %r56;
+       xor.b32         %r66, %r9, -2147483648;
        bra.uni         BB103_10;
 
 BB103_8:
-       mov.u32         %r65, %r19;
-       mov.u32         %r67, %r10;
+       mov.u32         %r64, %r18;
+       mov.u32         %r66, %r9;
 
 BB103_10:
-       cvt.u64.u32     %rd15, %r66;
+       cvt.u64.u32     %rd15, %r65;
        shl.b64         %rd16, %rd15, 32;
-       cvt.u64.u32     %rd17, %r65;
+       cvt.u64.u32     %rd17, %r64;
        or.b64          %rd18, %rd16, %rd17;
        cvt.rn.f64.s64  %fd1, %rd18;
        mul.f64         %fd2, %fd1, 0d3BF921FB54442D19;
        cvt.rn.f32.f64  %f17, %fd2;
        neg.f32         %f18, %f17;
-       setp.eq.s32     %p8, %r67, 0;
+       setp.eq.s32     %p8, %r66, 0;
        selp.f32        %f37, %f17, %f18, %p8;
-       setp.eq.s32     %p9, %r10, 0;
-       neg.s32         %r59, %r20;
-       selp.b32        %r68, %r20, %r59, %p9;
+       setp.eq.s32     %p9, %r9, 0;
+       neg.s32         %r58, %r19;
+       selp.b32        %r67, %r19, %r58, %p9;
 
 BB103_12:
        mul.f32         %f20, %f37, %f37;
@@ -11797,8 +11798,8 @@ BB103_12:
        abs.f32         %f34, %f37;
        setp.eq.f32     %p10, %f34, 0f3A00B43C;
        selp.f32        %f38, %f37, %f33, %p10;
-       and.b32         %r60, %r68, 1;
-       setp.eq.b32     %p11, %r60, 1;
+       and.b32         %r59, %r67, 1;
+       setp.eq.b32     %p11, %r59, 1;
        @!%p11 bra      BB103_14;
        bra.uni         BB103_13;
 
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
index 31038f5..72b9225 100644
--- a/src/main/cuda/kernels/reduction.ptx
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -10,649 +10,9 @@
 .target sm_30
 .address_size 64
 
-.extern .func  (.param .b32 func_retval0) vprintf
-(
-       .param .b64 vprintf_param_0,
-       .param .b64 vprintf_param_1
-)
-;
+       // .globl       double2float_f
 .extern .shared .align 1 .b8 memory[];
-.global .align 1 .b8 $str[78] = {69, 82, 82, 79, 82, 58, 32, 110, 111, 32, 99, 
111, 108, 117, 109, 110, 32, 105, 110, 100, 105, 99, 101, 115, 32, 97, 114, 
114, 97, 121, 32, 105, 110, 32, 97, 32, 100, 101, 110, 115, 101, 32, 109, 97, 
116, 114, 105, 120, 33, 32, 84, 104, 105, 115, 32, 119, 105, 108, 108, 32, 108, 
105, 107, 101, 108, 121, 32, 99, 114, 97, 115, 104, 32, 58, 45, 47, 10, 0};
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9len_denseEv(
-       .param .b64 _ZN14MatrixAccessorIfE9len_denseEv_param_0
-)
-{
-       .reg .b32       %r<4>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE9len_denseEv_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2+4];
-       ld.u32  %r2, [%rd2+8];
-       mul.lo.s32      %r3, %r2, %r1;
-       st.param.b32    [func_retval0+0], %r3;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9pos_denseEj(
-       .param .b64 _ZN14MatrixAccessorIfE9pos_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE9pos_denseEj_param_1
-)
-{
-       .reg .b32       %r<4>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r2, [%rd2+8];
-       mul.lo.s32      %r3, %r2, %r1;
-       st.param.b32    [func_retval0+0], %r3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10cols_denseEj(
-       .param .b64 _ZN14MatrixAccessorIfE10cols_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE10cols_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<4>;
-
-
-       mov.u64         %rd1, $str;
-       cvta.global.u64         %rd2, %rd1;
-       mov.u64         %rd3, 0;
-       // Callseq Start 0
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd2;
-       .param .b64 param1;
-       st.param.b64    [param1+0], %rd3;
-       .param .b32 retval0;
-       call.uni (retval0), 
-       vprintf, 
-       (
-       param0, 
-       param1
-       );
-       ld.param.b32    %r1, [retval0+0];
-       
-       //{
-       }// Callseq End 0
-       st.param.b64    [func_retval0+0], %rd3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_dense_rcEjj(
-       .param .b64 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1,
-       .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2
-)
-{
-       .reg .b32       %r<5>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1];
-       ld.param.u32    %r2, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       ld.u32  %r3, [%rd2+8];
-       mad.lo.s32      %r4, %r3, %r1, %r2;
-       mul.wide.u32    %rd4, %r4, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10vals_denseEj(
-       .param .b64 _ZN14MatrixAccessorIfE10vals_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE10vals_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE13row_len_denseEj(
-       .param .b64 _ZN14MatrixAccessorIfE13row_len_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE13row_len_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE13row_len_denseEj_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2+4];
-       st.param.b32    [func_retval0+0], %r1;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11val_dense_iEj(
-       .param .b64 _ZN14MatrixAccessorIfE11val_dense_iEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE11val_dense_iEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10len_sparseEv(
-       .param .b64 _ZN14MatrixAccessorIfE10len_sparseEv_param_0
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE10len_sparseEv_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2];
-       st.param.b32    [func_retval0+0], %r1;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10pos_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIfE10pos_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE10pos_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+16];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       ld.u32  %r2, [%rd5];
-       st.param.b32    [func_retval0+0], %r2;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11cols_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIfE11cols_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE11cols_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<9>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+24];
-       ld.u64  %rd4, [%rd2+16];
-       mul.wide.u32    %rd5, %r1, 4;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.u32  %r2, [%rd6];
-       mul.wide.u32    %rd7, %r2, 4;
-       add.s64         %rd8, %rd3, %rd7;
-       st.param.b64    [func_retval0+0], %rd8;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE13val_sparse_rcEjj(
-       .param .b64 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_1,
-       .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_2
-)
-{
-       .reg .b64       %rd<4>;
-
-
-       ld.param.u64    %rd1, 
[_ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       st.param.b64    [func_retval0+0], %rd3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11vals_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIfE11vals_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE11vals_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<9>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       ld.u64  %rd4, [%rd2+16];
-       mul.wide.u32    %rd5, %r1, 4;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.u32  %r2, [%rd6];
-       mul.wide.u32    %rd7, %r2, 4;
-       add.s64         %rd8, %rd3, %rd7;
-       st.param.b64    [func_retval0+0], %rd8;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIfE14row_len_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_1
-)
-{
-       .reg .b32       %r<6>;
-       .reg .b64       %rd<8>;
-
-
-       ld.param.u64    %rd1, 
[_ZN14MatrixAccessorIfE14row_len_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE14row_len_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+16];
-       add.s32         %r2, %r1, 1;
-       mul.wide.u32    %rd4, %r2, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       ld.u32  %r3, [%rd5];
-       mul.wide.u32    %rd6, %r1, 4;
-       add.s64         %rd7, %rd3, %rd6;
-       ld.u32  %r4, [%rd7];
-       sub.s32         %r5, %r3, %r4;
-       st.param.b32    [func_retval0+0], %r5;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_sparse_iEj(
-       .param .b64 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func _ZN14MatrixAccessorIfE10set_sparseEjjf(
-       .param .b64 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_0,
-       .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_1,
-       .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_2,
-       .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_3
-)
-{
-       .reg .f32       %f<2>;
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<9>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_1];
-       ld.param.u32    %r2, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_2];
-       ld.param.f32    %f1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_3];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       st.f32  [%rd5], %f1;
-       ld.u64  %rd6, [%rd1];
-       ld.u64  %rd7, [%rd6+24];
-       add.s64         %rd8, %rd7, %rd4;
-       st.u32  [%rd8], %r2;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE9len_denseEv(
-       .param .b64 _ZN14MatrixAccessorIdE9len_denseEv_param_0
-)
-{
-       .reg .b32       %r<4>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE9len_denseEv_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2+4];
-       ld.u32  %r2, [%rd2+8];
-       mul.lo.s32      %r3, %r2, %r1;
-       st.param.b32    [func_retval0+0], %r3;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE9pos_denseEj(
-       .param .b64 _ZN14MatrixAccessorIdE9pos_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE9pos_denseEj_param_1
-)
-{
-       .reg .b32       %r<4>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r2, [%rd2+8];
-       mul.lo.s32      %r3, %r2, %r1;
-       st.param.b32    [func_retval0+0], %r3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE10cols_denseEj(
-       .param .b64 _ZN14MatrixAccessorIdE10cols_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE10cols_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<4>;
-
-
-       mov.u64         %rd1, $str;
-       cvta.global.u64         %rd2, %rd1;
-       mov.u64         %rd3, 0;
-       // Callseq Start 1
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd2;
-       .param .b64 param1;
-       st.param.b64    [param1+0], %rd3;
-       .param .b32 retval0;
-       call.uni (retval0), 
-       vprintf, 
-       (
-       param0, 
-       param1
-       );
-       ld.param.b32    %r1, [retval0+0];
-       
-       //{
-       }// Callseq End 1
-       st.param.b64    [func_retval0+0], %rd3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE12val_dense_rcEjj(
-       .param .b64 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_1,
-       .param .b32 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_2
-)
-{
-       .reg .b32       %r<5>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_1];
-       ld.param.u32    %r2, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_2];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       ld.u32  %r3, [%rd2+8];
-       mad.lo.s32      %r4, %r3, %r1, %r2;
-       mul.wide.u32    %rd4, %r4, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE10vals_denseEj(
-       .param .b64 _ZN14MatrixAccessorIdE10vals_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE10vals_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE13row_len_denseEj(
-       .param .b64 _ZN14MatrixAccessorIdE13row_len_denseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE13row_len_denseEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE13row_len_denseEj_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2+4];
-       st.param.b32    [func_retval0+0], %r1;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11val_dense_iEj(
-       .param .b64 _ZN14MatrixAccessorIdE11val_dense_iEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE11val_dense_iEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE10len_sparseEv(
-       .param .b64 _ZN14MatrixAccessorIdE10len_sparseEv_param_0
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE10len_sparseEv_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u32  %r1, [%rd2];
-       st.param.b32    [func_retval0+0], %r1;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE10pos_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIdE10pos_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE10pos_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+16];
-       mul.wide.u32    %rd4, %r1, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       ld.u32  %r2, [%rd5];
-       st.param.b32    [func_retval0+0], %r2;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11cols_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIdE11cols_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE11cols_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<9>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+24];
-       ld.u64  %rd4, [%rd2+16];
-       mul.wide.u32    %rd5, %r1, 4;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.u32  %r2, [%rd6];
-       mul.wide.u32    %rd7, %r2, 4;
-       add.s64         %rd8, %rd3, %rd7;
-       st.param.b64    [func_retval0+0], %rd8;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE13val_sparse_rcEjj(
-       .param .b64 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_1,
-       .param .b32 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_2
-)
-{
-       .reg .b64       %rd<4>;
-
-
-       ld.param.u64    %rd1, 
[_ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_0];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       st.param.b64    [func_retval0+0], %rd3;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11vals_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIdE11vals_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE11vals_sparseEj_param_1
-)
-{
-       .reg .b32       %r<3>;
-       .reg .b64       %rd<9>;
 
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       ld.u64  %rd4, [%rd2+16];
-       mul.wide.u32    %rd5, %r1, 4;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.u32  %r2, [%rd6];
-       mul.wide.u32    %rd7, %r2, 8;
-       add.s64         %rd8, %rd3, %rd7;
-       st.param.b64    [func_retval0+0], %rd8;
-       ret;
-}
-
-.func  (.param .b32 func_retval0) _ZN14MatrixAccessorIdE14row_len_sparseEj(
-       .param .b64 _ZN14MatrixAccessorIdE14row_len_sparseEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE14row_len_sparseEj_param_1
-)
-{
-       .reg .b32       %r<6>;
-       .reg .b64       %rd<8>;
-
-
-       ld.param.u64    %rd1, 
[_ZN14MatrixAccessorIdE14row_len_sparseEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE14row_len_sparseEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+16];
-       add.s32         %r2, %r1, 1;
-       mul.wide.u32    %rd4, %r2, 4;
-       add.s64         %rd5, %rd3, %rd4;
-       ld.u32  %r3, [%rd5];
-       mul.wide.u32    %rd6, %r1, 4;
-       add.s64         %rd7, %rd3, %rd6;
-       ld.u32  %r4, [%rd7];
-       sub.s32         %r5, %r3, %r4;
-       st.param.b32    [func_retval0+0], %r5;
-       ret;
-}
-
-.func  (.param .b64 func_retval0) _ZN14MatrixAccessorIdE12val_sparse_iEj(
-       .param .b64 _ZN14MatrixAccessorIdE12val_sparse_iEj_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE12val_sparse_iEj_param_1
-)
-{
-       .reg .b32       %r<2>;
-       .reg .b64       %rd<6>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_1];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       st.param.b64    [func_retval0+0], %rd5;
-       ret;
-}
-
-.func _ZN14MatrixAccessorIdE10set_sparseEjjd(
-       .param .b64 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_0,
-       .param .b32 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_1,
-       .param .b32 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_2,
-       .param .b64 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_3
-)
-{
-       .reg .b32       %r<3>;
-       .reg .f64       %fd<2>;
-       .reg .b64       %rd<10>;
-
-
-       ld.param.u64    %rd1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_0];
-       ld.param.u32    %r1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_1];
-       ld.param.u32    %r2, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_2];
-       ld.param.f64    %fd1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_3];
-       ld.u64  %rd2, [%rd1];
-       ld.u64  %rd3, [%rd2+32];
-       mul.wide.u32    %rd4, %r1, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       st.f64  [%rd5], %fd1;
-       ld.u64  %rd6, [%rd1];
-       ld.u64  %rd7, [%rd6+24];
-       mul.wide.u32    %rd8, %r1, 4;
-       add.s64         %rd9, %rd7, %rd8;
-       st.u32  [%rd9], %r2;
-       ret;
-}
-
-       // .globl       double2float_f
 .visible .entry double2float_f(
        .param .u64 double2float_f_param_0,
        .param .u64 double2float_f_param_1,
@@ -674,7 +34,7 @@
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB30_2;
+       @%p1 bra        BB0_2;
 
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
@@ -686,7 +46,7 @@
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f1;
 
-BB30_2:
+BB0_2:
        ret;
 }
 
@@ -712,7 +72,7 @@ BB30_2:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB31_2;
+       @%p1 bra        BB1_2;
 
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 4;
@@ -724,7 +84,7 @@ BB30_2:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd1;
 
-BB31_2:
+BB1_2:
        ret;
 }
 
@@ -735,400 +95,225 @@ BB31_2:
        .param .u32 reduce_sum_f_param_2
 )
 {
-       .local .align 8 .b8     __local_depot32[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
        .reg .pred      %p<25>;
-       .reg .f32       %f<60>;
-       .reg .b32       %r<44>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot32;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_sum_f_param_0];
-       ld.param.u64    %rd16, [reduce_sum_f_param_1];
-       ld.param.u32    %r5, [reduce_sum_f_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB32_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB32_3;
-
-BB32_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB32_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB32_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB32_6;
-
-BB32_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB32_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
-       mov.f32         %f44, 0f00000000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB32_15;
-
-       mov.f32         %f44, 0f00000000;
-
-BB32_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB32_10;
-       bra.uni         BB32_9;
-
-BB32_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB32_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 2
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_2;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 2
-       ld.f32  %f31, [%rd99];
-       add.f32         %f44, %f44, %f31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB32_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB32_13;
-       bra.uni         BB32_12;
-
-BB32_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB32_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 3
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_3 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_3;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 3
-       ld.f32  %f32, [%rd111];
-       add.f32         %f44, %f44, %f32;
-
-BB32_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB32_8;
-
-BB32_15:
-       shl.b32         %r23, %r6, 2;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f32   [%r4], %f44;
+       .reg .f32       %f<69>;
+       .reg .b32       %r<57>;
+       .reg .b64       %rd<36>;
+
+
+       ld.param.u64    %rd9, [reduce_sum_f_param_0];
+       ld.param.u64    %rd10, [reduce_sum_f_param_1];
+       ld.param.u32    %r13, [reduce_sum_f_param_2];
+       mov.u32         %r14, %ctaid.x;
+       shl.b32         %r15, %r14, 1;
+       mov.u32         %r16, %ntid.x;
+       mov.u32         %r17, %tid.x;
+       mad.lo.s32      %r56, %r15, %r16, %r17;
+       mov.f32         %f51, 0f00000000;
+       setp.ge.u32     %p1, %r56, %r13;
+       @%p1 bra        BB2_11;
+
+       cvta.to.global.u64      %rd11, %rd9;
+       ld.global.u64   %rd1, [%rd11+16];
+       setp.eq.s64     %p2, %rd1, 0;
+       ld.global.u64   %rd12, [%rd11+32];
+       cvta.to.global.u64      %rd2, %rd12;
+       mov.f32         %f51, 0f00000000;
+       @%p2 bra        BB2_8;
+
+       mad.lo.s32      %r54, %r15, %r16, %r17;
+       mov.f32         %f51, 0f00000000;
+       mov.u64         %rd32, %rd1;
+
+BB2_3:
+       cvta.to.global.u64      %rd13, %rd32;
+       mul.wide.u32    %rd14, %r54, 4;
+       add.s64         %rd15, %rd13, %rd14;
+       ld.global.u32   %r27, [%rd15];
+       mul.wide.u32    %rd16, %r27, 4;
+       add.s64         %rd17, %rd2, %rd16;
+       ld.global.f32   %f36, [%rd17];
+       add.f32         %f51, %f51, %f36;
+       add.s32         %r55, %r54, %r16;
+       setp.ge.u32     %p3, %r55, %r13;
+       @%p3 bra        BB2_7;
+
+       setp.eq.s64     %p4, %rd32, 0;
+       mov.u64         %rd32, 0;
+       @%p4 bra        BB2_6;
+
+       cvta.to.global.u64      %rd19, %rd1;
+       mul.wide.u32    %rd20, %r55, 4;
+       add.s64         %rd21, %rd19, %rd20;
+       ld.global.u32   %r55, [%rd21];
+       mov.u64         %rd32, %rd1;
+
+BB2_6:
+       mul.wide.u32    %rd22, %r55, 4;
+       add.s64         %rd23, %rd2, %rd22;
+       ld.global.f32   %f37, [%rd23];
+       add.f32         %f51, %f51, %f37;
+
+BB2_7:
+       shl.b32         %r30, %r16, 1;
+       mov.u32         %r31, %nctaid.x;
+       mad.lo.s32      %r54, %r30, %r31, %r54;
+       setp.lt.u32     %p5, %r54, %r13;
+       @%p5 bra        BB2_3;
+       bra.uni         BB2_11;
+
+BB2_8:
+       mul.wide.u32    %rd24, %r56, 4;
+       add.s64         %rd25, %rd2, %rd24;
+       ld.global.f32   %f38, [%rd25];
+       add.f32         %f51, %f51, %f38;
+       add.s32         %r10, %r56, %r16;
+       setp.ge.u32     %p6, %r10, %r13;
+       @%p6 bra        BB2_10;
+
+       mul.wide.u32    %rd26, %r10, 4;
+       add.s64         %rd27, %rd2, %rd26;
+       ld.global.f32   %f39, [%rd27];
+       add.f32         %f51, %f51, %f39;
+
+BB2_10:
+       mov.u32         %r32, %nctaid.x;
+       shl.b32         %r33, %r16, 1;
+       mad.lo.s32      %r56, %r33, %r32, %r56;
+       setp.lt.u32     %p7, %r56, %r13;
+       @%p7 bra        BB2_8;
+
+BB2_11:
+       shl.b32         %r35, %r17, 2;
+       mov.u32         %r36, memory;
+       add.s32         %r12, %r36, %r35;
+       st.shared.f32   [%r12], %f51;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB32_19;
+       setp.lt.u32     %p8, %r16, 1024;
+       @%p8 bra        BB2_15;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB32_18;
+       setp.gt.u32     %p9, %r17, 511;
+       @%p9 bra        BB2_14;
 
-       ld.shared.f32   %f33, [%r4+2048];
-       add.f32         %f44, %f44, %f33;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f40, [%r12+2048];
+       add.f32         %f51, %f51, %f40;
+       st.shared.f32   [%r12], %f51;
 
-BB32_18:
+BB2_14:
        bar.sync        0;
 
-BB32_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB32_23;
+BB2_15:
+       setp.lt.u32     %p10, %r16, 512;
+       @%p10 bra       BB2_19;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB32_22;
+       setp.gt.u32     %p11, %r17, 255;
+       @%p11 bra       BB2_18;
 
-       ld.shared.f32   %f34, [%r4+1024];
-       add.f32         %f44, %f44, %f34;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f41, [%r12+1024];
+       add.f32         %f51, %f51, %f41;
+       st.shared.f32   [%r12], %f51;
 
-BB32_22:
+BB2_18:
        bar.sync        0;
 
-BB32_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB32_27;
+BB2_19:
+       setp.lt.u32     %p12, %r16, 256;
+       @%p12 bra       BB2_23;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB32_26;
+       setp.gt.u32     %p13, %r17, 127;
+       @%p13 bra       BB2_22;
 
-       ld.shared.f32   %f35, [%r4+512];
-       add.f32         %f44, %f44, %f35;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f42, [%r12+512];
+       add.f32         %f51, %f51, %f42;
+       st.shared.f32   [%r12], %f51;
 
-BB32_26:
+BB2_22:
        bar.sync        0;
 
-BB32_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB32_31;
+BB2_23:
+       setp.lt.u32     %p14, %r16, 128;
+       @%p14 bra       BB2_27;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB32_30;
+       setp.gt.u32     %p15, %r17, 63;
+       @%p15 bra       BB2_26;
 
-       ld.shared.f32   %f36, [%r4+256];
-       add.f32         %f44, %f44, %f36;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f43, [%r12+256];
+       add.f32         %f51, %f51, %f43;
+       st.shared.f32   [%r12], %f51;
 
-BB32_30:
+BB2_26:
        bar.sync        0;
 
-BB32_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB32_44;
+BB2_27:
+       setp.gt.u32     %p16, %r17, 31;
+       @%p16 bra       BB2_40;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB32_34;
+       setp.lt.u32     %p17, %r16, 64;
+       @%p17 bra       BB2_30;
 
-       ld.volatile.shared.f32  %f37, [%r4+128];
-       add.f32         %f44, %f44, %f37;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f44, [%r12+128];
+       add.f32         %f51, %f51, %f44;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB32_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB32_36;
+BB2_30:
+       setp.lt.u32     %p18, %r16, 32;
+       @%p18 bra       BB2_32;
 
-       ld.volatile.shared.f32  %f38, [%r4+64];
-       add.f32         %f44, %f44, %f38;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f45, [%r12+64];
+       add.f32         %f51, %f51, %f45;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB32_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB32_38;
+BB2_32:
+       setp.lt.u32     %p19, %r16, 16;
+       @%p19 bra       BB2_34;
 
-       ld.volatile.shared.f32  %f39, [%r4+32];
-       add.f32         %f44, %f44, %f39;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f46, [%r12+32];
+       add.f32         %f51, %f51, %f46;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB32_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB32_40;
+BB2_34:
+       setp.lt.u32     %p20, %r16, 8;
+       @%p20 bra       BB2_36;
 
-       ld.volatile.shared.f32  %f40, [%r4+16];
-       add.f32         %f44, %f44, %f40;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f47, [%r12+16];
+       add.f32         %f51, %f51, %f47;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB32_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB32_42;
+BB2_36:
+       setp.lt.u32     %p21, %r16, 4;
+       @%p21 bra       BB2_38;
 
-       ld.volatile.shared.f32  %f41, [%r4+8];
-       add.f32         %f44, %f44, %f41;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f48, [%r12+8];
+       add.f32         %f51, %f51, %f48;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB32_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB32_44;
+BB2_38:
+       setp.lt.u32     %p22, %r16, 2;
+       @%p22 bra       BB2_40;
 
-       ld.volatile.shared.f32  %f42, [%r4+4];
-       add.f32         %f43, %f44, %f42;
-       st.volatile.shared.f32  [%r4], %f43;
+       ld.volatile.shared.f32  %f49, [%r12+4];
+       add.f32         %f50, %f51, %f49;
+       st.volatile.shared.f32  [%r12], %f50;
 
-BB32_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB32_48;
+BB2_40:
+       setp.ne.s32     %p23, %r17, 0;
+       @%p23 bra       BB2_44;
 
-       ld.shared.f32   %f28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB32_47;
-       bra.uni         BB32_46;
-
-BB32_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB32_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 4
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_4 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_4;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 4
-       st.f32  [%rd119], %f28;
-
-BB32_48:
+       ld.shared.f32   %f32, [memory];
+       cvta.to.global.u64      %rd28, %rd10;
+       ld.global.u64   %rd29, [%rd28+16];
+       ld.global.u64   %rd30, [%rd28+32];
+       cvta.to.global.u64      %rd35, %rd30;
+       setp.ne.s64     %p24, %rd29, 0;
+       @%p24 bra       BB2_43;
+
+       mul.wide.u32    %rd31, %r14, 4;
+       add.s64         %rd35, %rd35, %rd31;
+
+BB2_43:
+       st.global.f32   [%rd35], %f32;
+
+BB2_44:
        ret;
 }
 
@@ -1139,400 +324,225 @@ BB32_48:
        .param .u32 reduce_sum_d_param_2
 )
 {
-       .local .align 8 .b8     __local_depot33[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
        .reg .pred      %p<25>;
-       .reg .b32       %r<44>;
-       .reg .f64       %fd<60>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot33;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_sum_d_param_0];
-       ld.param.u64    %rd16, [reduce_sum_d_param_1];
-       ld.param.u32    %r5, [reduce_sum_d_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB33_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB33_3;
-
-BB33_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB33_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB33_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB33_6;
-
-BB33_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB33_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
-       mov.f64         %fd44, 0d0000000000000000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB33_15;
-
-       mov.f64         %fd44, 0d0000000000000000;
-
-BB33_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB33_10;
-       bra.uni         BB33_9;
-
-BB33_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB33_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 5
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_5 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_5;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 5
-       ld.f64  %fd31, [%rd99];
-       add.f64         %fd44, %fd44, %fd31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB33_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB33_13;
-       bra.uni         BB33_12;
-
-BB33_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB33_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 6
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_6 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_6;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 6
-       ld.f64  %fd32, [%rd111];
-       add.f64         %fd44, %fd44, %fd32;
-
-BB33_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB33_8;
-
-BB33_15:
-       shl.b32         %r23, %r6, 3;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f64   [%r4], %fd44;
+       .reg .b32       %r<57>;
+       .reg .f64       %fd<69>;
+       .reg .b64       %rd<36>;
+
+
+       ld.param.u64    %rd9, [reduce_sum_d_param_0];
+       ld.param.u64    %rd10, [reduce_sum_d_param_1];
+       ld.param.u32    %r13, [reduce_sum_d_param_2];
+       mov.u32         %r14, %ctaid.x;
+       shl.b32         %r15, %r14, 1;
+       mov.u32         %r16, %ntid.x;
+       mov.u32         %r17, %tid.x;
+       mad.lo.s32      %r56, %r15, %r16, %r17;
+       mov.f64         %fd51, 0d0000000000000000;
+       setp.ge.u32     %p1, %r56, %r13;
+       @%p1 bra        BB3_11;
+
+       cvta.to.global.u64      %rd11, %rd9;
+       ld.global.u64   %rd1, [%rd11+16];
+       setp.eq.s64     %p2, %rd1, 0;
+       ld.global.u64   %rd12, [%rd11+32];
+       cvta.to.global.u64      %rd2, %rd12;
+       mov.f64         %fd51, 0d0000000000000000;
+       @%p2 bra        BB3_8;
+
+       mad.lo.s32      %r54, %r15, %r16, %r17;
+       mov.f64         %fd51, 0d0000000000000000;
+       mov.u64         %rd32, %rd1;
+
+BB3_3:
+       cvta.to.global.u64      %rd13, %rd32;
+       mul.wide.u32    %rd14, %r54, 4;
+       add.s64         %rd15, %rd13, %rd14;
+       ld.global.u32   %r27, [%rd15];
+       mul.wide.u32    %rd16, %r27, 8;
+       add.s64         %rd17, %rd2, %rd16;
+       ld.global.f64   %fd36, [%rd17];
+       add.f64         %fd51, %fd51, %fd36;
+       add.s32         %r55, %r54, %r16;
+       setp.ge.u32     %p3, %r55, %r13;
+       @%p3 bra        BB3_7;
+
+       setp.eq.s64     %p4, %rd32, 0;
+       mov.u64         %rd32, 0;
+       @%p4 bra        BB3_6;
+
+       cvta.to.global.u64      %rd19, %rd1;
+       mul.wide.u32    %rd20, %r55, 4;
+       add.s64         %rd21, %rd19, %rd20;
+       ld.global.u32   %r55, [%rd21];
+       mov.u64         %rd32, %rd1;
+
+BB3_6:
+       mul.wide.u32    %rd22, %r55, 8;
+       add.s64         %rd23, %rd2, %rd22;
+       ld.global.f64   %fd37, [%rd23];
+       add.f64         %fd51, %fd51, %fd37;
+
+BB3_7:
+       shl.b32         %r30, %r16, 1;
+       mov.u32         %r31, %nctaid.x;
+       mad.lo.s32      %r54, %r30, %r31, %r54;
+       setp.lt.u32     %p5, %r54, %r13;
+       @%p5 bra        BB3_3;
+       bra.uni         BB3_11;
+
+BB3_8:
+       mul.wide.u32    %rd24, %r56, 8;
+       add.s64         %rd25, %rd2, %rd24;
+       ld.global.f64   %fd38, [%rd25];
+       add.f64         %fd51, %fd51, %fd38;
+       add.s32         %r10, %r56, %r16;
+       setp.ge.u32     %p6, %r10, %r13;
+       @%p6 bra        BB3_10;
+
+       mul.wide.u32    %rd26, %r10, 8;
+       add.s64         %rd27, %rd2, %rd26;
+       ld.global.f64   %fd39, [%rd27];
+       add.f64         %fd51, %fd51, %fd39;
+
+BB3_10:
+       mov.u32         %r32, %nctaid.x;
+       shl.b32         %r33, %r16, 1;
+       mad.lo.s32      %r56, %r33, %r32, %r56;
+       setp.lt.u32     %p7, %r56, %r13;
+       @%p7 bra        BB3_8;
+
+BB3_11:
+       shl.b32         %r35, %r17, 3;
+       mov.u32         %r36, memory;
+       add.s32         %r12, %r36, %r35;
+       st.shared.f64   [%r12], %fd51;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB33_19;
+       setp.lt.u32     %p8, %r16, 1024;
+       @%p8 bra        BB3_15;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB33_18;
+       setp.gt.u32     %p9, %r17, 511;
+       @%p9 bra        BB3_14;
 
-       ld.shared.f64   %fd33, [%r4+4096];
-       add.f64         %fd44, %fd44, %fd33;
-       st.shared.f64   [%r4], %fd44;
+       ld.shared.f64   %fd40, [%r12+4096];
+       add.f64         %fd51, %fd51, %fd40;
+       st.shared.f64   [%r12], %fd51;
 
-BB33_18:
+BB3_14:
        bar.sync        0;
 
-BB33_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB33_23;
+BB3_15:
+       setp.lt.u32     %p10, %r16, 512;
+       @%p10 bra       BB3_19;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB33_22;
+       setp.gt.u32     %p11, %r17, 255;
+       @%p11 bra       BB3_18;
 
-       ld.shared.f64   %fd34, [%r4+2048];
-       add.f64         %fd44, %fd44, %fd34;
-       st.shared.f64   [%r4], %fd44;
+       ld.shared.f64   %fd41, [%r12+2048];
+       add.f64         %fd51, %fd51, %fd41;
+       st.shared.f64   [%r12], %fd51;
 
-BB33_22:
+BB3_18:
        bar.sync        0;
 
-BB33_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB33_27;
+BB3_19:
+       setp.lt.u32     %p12, %r16, 256;
+       @%p12 bra       BB3_23;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB33_26;
+       setp.gt.u32     %p13, %r17, 127;
+       @%p13 bra       BB3_22;
 
-       ld.shared.f64   %fd35, [%r4+1024];
-       add.f64         %fd44, %fd44, %fd35;
-       st.shared.f64   [%r4], %fd44;
+       ld.shared.f64   %fd42, [%r12+1024];
+       add.f64         %fd51, %fd51, %fd42;
+       st.shared.f64   [%r12], %fd51;
 
-BB33_26:
+BB3_22:
        bar.sync        0;
 
-BB33_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB33_31;
+BB3_23:
+       setp.lt.u32     %p14, %r16, 128;
+       @%p14 bra       BB3_27;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB33_30;
+       setp.gt.u32     %p15, %r17, 63;
+       @%p15 bra       BB3_26;
 
-       ld.shared.f64   %fd36, [%r4+512];
-       add.f64         %fd44, %fd44, %fd36;
-       st.shared.f64   [%r4], %fd44;
+       ld.shared.f64   %fd43, [%r12+512];
+       add.f64         %fd51, %fd51, %fd43;
+       st.shared.f64   [%r12], %fd51;
 
-BB33_30:
+BB3_26:
        bar.sync        0;
 
-BB33_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB33_44;
+BB3_27:
+       setp.gt.u32     %p16, %r17, 31;
+       @%p16 bra       BB3_40;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB33_34;
+       setp.lt.u32     %p17, %r16, 64;
+       @%p17 bra       BB3_30;
 
-       ld.volatile.shared.f64  %fd37, [%r4+256];
-       add.f64         %fd44, %fd44, %fd37;
-       st.volatile.shared.f64  [%r4], %fd44;
+       ld.volatile.shared.f64  %fd44, [%r12+256];
+       add.f64         %fd51, %fd51, %fd44;
+       st.volatile.shared.f64  [%r12], %fd51;
 
-BB33_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB33_36;
+BB3_30:
+       setp.lt.u32     %p18, %r16, 32;
+       @%p18 bra       BB3_32;
 
-       ld.volatile.shared.f64  %fd38, [%r4+128];
-       add.f64         %fd44, %fd44, %fd38;
-       st.volatile.shared.f64  [%r4], %fd44;
+       ld.volatile.shared.f64  %fd45, [%r12+128];
+       add.f64         %fd51, %fd51, %fd45;
+       st.volatile.shared.f64  [%r12], %fd51;
 
-BB33_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB33_38;
+BB3_32:
+       setp.lt.u32     %p19, %r16, 16;
+       @%p19 bra       BB3_34;
 
-       ld.volatile.shared.f64  %fd39, [%r4+64];
-       add.f64         %fd44, %fd44, %fd39;
-       st.volatile.shared.f64  [%r4], %fd44;
+       ld.volatile.shared.f64  %fd46, [%r12+64];
+       add.f64         %fd51, %fd51, %fd46;
+       st.volatile.shared.f64  [%r12], %fd51;
 
-BB33_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB33_40;
+BB3_34:
+       setp.lt.u32     %p20, %r16, 8;
+       @%p20 bra       BB3_36;
 
-       ld.volatile.shared.f64  %fd40, [%r4+32];
-       add.f64         %fd44, %fd44, %fd40;
-       st.volatile.shared.f64  [%r4], %fd44;
+       ld.volatile.shared.f64  %fd47, [%r12+32];
+       add.f64         %fd51, %fd51, %fd47;
+       st.volatile.shared.f64  [%r12], %fd51;
 
-BB33_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB33_42;
+BB3_36:
+       setp.lt.u32     %p21, %r16, 4;
+       @%p21 bra       BB3_38;
 
-       ld.volatile.shared.f64  %fd41, [%r4+16];
-       add.f64         %fd44, %fd44, %fd41;
-       st.volatile.shared.f64  [%r4], %fd44;
+       ld.volatile.shared.f64  %fd48, [%r12+16];
+       add.f64         %fd51, %fd51, %fd48;
+       st.volatile.shared.f64  [%r12], %fd51;
 
-BB33_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB33_44;
+BB3_38:
+       setp.lt.u32     %p22, %r16, 2;
+       @%p22 bra       BB3_40;
 
-       ld.volatile.shared.f64  %fd42, [%r4+8];
-       add.f64         %fd43, %fd44, %fd42;
-       st.volatile.shared.f64  [%r4], %fd43;
+       ld.volatile.shared.f64  %fd49, [%r12+8];
+       add.f64         %fd50, %fd51, %fd49;
+       st.volatile.shared.f64  [%r12], %fd50;
 
-BB33_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB33_48;
+BB3_40:
+       setp.ne.s32     %p23, %r17, 0;
+       @%p23 bra       BB3_44;
 
-       ld.shared.f64   %fd28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB33_47;
-       bra.uni         BB33_46;
-
-BB33_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB33_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 7
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_7 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_7;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 7
-       st.f64  [%rd119], %fd28;
-
-BB33_48:
+       ld.shared.f64   %fd32, [memory];
+       cvta.to.global.u64      %rd28, %rd10;
+       ld.global.u64   %rd29, [%rd28+16];
+       ld.global.u64   %rd30, [%rd28+32];
+       cvta.to.global.u64      %rd35, %rd30;
+       setp.ne.s64     %p24, %rd29, 0;
+       @%p24 bra       BB3_43;
+
+       mul.wide.u32    %rd31, %r14, 8;
+       add.s64         %rd35, %rd35, %rd31;
+
+BB3_43:
+       st.global.f64   [%rd35], %fd32;
+
+BB3_44:
        ret;
 }
 
@@ -1543,400 +553,225 @@ BB33_48:
        .param .u32 reduce_max_f_param_2
 )
 {
-       .local .align 8 .b8     __local_depot34[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
        .reg .pred      %p<25>;
-       .reg .f32       %f<60>;
-       .reg .b32       %r<44>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot34;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_max_f_param_0];
-       ld.param.u64    %rd16, [reduce_max_f_param_1];
-       ld.param.u32    %r5, [reduce_max_f_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB34_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB34_3;
-
-BB34_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB34_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB34_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB34_6;
-
-BB34_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB34_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
-       mov.f32         %f44, 0fFF800000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB34_15;
-
-       mov.f32         %f44, 0fFF800000;
-
-BB34_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB34_10;
-       bra.uni         BB34_9;
-
-BB34_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB34_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 8
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_8 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_8;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 8
-       ld.f32  %f31, [%rd99];
-       max.f32         %f44, %f44, %f31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB34_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB34_13;
-       bra.uni         BB34_12;
-
-BB34_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB34_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 9
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_9 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_9;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 9
-       ld.f32  %f32, [%rd111];
-       max.f32         %f44, %f44, %f32;
-
-BB34_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB34_8;
-
-BB34_15:
-       shl.b32         %r23, %r6, 2;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f32   [%r4], %f44;
+       .reg .f32       %f<69>;
+       .reg .b32       %r<57>;
+       .reg .b64       %rd<36>;
+
+
+       ld.param.u64    %rd9, [reduce_max_f_param_0];
+       ld.param.u64    %rd10, [reduce_max_f_param_1];
+       ld.param.u32    %r13, [reduce_max_f_param_2];
+       mov.u32         %r14, %ctaid.x;
+       shl.b32         %r15, %r14, 1;
+       mov.u32         %r16, %ntid.x;
+       mov.u32         %r17, %tid.x;
+       mad.lo.s32      %r56, %r15, %r16, %r17;
+       mov.f32         %f51, 0fFF800000;
+       setp.ge.u32     %p1, %r56, %r13;
+       @%p1 bra        BB4_11;
+
+       cvta.to.global.u64      %rd11, %rd9;
+       ld.global.u64   %rd1, [%rd11+16];
+       setp.eq.s64     %p2, %rd1, 0;
+       ld.global.u64   %rd12, [%rd11+32];
+       cvta.to.global.u64      %rd2, %rd12;
+       mov.f32         %f51, 0fFF800000;
+       @%p2 bra        BB4_8;
+
+       mad.lo.s32      %r54, %r15, %r16, %r17;
+       mov.f32         %f51, 0fFF800000;
+       mov.u64         %rd32, %rd1;
+
+BB4_3:
+       cvta.to.global.u64      %rd13, %rd32;
+       mul.wide.u32    %rd14, %r54, 4;
+       add.s64         %rd15, %rd13, %rd14;
+       ld.global.u32   %r27, [%rd15];
+       mul.wide.u32    %rd16, %r27, 4;
+       add.s64         %rd17, %rd2, %rd16;
+       ld.global.f32   %f36, [%rd17];
+       max.f32         %f51, %f51, %f36;
+       add.s32         %r55, %r54, %r16;
+       setp.ge.u32     %p3, %r55, %r13;
+       @%p3 bra        BB4_7;
+
+       setp.eq.s64     %p4, %rd32, 0;
+       mov.u64         %rd32, 0;
+       @%p4 bra        BB4_6;
+
+       cvta.to.global.u64      %rd19, %rd1;
+       mul.wide.u32    %rd20, %r55, 4;
+       add.s64         %rd21, %rd19, %rd20;
+       ld.global.u32   %r55, [%rd21];
+       mov.u64         %rd32, %rd1;
+
+BB4_6:
+       mul.wide.u32    %rd22, %r55, 4;
+       add.s64         %rd23, %rd2, %rd22;
+       ld.global.f32   %f37, [%rd23];
+       max.f32         %f51, %f51, %f37;
+
+BB4_7:
+       shl.b32         %r30, %r16, 1;
+       mov.u32         %r31, %nctaid.x;
+       mad.lo.s32      %r54, %r30, %r31, %r54;
+       setp.lt.u32     %p5, %r54, %r13;
+       @%p5 bra        BB4_3;
+       bra.uni         BB4_11;
+
+BB4_8:
+       mul.wide.u32    %rd24, %r56, 4;
+       add.s64         %rd25, %rd2, %rd24;
+       ld.global.f32   %f38, [%rd25];
+       max.f32         %f51, %f51, %f38;
+       add.s32         %r10, %r56, %r16;
+       setp.ge.u32     %p6, %r10, %r13;
+       @%p6 bra        BB4_10;
+
+       mul.wide.u32    %rd26, %r10, 4;
+       add.s64         %rd27, %rd2, %rd26;
+       ld.global.f32   %f39, [%rd27];
+       max.f32         %f51, %f51, %f39;
+
+BB4_10:
+       mov.u32         %r32, %nctaid.x;
+       shl.b32         %r33, %r16, 1;
+       mad.lo.s32      %r56, %r33, %r32, %r56;
+       setp.lt.u32     %p7, %r56, %r13;
+       @%p7 bra        BB4_8;
+
+BB4_11:
+       shl.b32         %r35, %r17, 2;
+       mov.u32         %r36, memory;
+       add.s32         %r12, %r36, %r35;
+       st.shared.f32   [%r12], %f51;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB34_19;
+       setp.lt.u32     %p8, %r16, 1024;
+       @%p8 bra        BB4_15;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB34_18;
+       setp.gt.u32     %p9, %r17, 511;
+       @%p9 bra        BB4_14;
 
-       ld.shared.f32   %f33, [%r4+2048];
-       max.f32         %f44, %f44, %f33;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f40, [%r12+2048];
+       max.f32         %f51, %f51, %f40;
+       st.shared.f32   [%r12], %f51;
 
-BB34_18:
+BB4_14:
        bar.sync        0;
 
-BB34_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB34_23;
+BB4_15:
+       setp.lt.u32     %p10, %r16, 512;
+       @%p10 bra       BB4_19;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB34_22;
+       setp.gt.u32     %p11, %r17, 255;
+       @%p11 bra       BB4_18;
 
-       ld.shared.f32   %f34, [%r4+1024];
-       max.f32         %f44, %f44, %f34;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f41, [%r12+1024];
+       max.f32         %f51, %f51, %f41;
+       st.shared.f32   [%r12], %f51;
 
-BB34_22:
+BB4_18:
        bar.sync        0;
 
-BB34_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB34_27;
+BB4_19:
+       setp.lt.u32     %p12, %r16, 256;
+       @%p12 bra       BB4_23;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB34_26;
+       setp.gt.u32     %p13, %r17, 127;
+       @%p13 bra       BB4_22;
 
-       ld.shared.f32   %f35, [%r4+512];
-       max.f32         %f44, %f44, %f35;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f42, [%r12+512];
+       max.f32         %f51, %f51, %f42;
+       st.shared.f32   [%r12], %f51;
 
-BB34_26:
+BB4_22:
        bar.sync        0;
 
-BB34_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB34_31;
+BB4_23:
+       setp.lt.u32     %p14, %r16, 128;
+       @%p14 bra       BB4_27;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB34_30;
+       setp.gt.u32     %p15, %r17, 63;
+       @%p15 bra       BB4_26;
 
-       ld.shared.f32   %f36, [%r4+256];
-       max.f32         %f44, %f44, %f36;
-       st.shared.f32   [%r4], %f44;
+       ld.shared.f32   %f43, [%r12+256];
+       max.f32         %f51, %f51, %f43;
+       st.shared.f32   [%r12], %f51;
 
-BB34_30:
+BB4_26:
        bar.sync        0;
 
-BB34_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB34_44;
+BB4_27:
+       setp.gt.u32     %p16, %r17, 31;
+       @%p16 bra       BB4_40;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB34_34;
+       setp.lt.u32     %p17, %r16, 64;
+       @%p17 bra       BB4_30;
 
-       ld.volatile.shared.f32  %f37, [%r4+128];
-       max.f32         %f44, %f44, %f37;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f44, [%r12+128];
+       max.f32         %f51, %f51, %f44;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB34_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB34_36;
+BB4_30:
+       setp.lt.u32     %p18, %r16, 32;
+       @%p18 bra       BB4_32;
 
-       ld.volatile.shared.f32  %f38, [%r4+64];
-       max.f32         %f44, %f44, %f38;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f45, [%r12+64];
+       max.f32         %f51, %f51, %f45;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB34_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB34_38;
+BB4_32:
+       setp.lt.u32     %p19, %r16, 16;
+       @%p19 bra       BB4_34;
 
-       ld.volatile.shared.f32  %f39, [%r4+32];
-       max.f32         %f44, %f44, %f39;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f46, [%r12+32];
+       max.f32         %f51, %f51, %f46;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB34_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB34_40;
+BB4_34:
+       setp.lt.u32     %p20, %r16, 8;
+       @%p20 bra       BB4_36;
 
-       ld.volatile.shared.f32  %f40, [%r4+16];
-       max.f32         %f44, %f44, %f40;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f47, [%r12+16];
+       max.f32         %f51, %f51, %f47;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB34_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB34_42;
+BB4_36:
+       setp.lt.u32     %p21, %r16, 4;
+       @%p21 bra       BB4_38;
 
-       ld.volatile.shared.f32  %f41, [%r4+8];
-       max.f32         %f44, %f44, %f41;
-       st.volatile.shared.f32  [%r4], %f44;
+       ld.volatile.shared.f32  %f48, [%r12+8];
+       max.f32         %f51, %f51, %f48;
+       st.volatile.shared.f32  [%r12], %f51;
 
-BB34_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB34_44;
+BB4_38:
+       setp.lt.u32     %p22, %r16, 2;
+       @%p22 bra       BB4_40;
 
-       ld.volatile.shared.f32  %f42, [%r4+4];
-       max.f32         %f43, %f44, %f42;
-       st.volatile.shared.f32  [%r4], %f43;
+       ld.volatile.shared.f32  %f49, [%r12+4];
+       max.f32         %f50, %f51, %f49;
+       st.volatile.shared.f32  [%r12], %f50;
 
-BB34_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB34_48;
+BB4_40:
+       setp.ne.s32     %p23, %r17, 0;
+       @%p23 bra       BB4_44;
 
-       ld.shared.f32   %f28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB34_47;
-       bra.uni         BB34_46;
-
-BB34_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB34_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 10
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_10 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_10;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 10
-       st.f32  [%rd119], %f28;
-
-BB34_48:
+       ld.shared.f32   %f32, [memory];
+       cvta.to.global.u64      %rd28, %rd10;
+       ld.global.u64   %rd29, [%rd28+16];
+       ld.global.u64   %rd30, [%rd28+32];
+       cvta.to.global.u64      %rd35, %rd30;
+       setp.ne.s64     %p24, %rd29, 0;
+       @%p24 bra       BB4_43;
+
+       mul.wide.u32    %rd31, %r14, 4;
+       add.s64         %rd35, %rd35, %rd31;
+
+BB4_43:
+       st.global.f32   [%rd35], %f32;
+
+BB4_44:
        ret;
 }
 
@@ -1947,400 +782,206 @@ BB34_48:
        .param .u32 reduce_max_d_param_2
 )
 {
-       .local .align 8 .b8     __local_depot35[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
-       .reg .pred      %p<25>;
-       .reg .b32       %r<44>;
+       .reg .pred      %p<23>;
+       .reg .b32       %r<46>;
        .reg .f64       %fd<60>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot35;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_max_d_param_0];
-       ld.param.u64    %rd16, [reduce_max_d_param_1];
-       ld.param.u32    %r5, [reduce_max_d_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB35_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB35_3;
-
-BB35_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB35_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB35_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB35_6;
-
-BB35_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB35_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
-       mov.f64         %fd44, 0dFFF0000000000000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB35_15;
+       .reg .b64       %rd<34>;
+
 
+       ld.param.u64    %rd10, [reduce_max_d_param_0];
+       ld.param.u64    %rd11, [reduce_max_d_param_1];
+       ld.param.u32    %r10, [reduce_max_d_param_2];
+       mov.u32         %r11, %tid.x;
+       mov.u32         %r12, %ctaid.x;
+       shl.b32         %r13, %r12, 1;
+       mov.u32         %r14, %ntid.x;
+       mad.lo.s32      %r43, %r13, %r14, %r11;
        mov.f64         %fd44, 0dFFF0000000000000;
+       setp.ge.u32     %p1, %r43, %r10;
+       @%p1 bra        BB5_9;
 
-BB35_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB35_10;
-       bra.uni         BB35_9;
-
-BB35_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB35_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 11
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_11 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_11;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 11
-       ld.f64  %fd31, [%rd99];
+       cvta.to.global.u64      %rd12, %rd10;
+       ld.global.u64   %rd1, [%rd12+16];
+       ld.global.u64   %rd13, [%rd12+32];
+       cvta.to.global.u64      %rd2, %rd13;
+       mov.f64         %fd44, 0dFFF0000000000000;
+       mov.u64         %rd30, %rd1;
+
+BB5_2:
+       setp.eq.s64     %p2, %rd1, 0;
+       mov.u32         %r44, %r43;
+       @%p2 bra        BB5_4;
+
+       cvta.to.global.u64      %rd14, %rd1;
+       mul.wide.u32    %rd15, %r43, 4;
+       add.s64         %rd16, %rd14, %rd15;
+       ld.global.u32   %r44, [%rd16];
+       mov.u64         %rd30, %rd1;
+
+BB5_4:
+       mul.wide.u32    %rd17, %r44, 8;
+       add.s64         %rd18, %rd2, %rd17;
+       ld.global.f64   %fd31, [%rd18];
        max.f64         %fd44, %fd44, %fd31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB35_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB35_13;
-       bra.uni         BB35_12;
-
-BB35_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB35_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 12
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_12 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_12;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 12
-       ld.f64  %fd32, [%rd111];
+       add.s32         %r45, %r43, %r14;
+       setp.ge.u32     %p3, %r45, %r10;
+       @%p3 bra        BB5_8;
+
+       setp.eq.s64     %p4, %rd30, 0;
+       mov.u64         %rd30, 0;
+       @%p4 bra        BB5_7;
+
+       cvta.to.global.u64      %rd20, %rd1;
+       add.s32         %r19, %r43, %r14;
+       mul.wide.u32    %rd21, %r19, 4;
+       add.s64         %rd22, %rd20, %rd21;
+       ld.global.u32   %r45, [%rd22];
+       mov.u64         %rd30, %rd1;
+
+BB5_7:
+       mul.wide.u32    %rd23, %r45, 8;
+       add.s64         %rd24, %rd2, %rd23;
+       ld.global.f64   %fd32, [%rd24];
        max.f64         %fd44, %fd44, %fd32;
 
-BB35_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB35_8;
-
-BB35_15:
-       shl.b32         %r23, %r6, 3;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f64   [%r4], %fd44;
+BB5_8:
+       shl.b32         %r21, %r14, 1;
+       mov.u32         %r22, %nctaid.x;
+       mad.lo.s32      %r43, %r21, %r22, %r43;
+       setp.lt.u32     %p5, %r43, %r10;
+       @%p5 bra        BB5_2;
+
+BB5_9:
+       shl.b32         %r24, %r11, 3;
+       mov.u32         %r25, memory;
+       add.s32         %r9, %r25, %r24;
+       st.shared.f64   [%r9], %fd44;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB35_19;
+       setp.lt.u32     %p6, %r14, 1024;
+       @%p6 bra        BB5_13;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB35_18;
+       setp.gt.u32     %p7, %r11, 511;
+       @%p7 bra        BB5_12;
 
-       ld.shared.f64   %fd33, [%r4+4096];
+       ld.shared.f64   %fd33, [%r9+4096];
        max.f64         %fd44, %fd44, %fd33;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB35_18:
+BB5_12:
        bar.sync        0;
 
-BB35_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB35_23;
+BB5_13:
+       setp.lt.u32     %p8, %r14, 512;
+       @%p8 bra        BB5_17;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB35_22;
+       setp.gt.u32     %p9, %r11, 255;
+       @%p9 bra        BB5_16;
 
-       ld.shared.f64   %fd34, [%r4+2048];
+       ld.shared.f64   %fd34, [%r9+2048];
        max.f64         %fd44, %fd44, %fd34;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB35_22:
+BB5_16:
        bar.sync        0;
 
-BB35_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB35_27;
+BB5_17:
+       setp.lt.u32     %p10, %r14, 256;
+       @%p10 bra       BB5_21;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB35_26;
+       setp.gt.u32     %p11, %r11, 127;
+       @%p11 bra       BB5_20;
 
-       ld.shared.f64   %fd35, [%r4+1024];
+       ld.shared.f64   %fd35, [%r9+1024];
        max.f64         %fd44, %fd44, %fd35;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB35_26:
+BB5_20:
        bar.sync        0;
 
-BB35_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB35_31;
+BB5_21:
+       setp.lt.u32     %p12, %r14, 128;
+       @%p12 bra       BB5_25;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB35_30;
+       setp.gt.u32     %p13, %r11, 63;
+       @%p13 bra       BB5_24;
 
-       ld.shared.f64   %fd36, [%r4+512];
+       ld.shared.f64   %fd36, [%r9+512];
        max.f64         %fd44, %fd44, %fd36;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB35_30:
+BB5_24:
        bar.sync        0;
 
-BB35_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB35_44;
+BB5_25:
+       setp.gt.u32     %p14, %r11, 31;
+       @%p14 bra       BB5_38;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB35_34;
+       setp.lt.u32     %p15, %r14, 64;
+       @%p15 bra       BB5_28;
 
-       ld.volatile.shared.f64  %fd37, [%r4+256];
+       ld.volatile.shared.f64  %fd37, [%r9+256];
        max.f64         %fd44, %fd44, %fd37;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB35_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB35_36;
+BB5_28:
+       setp.lt.u32     %p16, %r14, 32;
+       @%p16 bra       BB5_30;
 
-       ld.volatile.shared.f64  %fd38, [%r4+128];
+       ld.volatile.shared.f64  %fd38, [%r9+128];
        max.f64         %fd44, %fd44, %fd38;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB35_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB35_38;
+BB5_30:
+       setp.lt.u32     %p17, %r14, 16;
+       @%p17 bra       BB5_32;
 
-       ld.volatile.shared.f64  %fd39, [%r4+64];
+       ld.volatile.shared.f64  %fd39, [%r9+64];
        max.f64         %fd44, %fd44, %fd39;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB35_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB35_40;
+BB5_32:
+       setp.lt.u32     %p18, %r14, 8;
+       @%p18 bra       BB5_34;
 
-       ld.volatile.shared.f64  %fd40, [%r4+32];
+       ld.volatile.shared.f64  %fd40, [%r9+32];
        max.f64         %fd44, %fd44, %fd40;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB35_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB35_42;
+BB5_34:
+       setp.lt.u32     %p19, %r14, 4;
+       @%p19 bra       BB5_36;
 
-       ld.volatile.shared.f64  %fd41, [%r4+16];
+       ld.volatile.shared.f64  %fd41, [%r9+16];
        max.f64         %fd44, %fd44, %fd41;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB35_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB35_44;
+BB5_36:
+       setp.lt.u32     %p20, %r14, 2;
+       @%p20 bra       BB5_38;
 
-       ld.volatile.shared.f64  %fd42, [%r4+8];
+       ld.volatile.shared.f64  %fd42, [%r9+8];
        max.f64         %fd43, %fd44, %fd42;
-       st.volatile.shared.f64  [%r4], %fd43;
+       st.volatile.shared.f64  [%r9], %fd43;
 
-BB35_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB35_48;
+BB5_38:
+       setp.ne.s32     %p21, %r11, 0;
+       @%p21 bra       BB5_42;
 
        ld.shared.f64   %fd28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB35_47;
-       bra.uni         BB35_46;
-
-BB35_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB35_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 13
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_13 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_13;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 13
-       st.f64  [%rd119], %fd28;
-
-BB35_48:
+       cvta.to.global.u64      %rd25, %rd11;
+       ld.global.u64   %rd26, [%rd25+16];
+       ld.global.u64   %rd27, [%rd25+32];
+       cvta.to.global.u64      %rd33, %rd27;
+       setp.ne.s64     %p22, %rd26, 0;
+       @%p22 bra       BB5_41;
+
+       mul.wide.u32    %rd28, %r12, 8;
+       add.s64         %rd33, %rd33, %rd28;
+
+BB5_41:
+       st.global.f64   [%rd33], %fd28;
+
+BB5_42:
        ret;
 }
 
@@ -2351,400 +992,206 @@ BB35_48:
        .param .u32 reduce_min_f_param_2
 )
 {
-       .local .align 8 .b8     __local_depot36[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
-       .reg .pred      %p<25>;
+       .reg .pred      %p<23>;
        .reg .f32       %f<60>;
-       .reg .b32       %r<44>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot36;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_min_f_param_0];
-       ld.param.u64    %rd16, [reduce_min_f_param_1];
-       ld.param.u32    %r5, [reduce_min_f_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB36_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB36_3;
-
-BB36_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB36_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB36_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB36_6;
-
-BB36_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB36_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
+       .reg .b32       %r<46>;
+       .reg .b64       %rd<34>;
+
+
+       ld.param.u64    %rd10, [reduce_min_f_param_0];
+       ld.param.u64    %rd11, [reduce_min_f_param_1];
+       ld.param.u32    %r10, [reduce_min_f_param_2];
+       mov.u32         %r11, %tid.x;
+       mov.u32         %r12, %ctaid.x;
+       shl.b32         %r13, %r12, 1;
+       mov.u32         %r14, %ntid.x;
+       mad.lo.s32      %r43, %r13, %r14, %r11;
        mov.f32         %f44, 0f7F800000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB36_15;
+       setp.ge.u32     %p1, %r43, %r10;
+       @%p1 bra        BB6_9;
 
+       cvta.to.global.u64      %rd12, %rd10;
+       ld.global.u64   %rd1, [%rd12+16];
+       ld.global.u64   %rd13, [%rd12+32];
+       cvta.to.global.u64      %rd2, %rd13;
        mov.f32         %f44, 0f7F800000;
-
-BB36_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB36_10;
-       bra.uni         BB36_9;
-
-BB36_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB36_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 14
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_14 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_14;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 14
-       ld.f32  %f31, [%rd99];
+       mov.u64         %rd30, %rd1;
+
+BB6_2:
+       setp.eq.s64     %p2, %rd1, 0;
+       mov.u32         %r44, %r43;
+       @%p2 bra        BB6_4;
+
+       cvta.to.global.u64      %rd14, %rd1;
+       mul.wide.u32    %rd15, %r43, 4;
+       add.s64         %rd16, %rd14, %rd15;
+       ld.global.u32   %r44, [%rd16];
+       mov.u64         %rd30, %rd1;
+
+BB6_4:
+       mul.wide.u32    %rd17, %r44, 4;
+       add.s64         %rd18, %rd2, %rd17;
+       ld.global.f32   %f31, [%rd18];
        min.f32         %f44, %f44, %f31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB36_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB36_13;
-       bra.uni         BB36_12;
-
-BB36_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB36_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 15
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_15 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_15;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 15
-       ld.f32  %f32, [%rd111];
+       add.s32         %r45, %r43, %r14;
+       setp.ge.u32     %p3, %r45, %r10;
+       @%p3 bra        BB6_8;
+
+       setp.eq.s64     %p4, %rd30, 0;
+       mov.u64         %rd30, 0;
+       @%p4 bra        BB6_7;
+
+       cvta.to.global.u64      %rd20, %rd1;
+       add.s32         %r19, %r43, %r14;
+       mul.wide.u32    %rd21, %r19, 4;
+       add.s64         %rd22, %rd20, %rd21;
+       ld.global.u32   %r45, [%rd22];
+       mov.u64         %rd30, %rd1;
+
+BB6_7:
+       mul.wide.u32    %rd23, %r45, 4;
+       add.s64         %rd24, %rd2, %rd23;
+       ld.global.f32   %f32, [%rd24];
        min.f32         %f44, %f44, %f32;
 
-BB36_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB36_8;
-
-BB36_15:
-       shl.b32         %r23, %r6, 2;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f32   [%r4], %f44;
+BB6_8:
+       shl.b32         %r21, %r14, 1;
+       mov.u32         %r22, %nctaid.x;
+       mad.lo.s32      %r43, %r21, %r22, %r43;
+       setp.lt.u32     %p5, %r43, %r10;
+       @%p5 bra        BB6_2;
+
+BB6_9:
+       shl.b32         %r24, %r11, 2;
+       mov.u32         %r25, memory;
+       add.s32         %r9, %r25, %r24;
+       st.shared.f32   [%r9], %f44;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB36_19;
+       setp.lt.u32     %p6, %r14, 1024;
+       @%p6 bra        BB6_13;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB36_18;
+       setp.gt.u32     %p7, %r11, 511;
+       @%p7 bra        BB6_12;
 
-       ld.shared.f32   %f33, [%r4+2048];
+       ld.shared.f32   %f33, [%r9+2048];
        min.f32         %f44, %f44, %f33;
-       st.shared.f32   [%r4], %f44;
+       st.shared.f32   [%r9], %f44;
 
-BB36_18:
+BB6_12:
        bar.sync        0;
 
-BB36_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB36_23;
+BB6_13:
+       setp.lt.u32     %p8, %r14, 512;
+       @%p8 bra        BB6_17;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB36_22;
+       setp.gt.u32     %p9, %r11, 255;
+       @%p9 bra        BB6_16;
 
-       ld.shared.f32   %f34, [%r4+1024];
+       ld.shared.f32   %f34, [%r9+1024];
        min.f32         %f44, %f44, %f34;
-       st.shared.f32   [%r4], %f44;
+       st.shared.f32   [%r9], %f44;
 
-BB36_22:
+BB6_16:
        bar.sync        0;
 
-BB36_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB36_27;
+BB6_17:
+       setp.lt.u32     %p10, %r14, 256;
+       @%p10 bra       BB6_21;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB36_26;
+       setp.gt.u32     %p11, %r11, 127;
+       @%p11 bra       BB6_20;
 
-       ld.shared.f32   %f35, [%r4+512];
+       ld.shared.f32   %f35, [%r9+512];
        min.f32         %f44, %f44, %f35;
-       st.shared.f32   [%r4], %f44;
+       st.shared.f32   [%r9], %f44;
 
-BB36_26:
+BB6_20:
        bar.sync        0;
 
-BB36_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB36_31;
+BB6_21:
+       setp.lt.u32     %p12, %r14, 128;
+       @%p12 bra       BB6_25;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB36_30;
+       setp.gt.u32     %p13, %r11, 63;
+       @%p13 bra       BB6_24;
 
-       ld.shared.f32   %f36, [%r4+256];
+       ld.shared.f32   %f36, [%r9+256];
        min.f32         %f44, %f44, %f36;
-       st.shared.f32   [%r4], %f44;
+       st.shared.f32   [%r9], %f44;
 
-BB36_30:
+BB6_24:
        bar.sync        0;
 
-BB36_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB36_44;
+BB6_25:
+       setp.gt.u32     %p14, %r11, 31;
+       @%p14 bra       BB6_38;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB36_34;
+       setp.lt.u32     %p15, %r14, 64;
+       @%p15 bra       BB6_28;
 
-       ld.volatile.shared.f32  %f37, [%r4+128];
+       ld.volatile.shared.f32  %f37, [%r9+128];
        min.f32         %f44, %f44, %f37;
-       st.volatile.shared.f32  [%r4], %f44;
+       st.volatile.shared.f32  [%r9], %f44;
 
-BB36_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB36_36;
+BB6_28:
+       setp.lt.u32     %p16, %r14, 32;
+       @%p16 bra       BB6_30;
 
-       ld.volatile.shared.f32  %f38, [%r4+64];
+       ld.volatile.shared.f32  %f38, [%r9+64];
        min.f32         %f44, %f44, %f38;
-       st.volatile.shared.f32  [%r4], %f44;
+       st.volatile.shared.f32  [%r9], %f44;
 
-BB36_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB36_38;
+BB6_30:
+       setp.lt.u32     %p17, %r14, 16;
+       @%p17 bra       BB6_32;
 
-       ld.volatile.shared.f32  %f39, [%r4+32];
+       ld.volatile.shared.f32  %f39, [%r9+32];
        min.f32         %f44, %f44, %f39;
-       st.volatile.shared.f32  [%r4], %f44;
+       st.volatile.shared.f32  [%r9], %f44;
 
-BB36_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB36_40;
+BB6_32:
+       setp.lt.u32     %p18, %r14, 8;
+       @%p18 bra       BB6_34;
 
-       ld.volatile.shared.f32  %f40, [%r4+16];
+       ld.volatile.shared.f32  %f40, [%r9+16];
        min.f32         %f44, %f44, %f40;
-       st.volatile.shared.f32  [%r4], %f44;
+       st.volatile.shared.f32  [%r9], %f44;
 
-BB36_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB36_42;
+BB6_34:
+       setp.lt.u32     %p19, %r14, 4;
+       @%p19 bra       BB6_36;
 
-       ld.volatile.shared.f32  %f41, [%r4+8];
+       ld.volatile.shared.f32  %f41, [%r9+8];
        min.f32         %f44, %f44, %f41;
-       st.volatile.shared.f32  [%r4], %f44;
+       st.volatile.shared.f32  [%r9], %f44;
 
-BB36_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB36_44;
+BB6_36:
+       setp.lt.u32     %p20, %r14, 2;
+       @%p20 bra       BB6_38;
 
-       ld.volatile.shared.f32  %f42, [%r4+4];
+       ld.volatile.shared.f32  %f42, [%r9+4];
        min.f32         %f43, %f44, %f42;
-       st.volatile.shared.f32  [%r4], %f43;
+       st.volatile.shared.f32  [%r9], %f43;
 
-BB36_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB36_48;
+BB6_38:
+       setp.ne.s32     %p21, %r11, 0;
+       @%p21 bra       BB6_42;
 
        ld.shared.f32   %f28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB36_47;
-       bra.uni         BB36_46;
-
-BB36_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB36_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 16
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_16 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_16;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 16
-       st.f32  [%rd119], %f28;
-
-BB36_48:
+       cvta.to.global.u64      %rd25, %rd11;
+       ld.global.u64   %rd26, [%rd25+16];
+       ld.global.u64   %rd27, [%rd25+32];
+       cvta.to.global.u64      %rd33, %rd27;
+       setp.ne.s64     %p22, %rd26, 0;
+       @%p22 bra       BB6_41;
+
+       mul.wide.u32    %rd28, %r12, 4;
+       add.s64         %rd33, %rd33, %rd28;
+
+BB6_41:
+       st.global.f32   [%rd33], %f28;
+
+BB6_42:
        ret;
 }
 
@@ -2755,400 +1202,206 @@ BB36_48:
        .param .u32 reduce_min_d_param_2
 )
 {
-       .local .align 8 .b8     __local_depot37[272];
-       .reg .b64       %SP;
-       .reg .b64       %SPL;
-       .reg .pred      %p<25>;
-       .reg .b32       %r<44>;
+       .reg .pred      %p<23>;
+       .reg .b32       %r<46>;
        .reg .f64       %fd<60>;
-       .reg .b64       %rd<123>;
-
-
-       mov.u64         %SPL, __local_depot37;
-       cvta.local.u64  %SP, %SPL;
-       ld.param.u64    %rd17, [reduce_min_d_param_0];
-       ld.param.u64    %rd16, [reduce_min_d_param_1];
-       ld.param.u32    %r5, [reduce_min_d_param_2];
-       add.u64         %rd18, %SP, 0;
-       add.u64         %rd1, %SPL, 0;
-       st.local.u64    [%rd1], %rd17;
-       cvta.to.global.u64      %rd19, %rd17;
-       ld.global.u64   %rd20, [%rd19+16];
-       setp.eq.s64     %p1, %rd20, 0;
-       @%p1 bra        BB37_2;
-
-       mov.u64         %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd1+8], %rd21;
-       mov.u64         %rd23, 0;
-       st.local.u64    [%rd1+16], %rd23;
-       mov.u64         %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd1+40], %rd24;
-       st.local.u64    [%rd1+48], %rd23;
-       mov.u64         %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd1+56], %rd26;
-       st.local.u64    [%rd1+64], %rd23;
-       mov.u64         %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd1+88], %rd28;
-       st.local.u64    [%rd1+96], %rd23;
-       mov.u64         %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd1+104], %rd30;
-       st.local.u64    [%rd1+112], %rd23;
-       mov.u64         %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd1+24], %rd32;
-       st.local.u64    [%rd1+32], %rd23;
-       mov.u64         %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd1+72], %rd34;
-       st.local.u64    [%rd1+80], %rd23;
-       mov.u64         %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd1+120], %rd36;
-       st.local.u64    [%rd1+128], %rd23;
-       bra.uni         BB37_3;
-
-BB37_2:
-       mov.u64         %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd1+8], %rd38;
-       mov.u64         %rd40, 0;
-       st.local.u64    [%rd1+16], %rd40;
-       mov.u64         %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd1+40], %rd41;
-       st.local.u64    [%rd1+48], %rd40;
-       mov.u64         %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd1+56], %rd43;
-       st.local.u64    [%rd1+64], %rd40;
-       mov.u64         %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd1+88], %rd45;
-       st.local.u64    [%rd1+96], %rd40;
-       mov.u64         %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd1+104], %rd47;
-       st.local.u64    [%rd1+112], %rd40;
-       mov.u64         %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd1+24], %rd49;
-       st.local.u64    [%rd1+32], %rd40;
-       mov.u64         %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd1+72], %rd51;
-       st.local.u64    [%rd1+80], %rd40;
-
-BB37_3:
-       add.u64         %rd53, %SP, 136;
-       add.u64         %rd2, %SPL, 136;
-       st.local.u64    [%rd2], %rd16;
-       cvta.to.global.u64      %rd54, %rd16;
-       ld.global.u64   %rd55, [%rd54+16];
-       setp.eq.s64     %p2, %rd55, 0;
-       @%p2 bra        BB37_5;
-
-       mov.u64         %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
-       st.local.u64    [%rd2+8], %rd56;
-       mov.u64         %rd58, 0;
-       st.local.u64    [%rd2+16], %rd58;
-       mov.u64         %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
-       st.local.u64    [%rd2+40], %rd59;
-       st.local.u64    [%rd2+48], %rd58;
-       mov.u64         %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
-       st.local.u64    [%rd2+56], %rd61;
-       st.local.u64    [%rd2+64], %rd58;
-       mov.u64         %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
-       st.local.u64    [%rd2+88], %rd63;
-       st.local.u64    [%rd2+96], %rd58;
-       mov.u64         %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
-       st.local.u64    [%rd2+104], %rd65;
-       st.local.u64    [%rd2+112], %rd58;
-       mov.u64         %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
-       st.local.u64    [%rd2+24], %rd67;
-       st.local.u64    [%rd2+32], %rd58;
-       mov.u64         %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
-       st.local.u64    [%rd2+72], %rd69;
-       st.local.u64    [%rd2+80], %rd58;
-       mov.u64         %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
-       st.local.u64    [%rd2+120], %rd71;
-       st.local.u64    [%rd2+128], %rd58;
-       bra.uni         BB37_6;
-
-BB37_5:
-       mov.u64         %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
-       st.local.u64    [%rd2+8], %rd73;
-       mov.u64         %rd75, 0;
-       st.local.u64    [%rd2+16], %rd75;
-       mov.u64         %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
-       st.local.u64    [%rd2+40], %rd76;
-       st.local.u64    [%rd2+48], %rd75;
-       mov.u64         %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
-       st.local.u64    [%rd2+56], %rd78;
-       st.local.u64    [%rd2+64], %rd75;
-       mov.u64         %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
-       st.local.u64    [%rd2+88], %rd80;
-       st.local.u64    [%rd2+96], %rd75;
-       mov.u64         %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
-       st.local.u64    [%rd2+104], %rd82;
-       st.local.u64    [%rd2+112], %rd75;
-       mov.u64         %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
-       st.local.u64    [%rd2+24], %rd84;
-       st.local.u64    [%rd2+32], %rd75;
-       mov.u64         %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
-       st.local.u64    [%rd2+72], %rd86;
-       st.local.u64    [%rd2+80], %rd75;
-
-BB37_6:
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r43, %r8, %r9, %r6;
-       mov.f64         %fd44, 0d7FF0000000000000;
-       setp.ge.u32     %p3, %r43, %r5;
-       @%p3 bra        BB37_15;
+       .reg .b64       %rd<34>;
+
 
+       ld.param.u64    %rd10, [reduce_min_d_param_0];
+       ld.param.u64    %rd11, [reduce_min_d_param_1];
+       ld.param.u32    %r10, [reduce_min_d_param_2];
+       mov.u32         %r11, %tid.x;
+       mov.u32         %r12, %ctaid.x;
+       shl.b32         %r13, %r12, 1;
+       mov.u32         %r14, %ntid.x;
+       mad.lo.s32      %r43, %r13, %r14, %r11;
        mov.f64         %fd44, 0d7FF0000000000000;
+       setp.ge.u32     %p1, %r43, %r10;
+       @%p1 bra        BB7_9;
 
-BB37_8:
-       ld.local.u64    %rd3, [%rd1+112];
-       ld.local.u64    %rd120, [%rd1+104];
-       and.b64         %rd90, %rd120, 1;
-       setp.eq.b64     %p4, %rd90, 1;
-       @!%p4 bra       BB37_10;
-       bra.uni         BB37_9;
-
-BB37_9:
-       add.s64         %rd93, %rd1, %rd3;
-       ld.local.u64    %rd94, [%rd93];
-       add.s64         %rd95, %rd120, %rd94;
-       ld.u64  %rd120, [%rd95+-1];
-
-BB37_10:
-       add.s64         %rd97, %rd18, %rd3;
-       // Callseq Start 17
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd97;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r43;
-       .param .b64 retval0;
-       prototype_17 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd120, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_17;
-       ld.param.b64    %rd99, [retval0+0];
-       
-       //{
-       }// Callseq End 17
-       ld.f64  %fd31, [%rd99];
+       cvta.to.global.u64      %rd12, %rd10;
+       ld.global.u64   %rd1, [%rd12+16];
+       ld.global.u64   %rd13, [%rd12+32];
+       cvta.to.global.u64      %rd2, %rd13;
+       mov.f64         %fd44, 0d7FF0000000000000;
+       mov.u64         %rd30, %rd1;
+
+BB7_2:
+       setp.eq.s64     %p2, %rd1, 0;
+       mov.u32         %r44, %r43;
+       @%p2 bra        BB7_4;
+
+       cvta.to.global.u64      %rd14, %rd1;
+       mul.wide.u32    %rd15, %r43, 4;
+       add.s64         %rd16, %rd14, %rd15;
+       ld.global.u32   %r44, [%rd16];
+       mov.u64         %rd30, %rd1;
+
+BB7_4:
+       mul.wide.u32    %rd17, %r44, 8;
+       add.s64         %rd18, %rd2, %rd17;
+       ld.global.f64   %fd31, [%rd18];
        min.f64         %fd44, %fd44, %fd31;
-       add.s32         %r16, %r43, %r9;
-       setp.ge.u32     %p5, %r16, %r5;
-       @%p5 bra        BB37_14;
-
-       ld.local.u64    %rd121, [%rd1+104];
-       and.b64         %rd102, %rd121, 1;
-       setp.eq.b64     %p6, %rd102, 1;
-       ld.local.u64    %rd8, [%rd1+112];
-       @!%p6 bra       BB37_13;
-       bra.uni         BB37_12;
-
-BB37_12:
-       add.s64         %rd105, %rd1, %rd8;
-       ld.local.u64    %rd106, [%rd105];
-       add.s64         %rd107, %rd121, %rd106;
-       ld.u64  %rd121, [%rd107+-1];
-
-BB37_13:
-       add.s64         %rd109, %rd18, %rd8;
-       // Callseq Start 18
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd109;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r16;
-       .param .b64 retval0;
-       prototype_18 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _) ;
-       call (retval0), 
-       %rd121, 
-       (
-       param0, 
-       param1
-       )
-       , prototype_18;
-       ld.param.b64    %rd111, [retval0+0];
-       
-       //{
-       }// Callseq End 18
-       ld.f64  %fd32, [%rd111];
+       add.s32         %r45, %r43, %r14;
+       setp.ge.u32     %p3, %r45, %r10;
+       @%p3 bra        BB7_8;
+
+       setp.eq.s64     %p4, %rd30, 0;
+       mov.u64         %rd30, 0;
+       @%p4 bra        BB7_7;
+
+       cvta.to.global.u64      %rd20, %rd1;
+       add.s32         %r19, %r43, %r14;
+       mul.wide.u32    %rd21, %r19, 4;
+       add.s64         %rd22, %rd20, %rd21;
+       ld.global.u32   %r45, [%rd22];
+       mov.u64         %rd30, %rd1;
+
+BB7_7:
+       mul.wide.u32    %rd23, %r45, 8;
+       add.s64         %rd24, %rd2, %rd23;
+       ld.global.f64   %fd32, [%rd24];
        min.f64         %fd44, %fd44, %fd32;
 
-BB37_14:
-       shl.b32         %r20, %r9, 1;
-       mov.u32         %r21, %nctaid.x;
-       mad.lo.s32      %r43, %r20, %r21, %r43;
-       setp.lt.u32     %p7, %r43, %r5;
-       @%p7 bra        BB37_8;
-
-BB37_15:
-       shl.b32         %r23, %r6, 3;
-       mov.u32         %r24, memory;
-       add.s32         %r4, %r24, %r23;
-       st.shared.f64   [%r4], %fd44;
+BB7_8:
+       shl.b32         %r21, %r14, 1;
+       mov.u32         %r22, %nctaid.x;
+       mad.lo.s32      %r43, %r21, %r22, %r43;
+       setp.lt.u32     %p5, %r43, %r10;
+       @%p5 bra        BB7_2;
+
+BB7_9:
+       shl.b32         %r24, %r11, 3;
+       mov.u32         %r25, memory;
+       add.s32         %r9, %r25, %r24;
+       st.shared.f64   [%r9], %fd44;
        bar.sync        0;
-       setp.lt.u32     %p8, %r9, 1024;
-       @%p8 bra        BB37_19;
+       setp.lt.u32     %p6, %r14, 1024;
+       @%p6 bra        BB7_13;
 
-       setp.gt.u32     %p9, %r6, 511;
-       @%p9 bra        BB37_18;
+       setp.gt.u32     %p7, %r11, 511;
+       @%p7 bra        BB7_12;
 
-       ld.shared.f64   %fd33, [%r4+4096];
+       ld.shared.f64   %fd33, [%r9+4096];
        min.f64         %fd44, %fd44, %fd33;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB37_18:
+BB7_12:
        bar.sync        0;
 
-BB37_19:
-       setp.lt.u32     %p10, %r9, 512;
-       @%p10 bra       BB37_23;
+BB7_13:
+       setp.lt.u32     %p8, %r14, 512;
+       @%p8 bra        BB7_17;
 
-       setp.gt.u32     %p11, %r6, 255;
-       @%p11 bra       BB37_22;
+       setp.gt.u32     %p9, %r11, 255;
+       @%p9 bra        BB7_16;
 
-       ld.shared.f64   %fd34, [%r4+2048];
+       ld.shared.f64   %fd34, [%r9+2048];
        min.f64         %fd44, %fd44, %fd34;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB37_22:
+BB7_16:
        bar.sync        0;
 
-BB37_23:
-       setp.lt.u32     %p12, %r9, 256;
-       @%p12 bra       BB37_27;
+BB7_17:
+       setp.lt.u32     %p10, %r14, 256;
+       @%p10 bra       BB7_21;
 
-       setp.gt.u32     %p13, %r6, 127;
-       @%p13 bra       BB37_26;
+       setp.gt.u32     %p11, %r11, 127;
+       @%p11 bra       BB7_20;
 
-       ld.shared.f64   %fd35, [%r4+1024];
+       ld.shared.f64   %fd35, [%r9+1024];
        min.f64         %fd44, %fd44, %fd35;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB37_26:
+BB7_20:
        bar.sync        0;
 
-BB37_27:
-       setp.lt.u32     %p14, %r9, 128;
-       @%p14 bra       BB37_31;
+BB7_21:
+       setp.lt.u32     %p12, %r14, 128;
+       @%p12 bra       BB7_25;
 
-       setp.gt.u32     %p15, %r6, 63;
-       @%p15 bra       BB37_30;
+       setp.gt.u32     %p13, %r11, 63;
+       @%p13 bra       BB7_24;
 
-       ld.shared.f64   %fd36, [%r4+512];
+       ld.shared.f64   %fd36, [%r9+512];
        min.f64         %fd44, %fd44, %fd36;
-       st.shared.f64   [%r4], %fd44;
+       st.shared.f64   [%r9], %fd44;
 
-BB37_30:
+BB7_24:
        bar.sync        0;
 
-BB37_31:
-       setp.gt.u32     %p16, %r6, 31;
-       @%p16 bra       BB37_44;
+BB7_25:
+       setp.gt.u32     %p14, %r11, 31;
+       @%p14 bra       BB7_38;
 
-       setp.lt.u32     %p17, %r9, 64;
-       @%p17 bra       BB37_34;
+       setp.lt.u32     %p15, %r14, 64;
+       @%p15 bra       BB7_28;
 
-       ld.volatile.shared.f64  %fd37, [%r4+256];
+       ld.volatile.shared.f64  %fd37, [%r9+256];
        min.f64         %fd44, %fd44, %fd37;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB37_34:
-       setp.lt.u32     %p18, %r9, 32;
-       @%p18 bra       BB37_36;
+BB7_28:
+       setp.lt.u32     %p16, %r14, 32;
+       @%p16 bra       BB7_30;
 
-       ld.volatile.shared.f64  %fd38, [%r4+128];
+       ld.volatile.shared.f64  %fd38, [%r9+128];
        min.f64         %fd44, %fd44, %fd38;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB37_36:
-       setp.lt.u32     %p19, %r9, 16;
-       @%p19 bra       BB37_38;
+BB7_30:
+       setp.lt.u32     %p17, %r14, 16;
+       @%p17 bra       BB7_32;
 
-       ld.volatile.shared.f64  %fd39, [%r4+64];
+       ld.volatile.shared.f64  %fd39, [%r9+64];
        min.f64         %fd44, %fd44, %fd39;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB37_38:
-       setp.lt.u32     %p20, %r9, 8;
-       @%p20 bra       BB37_40;
+BB7_32:
+       setp.lt.u32     %p18, %r14, 8;
+       @%p18 bra       BB7_34;
 
-       ld.volatile.shared.f64  %fd40, [%r4+32];
+       ld.volatile.shared.f64  %fd40, [%r9+32];
        min.f64         %fd44, %fd44, %fd40;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB37_40:
-       setp.lt.u32     %p21, %r9, 4;
-       @%p21 bra       BB37_42;
+BB7_34:
+       setp.lt.u32     %p19, %r14, 4;
+       @%p19 bra       BB7_36;
 
-       ld.volatile.shared.f64  %fd41, [%r4+16];
+       ld.volatile.shared.f64  %fd41, [%r9+16];
        min.f64         %fd44, %fd44, %fd41;
-       st.volatile.shared.f64  [%r4], %fd44;
+       st.volatile.shared.f64  [%r9], %fd44;
 
-BB37_42:
-       setp.lt.u32     %p22, %r9, 2;
-       @%p22 bra       BB37_44;
+BB7_36:
+       setp.lt.u32     %p20, %r14, 2;
+       @%p20 bra       BB7_38;
 
-       ld.volatile.shared.f64  %fd42, [%r4+8];
+       ld.volatile.shared.f64  %fd42, [%r9+8];
        min.f64         %fd43, %fd44, %fd42;
-       st.volatile.shared.f64  [%r4], %fd43;
+       st.volatile.shared.f64  [%r9], %fd43;
 
-BB37_44:
-       setp.ne.s32     %p23, %r6, 0;
-       @%p23 bra       BB37_48;
+BB7_38:
+       setp.ne.s32     %p21, %r11, 0;
+       @%p21 bra       BB7_42;
 
        ld.shared.f64   %fd28, [memory];
-       ld.local.u64    %rd114, [%rd2+96];
-       add.s64         %rd11, %rd2, %rd114;
-       add.s64         %rd12, %rd53, %rd114;
-       ld.local.u64    %rd122, [%rd2+88];
-       and.b64         %rd115, %rd122, 1;
-       setp.eq.b64     %p24, %rd115, 1;
-       @!%p24 bra      BB37_47;
-       bra.uni         BB37_46;
-
-BB37_46:
-       ld.local.u64    %rd116, [%rd11];
-       add.s64         %rd117, %rd122, %rd116;
-       ld.u64  %rd122, [%rd117+-1];
-
-BB37_47:
-       mov.u32         %r42, 0;
-       // Callseq Start 19
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.b64    [param0+0], %rd12;
-       .param .b32 param1;
-       st.param.b32    [param1+0], %r42;
-       .param .b32 param2;
-       st.param.b32    [param2+0], %r7;
-       .param .b64 retval0;
-       prototype_19 : .callprototype (.param .b64 _) _ (.param .b64 _, .param 
.b32 _, .param .b32 _) ;
-       call (retval0), 
-       %rd122, 
-       (
-       param0, 
-       param1, 
-       param2
-       )
-       , prototype_19;
-       ld.param.b64    %rd119, [retval0+0];
-       
-       //{
-       }// Callseq End 19
-       st.f64  [%rd119], %fd28;
-
-BB37_48:
+       cvta.to.global.u64      %rd25, %rd11;
+       ld.global.u64   %rd26, [%rd25+16];
+       ld.global.u64   %rd27, [%rd25+32];
+       cvta.to.global.u64      %rd33, %rd27;
+       setp.ne.s64     %p22, %rd26, 0;
+       @%p22 bra       BB7_41;
+
+       mul.wide.u32    %rd28, %r12, 8;
+       add.s64         %rd33, %rd33, %rd28;
+
+BB7_41:
+       st.global.f64   [%rd33], %fd28;
+
+BB7_42:
        ret;
 }
 

Reply via email to