This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 3ade4f2 [MINOR] Spoof CUDA binaries for Windows & Linux;
3ade4f2 is described below
commit 3ade4f29b4614db70f4c574534b113f08901a7f4
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Mar 3 16:12:51 2021 +0100
[MINOR] Spoof CUDA binaries for Windows & Linux;
* Fixing platform-specific compilation issues
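
The change itself: the generic FloorOp/CeilOp templates in operators.cuh called the
double-precision floor/ceil for every element type; the patch replaces them with
explicit float/double specializations that dispatch to floorf/ceilf versus
floor/ceil, presumably the platform-specific compilation issues referenced above.
A minimal self-contained sketch of that pattern follows; the two-argument exec
signature matches the uniform operator interface in the patch (unary ops ignore b),
while the main function and sample values are illustrative only, not part of the
commit:

    #include <cstdio>
    #include <cuda_runtime.h>

    template<typename T>
    struct FloorOp {
        // Generic template intentionally left undefined: an unsupported T
        // fails at link time instead of silently using the wrong overload.
        __device__ __forceinline__ static T exec(T a, T b);
    };
    template<>
    struct FloorOp<double> {
        __device__ __forceinline__ static double exec(double a, double b) {
            return floor(a);            // double-precision device function
        }
    };
    template<>
    struct FloorOp<float> {
        __device__ __forceinline__ static float exec(float a, float b) {
            return floorf(a);           // single-precision device function
        }
    };

    template<typename T>
    __global__ void floor_kernel(const T *in, T *out, unsigned int size) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < size)
            out[i] = FloorOp<T>::exec(in[i], in[i]);  // unary op: b is ignored
    }

    int main() {
        const unsigned int n = 4;
        float h_in[n] = {1.5f, -1.5f, 2.9f, -0.1f}, h_out[n];
        float *d_in, *d_out;
        cudaMalloc(&d_in, n * sizeof(float));
        cudaMalloc(&d_out, n * sizeof(float));
        cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
        floor_kernel<float><<<1, 32>>>(d_in, d_out, n);
        cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
        for (unsigned int i = 0; i < n; ++i)
            printf("floorf(%f) = %f\n", h_in[i], h_out[i]);
        cudaFree(d_in); cudaFree(d_out);
        return 0;
    }

Keeping the (a, b) signature even for unary ops lets the generated cellwise code
call every operator through the same interface, as the hunks below show.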
---
.../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 265584 -> 303688 bytes
.../lib/libsystemds_spoof_cuda-Windows-AMD64.dll | Bin 222208 -> 244736 bytes
src/main/cuda/headers/operators.cuh | 28 +-
src/main/cuda/kernels/SystemDS.cu | 20 +-
src/main/cuda/kernels/SystemDS.ptx | 449 ++--
src/main/cuda/kernels/reduction.ptx | 2343 ++++++++++++++++----
src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 4 +-
src/main/cuda/spoof-launcher/SpoofCUDAContext.h | 7 +-
src/main/cuda/spoof-launcher/SpoofCellwise.h | 18 +-
src/main/cuda/spoof-launcher/SpoofOperator.h | 4 +-
src/main/cuda/spoof-launcher/SpoofRowwise.h | 6 +-
11 files changed, 2220 insertions(+), 659 deletions(-)
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index 368ba5f..5bb044f 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll
index 5d67d8e..bdf0a4f 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll differ
diff --git a/src/main/cuda/headers/operators.cuh b/src/main/cuda/headers/operators.cuh
index 3ac14a4..c88a19d 100644
--- a/src/main/cuda/headers/operators.cuh
+++ b/src/main/cuda/headers/operators.cuh
@@ -49,18 +49,42 @@ struct RoundOp {
template<typename T>
struct FloorOp {
- __device__ __forceinline__ static T exec(T a, T b) {
+ __device__ __forceinline__ static T exec(T a, T b);
+};
+
+template<>
+struct FloorOp<double> {
+ __device__ __forceinline__ static double exec(double a, double b) {
return floor(a);
}
};
+template<>
+struct FloorOp<float> {
+ __device__ __forceinline__ static float exec(float a, float b) {
+ return floorf(a);
+ }
+};
+
template<typename T>
struct CeilOp {
- __device__ __forceinline__ static T exec(T a, T b) {
+ __device__ __forceinline__ static T exec(T a, T b);
+};
+
+template<>
+struct CeilOp<double> {
+ __device__ __forceinline__ static double exec(double a, double b) {
return ceil(a);
}
};
+template<>
+struct CeilOp<float> {
+ __device__ __forceinline__ static float exec(float a, float b) {
+ return ceilf(a);
+ }
+};
+
template<typename T>
struct ExpOp {
__device__ __forceinline__ static T exec(T a, T b) {
diff --git a/src/main/cuda/kernels/SystemDS.cu b/src/main/cuda/kernels/SystemDS.cu
index 52e2b33..3c0c821 100644
--- a/src/main/cuda/kernels/SystemDS.cu
+++ b/src/main/cuda/kernels/SystemDS.cu
@@ -34,6 +34,7 @@ using uint = unsigned int;
#include "cum_min.cuh"
#include "cum_max.cuh"
#include "cum_sum_prod.cuh"
+#include "operators.cuh"
/**
* This method performs an im2col operation on sparse input image
@@ -471,7 +472,7 @@ __forceinline__ __device__ T binaryOp(T x, T y, int op) {
if (isnan(v) || isinf(v)) {
return v;
} else {
- v = floor(v);
+ v = FloorOp<T>::exec(v, v);
}
return x - v * y;
}
@@ -480,7 +481,7 @@ __forceinline__ __device__ T binaryOp(T x, T y, int op) {
if (isnan(v) || isinf(v)) {
return v;
} else {
- return floor(v);
+ return FloorOp<T>::exec(v, v);
}
}
default:
@@ -1546,13 +1547,24 @@ extern "C" __global__ void matrix_log_f(float *A, float *C, unsigned int size) {
* @param siz the length of the input and output matrices
*/
template<typename T>
-__device__ void matrix_floor(T *A, T *C, unsigned int size) {
+__device__ void matrix_floor(T* A, T* C, unsigned int size);
+
+template<>
+__device__ void matrix_floor<double>(double* A, double* C, unsigned int size) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < size) {
C[index] = floor(A[index]);
}
}
+template<>
+__device__ void matrix_floor<float>(float* A, float* C, unsigned int size) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < size) {
+ C[index] = floorf(A[index]);
+ }
+}
+
extern "C" __global__ void matrix_floor_d(double *A, double *C,
unsigned int size) {
matrix_floor(A, C, size);
@@ -1573,7 +1585,7 @@ template<typename T>
__device__ void matrix_ceil(T *A, T *C, unsigned int size) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < size) {
- C[index] = ceil(A[index]);
+ C[index] = CeilOp<T>::exec(A[index], A[index]);
}
}
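
A side note on the wrapper shape visible above: the logic lives in the
matrix_floor<T> template, while the extern "C" matrix_floor_d / matrix_floor_f
entry points only forward to it. The unmangled C names are what the host side
can resolve from a compiled module at run time. A minimal driver-API sketch of
that lookup, assuming a standard CUDA toolchain (link with -lcuda); the module
path, sizes, and omitted error handling are illustrative, not SystemDS's actual
launcher code:

    #include <cuda.h>

    int main() {
        cuInit(0);
        CUdevice dev;  cuDeviceGet(&dev, 0);
        CUcontext ctx; cuCtxCreate(&ctx, 0, dev);

        CUmodule mod;
        cuModuleLoad(&mod, "SystemDS.ptx");   // e.g. the PTX file patched below

        // Resolves only because the wrapper is extern "C" (no C++ mangling):
        CUfunction kern;
        cuModuleGetFunction(&kern, mod, "matrix_floor_d");

        unsigned int size = 1024;
        CUdeviceptr A, C;
        cuMemAlloc(&A, size * sizeof(double));
        cuMemAlloc(&C, size * sizeof(double));

        void *args[] = {&A, &C, &size};
        cuLaunchKernel(kern, (size + 255) / 256, 1, 1,  // grid dims
                       256, 1, 1,                       // block dims
                       0, NULL, args, NULL);            // smem, stream, args
        cuCtxSynchronize();

        cuMemFree(A); cuMemFree(C);
        cuModuleUnload(mod);
        cuCtxDestroy(ctx);
        return 0;
    }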
diff --git a/src/main/cuda/kernels/SystemDS.ptx b/src/main/cuda/kernels/SystemDS.ptx
index ee355bf..b5ca8de 100644
--- a/src/main/cuda/kernels/SystemDS.ptx
+++ b/src/main/cuda/kernels/SystemDS.ptx
@@ -9190,7 +9190,7 @@ BB75_35:
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<58>;
- .reg .b64 %rd<10>;
+ .reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_mean_d_param_0];
@@ -9338,13 +9338,12 @@ BB76_33:
@%p19 bra BB76_35;
ld.shared.f64 %fd40, [memory];
- cvt.u64.u32 %rd6, %r4;
- cvt.rn.f64.s64 %fd41, %rd6;
+ cvt.rn.f64.s32 %fd41, %r4;
div.rn.f64 %fd42, %fd40, %fd41;
- cvta.to.global.u64 %rd7, %rd2;
- mul.wide.u32 %rd8, %r6, 8;
- add.s64 %rd9, %rd7, %rd8;
- st.global.f64 [%rd9], %fd42;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd42;
BB76_35:
ret;
@@ -9361,7 +9360,7 @@ BB76_35:
.reg .pred %p<20>;
.reg .f32 %f<58>;
.reg .b32 %r<72>;
- .reg .b64 %rd<10>;
+ .reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_mean_f_param_0];
@@ -9509,13 +9508,12 @@ BB77_33:
@%p19 bra BB77_35;
ld.shared.f32 %f40, [memory];
- cvt.u64.u32 %rd6, %r4;
- cvt.rn.f32.s64 %f41, %rd6;
+ cvt.rn.f32.s32 %f41, %r4;
div.rn.f32 %f42, %f40, %f41;
- cvta.to.global.u64 %rd7, %rd2;
- mul.wide.u32 %rd8, %r6, 4;
- add.s64 %rd9, %rd7, %rd8;
- st.global.f32 [%rd9], %f42;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f42;
BB77_35:
ret;
@@ -9532,7 +9530,7 @@ BB77_35:
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<11>;
- .reg .b64 %rd<10>;
+ .reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_mean_d_param_0];
@@ -9564,13 +9562,12 @@ BB78_3:
@%p3 bra BB78_3;
BB78_4:
- cvt.u64.u32 %rd6, %r5;
- cvt.rn.f64.s64 %fd7, %rd6;
+ cvt.rn.f64.s32 %fd7, %r5;
div.rn.f64 %fd8, %fd10, %fd7;
- cvta.to.global.u64 %rd7, %rd3;
- mul.wide.u32 %rd8, %r1, 8;
- add.s64 %rd9, %rd7, %rd8;
- st.global.f64 [%rd9], %fd8;
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8;
BB78_5:
ret;
@@ -9587,7 +9584,7 @@ BB78_5:
.reg .pred %p<4>;
.reg .f32 %f<11>;
.reg .b32 %r<11>;
- .reg .b64 %rd<10>;
+ .reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_mean_f_param_0];
@@ -9619,13 +9616,12 @@ BB79_3:
@%p3 bra BB79_3;
BB79_4:
- cvt.u64.u32 %rd6, %r5;
- cvt.rn.f32.s64 %f7, %rd6;
+ cvt.rn.f32.s32 %f7, %r5;
div.rn.f32 %f8, %f10, %f7;
- cvta.to.global.u64 %rd7, %rd3;
- mul.wide.u32 %rd8, %r1, 4;
- add.s64 %rd9, %rd7, %rd8;
- st.global.f32 [%rd9], %f8;
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f8;
BB79_5:
ret;
@@ -10598,7 +10594,7 @@ BB94_11:
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
- .reg .b32 %r<69>;
+ .reg .b32 %r<70>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -10606,12 +10602,12 @@ BB94_11:
mov.u64 %SPL, __local_depot95;
ld.param.u64 %rd7, [matrix_sin_f_param_0];
ld.param.u64 %rd8, [matrix_sin_f_param_1];
- ld.param.u32 %r29, [matrix_sin_f_param_2];
- mov.u32 %r30, %ntid.x;
- mov.u32 %r31, %ctaid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r1, %r30, %r31, %r32;
- setp.ge.u32 %p1, %r1, %r29;
+ ld.param.u32 %r30, [matrix_sin_f_param_2];
+ mov.u32 %r31, %ntid.x;
+ mov.u32 %r32, %ctaid.x;
+ mov.u32 %r33, %tid.x;
+ mad.lo.s32 %r1, %r31, %r32, %r33;
+ setp.ge.u32 %p1, %r1, %r30;
@%p1 bra BB95_17;
cvta.to.global.u64 %rd9, %rd7;
@@ -10620,8 +10616,8 @@ BB94_11:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r68, %f15;
- cvt.rn.f32.s32 %f16, %r68;
+ cvt.rni.s32.f32 %r69, %f15;
+ cvt.rn.f32.s32 %f16, %r69;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
@@ -10643,95 +10639,96 @@ BB95_11:
BB95_3:
mov.b32 %r3, %f1;
- shl.b32 %r35, %r3, 8;
- or.b32 %r4, %r35, -2147483648;
- mov.u32 %r62, 0;
+ shr.u32 %r4, %r3, 23;
+ shl.b32 %r36, %r3, 8;
+ or.b32 %r5, %r36, -2147483648;
+ mov.u32 %r63, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r61, -6;
+ mov.u32 %r62, -6;
mov.u64 %rd23, %rd1;
BB95_4:
.pragma "nounroll";
- ld.const.u32 %r38, [%rd22];
+ ld.const.u32 %r39, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r36, %r38, %r4, %r62;
- madc.hi.u32 %r62, %r38, %r4, 0;
+ mad.lo.cc.u32 %r37, %r39, %r5, %r63;
+ madc.hi.u32 %r63, %r39, %r5, 0;
}
// inline asm
- st.local.u32 [%rd23], %r36;
+ st.local.u32 [%rd23], %r37;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r61, %r61, 1;
- setp.ne.s32 %p4, %r61, 0;
+ add.s32 %r62, %r62, 1;
+ setp.ne.s32 %p4, %r62, 0;
@%p4 bra BB95_4;
- bfe.u32 %r41, %r3, 23, 8;
- add.s32 %r42, %r41, -128;
- shr.u32 %r43, %r42, 5;
- and.b32 %r9, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r62;
- bfe.u32 %r10, %r3, 23, 5;
- mov.u32 %r44, 6;
- sub.s32 %r45, %r44, %r43;
- mul.wide.s32 %rd14, %r45, 4;
+ and.b32 %r42, %r4, 255;
+ add.s32 %r43, %r42, -128;
+ shr.u32 %r44, %r43, 5;
+ and.b32 %r10, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r63;
+ mov.u32 %r45, 6;
+ sub.s32 %r46, %r45, %r44;
+ mul.wide.s32 %rd14, %r46, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r64, [%rd6];
- ld.local.u32 %r63, [%rd6+-4];
- setp.eq.s32 %p5, %r10, 0;
+ ld.local.u32 %r65, [%rd6];
+ ld.local.u32 %r64, [%rd6+-4];
+ and.b32 %r13, %r4, 31;
+ setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB95_7;
- mov.u32 %r46, 32;
- sub.s32 %r47, %r46, %r10;
- shr.u32 %r48, %r63, %r47;
- shl.b32 %r49, %r64, %r10;
- add.s32 %r64, %r48, %r49;
- ld.local.u32 %r50, [%rd6+-8];
- shr.u32 %r51, %r50, %r47;
- shl.b32 %r52, %r63, %r10;
- add.s32 %r63, %r51, %r52;
+ mov.u32 %r47, 32;
+ sub.s32 %r48, %r47, %r13;
+ shr.u32 %r49, %r64, %r48;
+ shl.b32 %r50, %r65, %r13;
+ add.s32 %r65, %r49, %r50;
+ ld.local.u32 %r51, [%rd6+-8];
+ shr.u32 %r52, %r51, %r48;
+ shl.b32 %r53, %r64, %r13;
+ add.s32 %r64, %r52, %r53;
BB95_7:
- shr.u32 %r53, %r63, 30;
- shl.b32 %r54, %r64, 2;
- add.s32 %r66, %r54, %r53;
- shl.b32 %r18, %r63, 2;
- shr.u32 %r55, %r66, 31;
- shr.u32 %r56, %r64, 30;
- add.s32 %r19, %r55, %r56;
- setp.eq.s32 %p6, %r55, 0;
+ shr.u32 %r54, %r64, 30;
+ shl.b32 %r55, %r65, 2;
+ add.s32 %r67, %r55, %r54;
+ shl.b32 %r19, %r64, 2;
+ shr.u32 %r56, %r67, 31;
+ shr.u32 %r57, %r65, 30;
+ add.s32 %r20, %r56, %r57;
+ setp.eq.s32 %p6, %r56, 0;
@%p6 bra BB95_8;
- not.b32 %r57, %r66;
- neg.s32 %r65, %r18;
- setp.eq.s32 %p7, %r18, 0;
- selp.u32 %r58, 1, 0, %p7;
- add.s32 %r66, %r58, %r57;
- xor.b32 %r67, %r9, -2147483648;
+ not.b32 %r58, %r67;
+ neg.s32 %r66, %r19;
+ setp.eq.s32 %p7, %r19, 0;
+ selp.u32 %r59, 1, 0, %p7;
+ add.s32 %r67, %r59, %r58;
+ xor.b32 %r68, %r10, -2147483648;
bra.uni BB95_10;
BB95_8:
- mov.u32 %r65, %r18;
- mov.u32 %r67, %r9;
+ mov.u32 %r66, %r19;
+ mov.u32 %r68, %r10;
BB95_10:
- cvt.u64.u32 %rd15, %r66;
+ cvt.u64.u32 %rd15, %r67;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r65;
+ cvt.u64.u32 %rd17, %r66;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
- setp.eq.s32 %p8, %r67, 0;
+ setp.eq.s32 %p8, %r68, 0;
selp.f32 %f35, %f22, %f23, %p8;
- setp.eq.s32 %p9, %r9, 0;
- neg.s32 %r59, %r19;
- selp.b32 %r68, %r19, %r59, %p9;
+ setp.eq.s32 %p9, %r10, 0;
+ neg.s32 %r60, %r20;
+ selp.b32 %r69, %r20, %r60, %p9;
BB95_12:
- and.b32 %r28, %r68, 1;
- setp.eq.s32 %p10, %r28, 0;
+ and.b32 %r29, %r69, 1;
+ setp.eq.s32 %p10, %r29, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
@@ -10749,8 +10746,8 @@ BB95_14:
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
- and.b32 %r60, %r68, 2;
- setp.eq.s32 %p12, %r60, 0;
+ and.b32 %r61, %r69, 2;
+ setp.eq.s32 %p12, %r61, 0;
@%p12 bra BB95_16;
mov.f32 %f34, 0fBF800000;
@@ -11145,7 +11142,7 @@ BB98_11:
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
- .reg .b32 %r<70>;
+ .reg .b32 %r<71>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -11153,12 +11150,12 @@ BB98_11:
mov.u64 %SPL, __local_depot99;
ld.param.u64 %rd7, [matrix_cos_f_param_0];
ld.param.u64 %rd8, [matrix_cos_f_param_1];
- ld.param.u32 %r30, [matrix_cos_f_param_2];
- mov.u32 %r31, %ntid.x;
- mov.u32 %r32, %ctaid.x;
- mov.u32 %r33, %tid.x;
- mad.lo.s32 %r1, %r31, %r32, %r33;
- setp.ge.u32 %p1, %r1, %r30;
+ ld.param.u32 %r31, [matrix_cos_f_param_2];
+ mov.u32 %r32, %ntid.x;
+ mov.u32 %r33, %ctaid.x;
+ mov.u32 %r34, %tid.x;
+ mad.lo.s32 %r1, %r32, %r33, %r34;
+ setp.ge.u32 %p1, %r1, %r31;
@%p1 bra BB99_17;
cvta.to.global.u64 %rd9, %rd7;
@@ -11167,8 +11164,8 @@ BB98_11:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r69, %f15;
- cvt.rn.f32.s32 %f16, %r69;
+ cvt.rni.s32.f32 %r70, %f15;
+ cvt.rn.f32.s32 %f16, %r70;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
@@ -11190,96 +11187,97 @@ BB99_11:
BB99_3:
mov.b32 %r3, %f1;
- shl.b32 %r36, %r3, 8;
- or.b32 %r4, %r36, -2147483648;
- mov.u32 %r63, 0;
+ shr.u32 %r4, %r3, 23;
+ shl.b32 %r37, %r3, 8;
+ or.b32 %r5, %r37, -2147483648;
+ mov.u32 %r64, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r62, -6;
+ mov.u32 %r63, -6;
mov.u64 %rd23, %rd1;
BB99_4:
.pragma "nounroll";
- ld.const.u32 %r39, [%rd22];
+ ld.const.u32 %r40, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r37, %r39, %r4, %r63;
- madc.hi.u32 %r63, %r39, %r4, 0;
+ mad.lo.cc.u32 %r38, %r40, %r5, %r64;
+ madc.hi.u32 %r64, %r40, %r5, 0;
}
// inline asm
- st.local.u32 [%rd23], %r37;
+ st.local.u32 [%rd23], %r38;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r62, %r62, 1;
- setp.ne.s32 %p4, %r62, 0;
+ add.s32 %r63, %r63, 1;
+ setp.ne.s32 %p4, %r63, 0;
@%p4 bra BB99_4;
- bfe.u32 %r42, %r3, 23, 8;
- add.s32 %r43, %r42, -128;
- shr.u32 %r44, %r43, 5;
- and.b32 %r9, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r63;
- bfe.u32 %r10, %r3, 23, 5;
- mov.u32 %r45, 6;
- sub.s32 %r46, %r45, %r44;
- mul.wide.s32 %rd14, %r46, 4;
+ and.b32 %r43, %r4, 255;
+ add.s32 %r44, %r43, -128;
+ shr.u32 %r45, %r44, 5;
+ and.b32 %r10, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r64;
+ mov.u32 %r46, 6;
+ sub.s32 %r47, %r46, %r45;
+ mul.wide.s32 %rd14, %r47, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r65, [%rd6];
- ld.local.u32 %r64, [%rd6+-4];
- setp.eq.s32 %p5, %r10, 0;
+ ld.local.u32 %r66, [%rd6];
+ ld.local.u32 %r65, [%rd6+-4];
+ and.b32 %r13, %r4, 31;
+ setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB99_7;
- mov.u32 %r47, 32;
- sub.s32 %r48, %r47, %r10;
- shr.u32 %r49, %r64, %r48;
- shl.b32 %r50, %r65, %r10;
- add.s32 %r65, %r49, %r50;
- ld.local.u32 %r51, [%rd6+-8];
- shr.u32 %r52, %r51, %r48;
- shl.b32 %r53, %r64, %r10;
- add.s32 %r64, %r52, %r53;
+ mov.u32 %r48, 32;
+ sub.s32 %r49, %r48, %r13;
+ shr.u32 %r50, %r65, %r49;
+ shl.b32 %r51, %r66, %r13;
+ add.s32 %r66, %r50, %r51;
+ ld.local.u32 %r52, [%rd6+-8];
+ shr.u32 %r53, %r52, %r49;
+ shl.b32 %r54, %r65, %r13;
+ add.s32 %r65, %r53, %r54;
BB99_7:
- shr.u32 %r54, %r64, 30;
- shl.b32 %r55, %r65, 2;
- add.s32 %r67, %r55, %r54;
- shl.b32 %r18, %r64, 2;
- shr.u32 %r56, %r67, 31;
- shr.u32 %r57, %r65, 30;
- add.s32 %r19, %r56, %r57;
- setp.eq.s32 %p6, %r56, 0;
+ shr.u32 %r55, %r65, 30;
+ shl.b32 %r56, %r66, 2;
+ add.s32 %r68, %r56, %r55;
+ shl.b32 %r19, %r65, 2;
+ shr.u32 %r57, %r68, 31;
+ shr.u32 %r58, %r66, 30;
+ add.s32 %r20, %r57, %r58;
+ setp.eq.s32 %p6, %r57, 0;
@%p6 bra BB99_8;
- not.b32 %r58, %r67;
- neg.s32 %r66, %r18;
- setp.eq.s32 %p7, %r18, 0;
- selp.u32 %r59, 1, 0, %p7;
- add.s32 %r67, %r59, %r58;
- xor.b32 %r68, %r9, -2147483648;
+ not.b32 %r59, %r68;
+ neg.s32 %r67, %r19;
+ setp.eq.s32 %p7, %r19, 0;
+ selp.u32 %r60, 1, 0, %p7;
+ add.s32 %r68, %r60, %r59;
+ xor.b32 %r69, %r10, -2147483648;
bra.uni BB99_10;
BB99_8:
- mov.u32 %r66, %r18;
- mov.u32 %r68, %r9;
+ mov.u32 %r67, %r19;
+ mov.u32 %r69, %r10;
BB99_10:
- cvt.u64.u32 %rd15, %r67;
+ cvt.u64.u32 %rd15, %r68;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r66;
+ cvt.u64.u32 %rd17, %r67;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
- setp.eq.s32 %p8, %r68, 0;
+ setp.eq.s32 %p8, %r69, 0;
selp.f32 %f35, %f22, %f23, %p8;
- setp.eq.s32 %p9, %r9, 0;
- neg.s32 %r60, %r19;
- selp.b32 %r69, %r19, %r60, %p9;
+ setp.eq.s32 %p9, %r10, 0;
+ neg.s32 %r61, %r20;
+ selp.b32 %r70, %r20, %r61, %p9;
BB99_12:
- add.s32 %r28, %r69, 1;
- and.b32 %r29, %r28, 1;
- setp.eq.s32 %p10, %r29, 0;
+ add.s32 %r29, %r70, 1;
+ and.b32 %r30, %r29, 1;
+ setp.eq.s32 %p10, %r30, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
@@ -11297,8 +11295,8 @@ BB99_14:
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
- and.b32 %r61, %r28, 2;
- setp.eq.s32 %p12, %r61, 0;
+ and.b32 %r62, %r29, 2;
+ setp.eq.s32 %p12, %r62, 0;
@%p12 bra BB99_16;
mov.f32 %f34, 0fBF800000;
@@ -11649,7 +11647,7 @@ BB102_9:
.reg .b64 %SPL;
.reg .pred %p<12>;
.reg .f32 %f<39>;
- .reg .b32 %r<68>;
+ .reg .b32 %r<69>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -11657,12 +11655,12 @@ BB102_9:
mov.u64 %SPL, __local_depot103;
ld.param.u64 %rd7, [matrix_tan_f_param_0];
ld.param.u64 %rd8, [matrix_tan_f_param_1];
- ld.param.u32 %r28, [matrix_tan_f_param_2];
- mov.u32 %r29, %ntid.x;
- mov.u32 %r30, %ctaid.x;
- mov.u32 %r31, %tid.x;
- mad.lo.s32 %r1, %r29, %r30, %r31;
- setp.ge.u32 %p1, %r1, %r28;
+ ld.param.u32 %r29, [matrix_tan_f_param_2];
+ mov.u32 %r30, %ntid.x;
+ mov.u32 %r31, %ctaid.x;
+ mov.u32 %r32, %tid.x;
+ mad.lo.s32 %r1, %r30, %r31, %r32;
+ setp.ge.u32 %p1, %r1, %r29;
@%p1 bra BB103_15;
cvta.to.global.u64 %rd9, %rd7;
@@ -11671,8 +11669,8 @@ BB102_9:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f10, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r67, %f10;
- cvt.rn.f32.s32 %f11, %r67;
+ cvt.rni.s32.f32 %r68, %f10;
+ cvt.rn.f32.s32 %f11, %r68;
mov.f32 %f12, 0fBFC90FDA;
fma.rn.f32 %f13, %f11, %f12, %f1;
mov.f32 %f14, 0fB3A22168;
@@ -11694,91 +11692,92 @@ BB103_11:
BB103_3:
mov.b32 %r3, %f1;
- shl.b32 %r34, %r3, 8;
- or.b32 %r4, %r34, -2147483648;
- mov.u32 %r61, 0;
+ shr.u32 %r4, %r3, 23;
+ shl.b32 %r35, %r3, 8;
+ or.b32 %r5, %r35, -2147483648;
+ mov.u32 %r62, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r60, -6;
+ mov.u32 %r61, -6;
mov.u64 %rd23, %rd1;
BB103_4:
.pragma "nounroll";
- ld.const.u32 %r37, [%rd22];
+ ld.const.u32 %r38, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r35, %r37, %r4, %r61;
- madc.hi.u32 %r61, %r37, %r4, 0;
+ mad.lo.cc.u32 %r36, %r38, %r5, %r62;
+ madc.hi.u32 %r62, %r38, %r5, 0;
}
// inline asm
- st.local.u32 [%rd23], %r35;
+ st.local.u32 [%rd23], %r36;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r60, %r60, 1;
- setp.ne.s32 %p4, %r60, 0;
+ add.s32 %r61, %r61, 1;
+ setp.ne.s32 %p4, %r61, 0;
@%p4 bra BB103_4;
- bfe.u32 %r40, %r3, 23, 8;
- add.s32 %r41, %r40, -128;
- shr.u32 %r42, %r41, 5;
- and.b32 %r9, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r61;
- bfe.u32 %r10, %r3, 23, 5;
- mov.u32 %r43, 6;
- sub.s32 %r44, %r43, %r42;
- mul.wide.s32 %rd14, %r44, 4;
+ and.b32 %r41, %r4, 255;
+ add.s32 %r42, %r41, -128;
+ shr.u32 %r43, %r42, 5;
+ and.b32 %r10, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r62;
+ mov.u32 %r44, 6;
+ sub.s32 %r45, %r44, %r43;
+ mul.wide.s32 %rd14, %r45, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r63, [%rd6];
- ld.local.u32 %r62, [%rd6+-4];
- setp.eq.s32 %p5, %r10, 0;
+ ld.local.u32 %r64, [%rd6];
+ ld.local.u32 %r63, [%rd6+-4];
+ and.b32 %r13, %r4, 31;
+ setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB103_7;
- mov.u32 %r45, 32;
- sub.s32 %r46, %r45, %r10;
- shr.u32 %r47, %r62, %r46;
- shl.b32 %r48, %r63, %r10;
- add.s32 %r63, %r47, %r48;
- ld.local.u32 %r49, [%rd6+-8];
- shr.u32 %r50, %r49, %r46;
- shl.b32 %r51, %r62, %r10;
- add.s32 %r62, %r50, %r51;
+ mov.u32 %r46, 32;
+ sub.s32 %r47, %r46, %r13;
+ shr.u32 %r48, %r63, %r47;
+ shl.b32 %r49, %r64, %r13;
+ add.s32 %r64, %r48, %r49;
+ ld.local.u32 %r50, [%rd6+-8];
+ shr.u32 %r51, %r50, %r47;
+ shl.b32 %r52, %r63, %r13;
+ add.s32 %r63, %r51, %r52;
BB103_7:
- shr.u32 %r52, %r62, 30;
- shl.b32 %r53, %r63, 2;
- add.s32 %r65, %r53, %r52;
- shl.b32 %r18, %r62, 2;
- shr.u32 %r54, %r65, 31;
- shr.u32 %r55, %r63, 30;
- add.s32 %r19, %r54, %r55;
- setp.eq.s32 %p6, %r54, 0;
+ shr.u32 %r53, %r63, 30;
+ shl.b32 %r54, %r64, 2;
+ add.s32 %r66, %r54, %r53;
+ shl.b32 %r19, %r63, 2;
+ shr.u32 %r55, %r66, 31;
+ shr.u32 %r56, %r64, 30;
+ add.s32 %r20, %r55, %r56;
+ setp.eq.s32 %p6, %r55, 0;
@%p6 bra BB103_8;
- not.b32 %r56, %r65;
- neg.s32 %r64, %r18;
- setp.eq.s32 %p7, %r18, 0;
- selp.u32 %r57, 1, 0, %p7;
- add.s32 %r65, %r57, %r56;
- xor.b32 %r66, %r9, -2147483648;
+ not.b32 %r57, %r66;
+ neg.s32 %r65, %r19;
+ setp.eq.s32 %p7, %r19, 0;
+ selp.u32 %r58, 1, 0, %p7;
+ add.s32 %r66, %r58, %r57;
+ xor.b32 %r67, %r10, -2147483648;
bra.uni BB103_10;
BB103_8:
- mov.u32 %r64, %r18;
- mov.u32 %r66, %r9;
+ mov.u32 %r65, %r19;
+ mov.u32 %r67, %r10;
BB103_10:
- cvt.u64.u32 %rd15, %r65;
+ cvt.u64.u32 %rd15, %r66;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r64;
+ cvt.u64.u32 %rd17, %r65;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f17, %fd2;
neg.f32 %f18, %f17;
- setp.eq.s32 %p8, %r66, 0;
+ setp.eq.s32 %p8, %r67, 0;
selp.f32 %f37, %f17, %f18, %p8;
- setp.eq.s32 %p9, %r9, 0;
- neg.s32 %r58, %r19;
- selp.b32 %r67, %r19, %r58, %p9;
+ setp.eq.s32 %p9, %r10, 0;
+ neg.s32 %r59, %r20;
+ selp.b32 %r68, %r20, %r59, %p9;
BB103_12:
mul.f32 %f20, %f37, %f37;
@@ -11798,8 +11797,8 @@ BB103_12:
abs.f32 %f34, %f37;
setp.eq.f32 %p10, %f34, 0f3A00B43C;
selp.f32 %f38, %f37, %f33, %p10;
- and.b32 %r59, %r67, 1;
- setp.eq.b32 %p11, %r59, 1;
+ and.b32 %r60, %r68, 1;
+ setp.eq.b32 %p11, %r60, 1;
@!%p11 bra BB103_14;
bra.uni BB103_13;
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
index 85b9670..31038f5 100644
--- a/src/main/cuda/kernels/reduction.ptx
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -19,19 +19,336 @@
.extern .shared .align 1 .b8 memory[];
.global .align 1 .b8 $str[78] = {69, 82, 82, 79, 82, 58, 32, 110, 111, 32, 99, 111, 108, 117, 109, 110, 32, 105, 110, 100, 105, 99, 101, 115, 32, 97, 114, 114, 97, 121, 32, 105, 110, 32, 97, 32, 100, 101, 110, 115, 101, 32, 109, 97, 116, 114, 105, 120, 33, 32, 84, 104, 105, 115, 32, 119, 105, 108, 108, 32, 108, 105, 107, 101, 108, 121, 32, 99, 114, 97, 115, 104, 32, 58, 45, 47, 10, 0};
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9len_denseEv(
+ .param .b64 _ZN14MatrixAccessorIfE9len_denseEv_param_0
+)
+{
+ .reg .b32 %r<4>;
+ .reg .b64 %rd<3>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE9len_denseEv_param_0];
+ ld.u64 %rd2, [%rd1];
+ ld.u32 %r1, [%rd2+4];
+ ld.u32 %r2, [%rd2+8];
+ mul.lo.s32 %r3, %r2, %r1;
+ st.param.b32 [func_retval0+0], %r3;
+ ret;
+}
+
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9pos_denseEj(
+ .param .b64 _ZN14MatrixAccessorIfE9pos_denseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE9pos_denseEj_param_1
+)
+{
+ .reg .b32 %r<4>;
+ .reg .b64 %rd<3>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u32 %r2, [%rd2+8];
+ mul.lo.s32 %r3, %r2, %r1;
+ st.param.b32 [func_retval0+0], %r3;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10cols_denseEj(
+ .param .b64 _ZN14MatrixAccessorIfE10cols_denseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE10cols_denseEj_param_1
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<4>;
+
+
+ mov.u64 %rd1, $str;
+ cvta.global.u64 %rd2, %rd1;
+ mov.u64 %rd3, 0;
+ // Callseq Start 0
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd2;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd3;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r1, [retval0+0];
+
+ //{
+ }// Callseq End 0
+ st.param.b64 [func_retval0+0], %rd3;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_dense_rcEjj(
+ .param .b64 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1,
+ .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2
+)
+{
+ .reg .b32 %r<5>;
+ .reg .b64 %rd<6>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1];
+ ld.param.u32 %r2, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ ld.u32 %r3, [%rd2+8];
+ mad.lo.s32 %r4, %r3, %r1, %r2;
+ mul.wide.u32 %rd4, %r4, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ st.param.b64 [func_retval0+0], %rd5;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10vals_denseEj(
+ .param .b64 _ZN14MatrixAccessorIfE10vals_denseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE10vals_denseEj_param_1
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<6>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ mul.wide.u32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ st.param.b64 [func_retval0+0], %rd5;
+ ret;
+}
+
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE13row_len_denseEj(
+ .param .b64 _ZN14MatrixAccessorIfE13row_len_denseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE13row_len_denseEj_param_1
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<3>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE13row_len_denseEj_param_0];
+ ld.u64 %rd2, [%rd1];
+ ld.u32 %r1, [%rd2+4];
+ st.param.b32 [func_retval0+0], %r1;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11val_dense_iEj(
+ .param .b64 _ZN14MatrixAccessorIfE11val_dense_iEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE11val_dense_iEj_param_1
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<6>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ mul.wide.u32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ st.param.b64 [func_retval0+0], %rd5;
+ ret;
+}
+
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10len_sparseEv(
+ .param .b64 _ZN14MatrixAccessorIfE10len_sparseEv_param_0
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<3>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10len_sparseEv_param_0];
+ ld.u64 %rd2, [%rd1];
+ ld.u32 %r1, [%rd2];
+ st.param.b32 [func_retval0+0], %r1;
+ ret;
+}
+
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10pos_sparseEj(
+ .param .b64 _ZN14MatrixAccessorIfE10pos_sparseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE10pos_sparseEj_param_1
+)
+{
+ .reg .b32 %r<3>;
+ .reg .b64 %rd<6>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+16];
+ mul.wide.u32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.u32 %r2, [%rd5];
+ st.param.b32 [func_retval0+0], %r2;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11cols_sparseEj(
+ .param .b64 _ZN14MatrixAccessorIfE11cols_sparseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE11cols_sparseEj_param_1
+)
+{
+ .reg .b32 %r<3>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+24];
+ ld.u64 %rd4, [%rd2+16];
+ mul.wide.u32 %rd5, %r1, 4;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.u32 %r2, [%rd6];
+ mul.wide.u32 %rd7, %r2, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ st.param.b64 [func_retval0+0], %rd8;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE13val_sparse_rcEjj(
+ .param .b64 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_1,
+ .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_2
+)
+{
+ .reg .b64 %rd<4>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ st.param.b64 [func_retval0+0], %rd3;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11vals_sparseEj(
+ .param .b64 _ZN14MatrixAccessorIfE11vals_sparseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE11vals_sparseEj_param_1
+)
+{
+ .reg .b32 %r<3>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ ld.u64 %rd4, [%rd2+16];
+ mul.wide.u32 %rd5, %r1, 4;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.u32 %r2, [%rd6];
+ mul.wide.u32 %rd7, %r2, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ st.param.b64 [func_retval0+0], %rd8;
+ ret;
+}
+
+.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE14row_len_sparseEj(
+ .param .b64 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_1
+)
+{
+ .reg .b32 %r<6>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE14row_len_sparseEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE14row_len_sparseEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+16];
+ add.s32 %r2, %r1, 1;
+ mul.wide.u32 %rd4, %r2, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.u32 %r3, [%rd5];
+ mul.wide.u32 %rd6, %r1, 4;
+ add.s64 %rd7, %rd3, %rd6;
+ ld.u32 %r4, [%rd7];
+ sub.s32 %r5, %r3, %r4;
+ st.param.b32 [func_retval0+0], %r5;
+ ret;
+}
+
+.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_sparse_iEj(
+ .param .b64 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_1
+)
+{
+ .reg .b32 %r<2>;
+ .reg .b64 %rd<6>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_1];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ mul.wide.u32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ st.param.b64 [func_retval0+0], %rd5;
+ ret;
+}
+
+.func _ZN14MatrixAccessorIfE10set_sparseEjjf(
+ .param .b64 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_0,
+ .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_1,
+ .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_2,
+ .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_3
+)
+{
+ .reg .f32 %f<2>;
+ .reg .b32 %r<3>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_0];
+ ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_1];
+ ld.param.u32 %r2, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_2];
+ ld.param.f32 %f1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_3];
+ ld.u64 %rd2, [%rd1];
+ ld.u64 %rd3, [%rd2+32];
+ mul.wide.u32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ st.f32 [%rd5], %f1;
+ ld.u64 %rd6, [%rd1];
+ ld.u64 %rd7, [%rd6+24];
+ add.s64 %rd8, %rd7, %rd4;
+ st.u32 [%rd8], %r2;
+ ret;
+}
+
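
For orientation: the .func bodies added above are the MatrixAccessor<float>
members (the mangled names decode to e.g. _ZN14MatrixAccessorIfE9len_denseEv =
MatrixAccessor<float>::len_dense()), mirroring the existing
MatrixAccessor<double> functions that the rest of this hunk re-numbers. Reading
the field offsets out of the loads (rows at +4, cols at +8, row pointers at
+16, column indices at +24, values at +32), the accessor plausibly sits over a
matrix descriptor shaped roughly like the sketch below. This is a
reconstruction inferred from the PTX for the reader's benefit, not the actual
SystemDS header:

    // Field layout guessed from the ld.u32/ld.u64 offsets in the PTX above.
    template<typename T>
    struct Matrix {
        unsigned int nnz;       // +0: nonzeros (len_sparse reads this)
        unsigned int rows;      // +4
        unsigned int cols;      // +8
        unsigned int *row_ptr;  // +16: CSR row pointers
        unsigned int *col_idx;  // +24: CSR column indices
        T *data;                // +32: dense or sparse value array
    };

    template<typename T>
    struct MatrixAccessor {
        Matrix<T> *mat;

        // Dense variants:
        __device__ unsigned int len_dense()               { return mat->rows * mat->cols; }
        __device__ unsigned int pos_dense(unsigned int r) { return mat->cols * r; }
        __device__ T *vals_dense(unsigned int i)          { return &mat->data[i]; }

        // Sparse (CSR) variants:
        __device__ unsigned int len_sparse()               { return mat->nnz; }
        __device__ unsigned int pos_sparse(unsigned int r) { return mat->row_ptr[r]; }
        __device__ unsigned int row_len_sparse(unsigned int r) {
            return mat->row_ptr[r + 1] - mat->row_ptr[r];
        }
    };

The reduce kernels below then store the dense or sparse member addresses into a
per-thread function table (the st.local.u64 sequences in reduce_sum_f),
selecting the sparse set when the descriptor's word at offset +16 is non-null.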
.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE9len_denseEv(
.param .b64 _ZN14MatrixAccessorIdE9len_denseEv_param_0
)
{
- .reg .b32 %r<6>;
+ .reg .b32 %r<4>;
.reg .b64 %rd<3>;
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE9len_denseEv_param_0];
ld.u64 %rd2, [%rd1];
- ld.v2.u32 {%r1, %r2}, [%rd2+24];
- mul.lo.s32 %r5, %r2, %r1;
- st.param.b32 [func_retval0+0], %r5;
+ ld.u32 %r1, [%rd2+4];
+ ld.u32 %r2, [%rd2+8];
+ mul.lo.s32 %r3, %r2, %r1;
+ st.param.b32 [func_retval0+0], %r3;
ret;
}
@@ -47,7 +364,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u32 %r2, [%rd2+28];
+ ld.u32 %r2, [%rd2+8];
mul.lo.s32 %r3, %r2, %r1;
st.param.b32 [func_retval0+0], %r3;
ret;
@@ -65,7 +382,7 @@
mov.u64 %rd1, $str;
cvta.global.u64 %rd2, %rd1;
mov.u64 %rd3, 0;
- // Callseq Start 0
+ // Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
@@ -83,7 +400,7 @@
ld.param.b32 %r1, [retval0+0];
//{
- }// Callseq End 0
+ }// Callseq End 1
st.param.b64 [func_retval0+0], %rd3;
ret;
}
@@ -102,8 +419,8 @@
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_1];
ld.param.u32 %r2, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_2];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
- ld.u32 %r3, [%rd2+28];
+ ld.u64 %rd3, [%rd2+32];
+ ld.u32 %r3, [%rd2+8];
mad.lo.s32 %r4, %r3, %r1, %r2;
mul.wide.u32 %rd4, %r4, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -123,7 +440,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
+ ld.u64 %rd3, [%rd2+32];
mul.wide.u32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
st.param.b64 [func_retval0+0], %rd5;
@@ -141,7 +458,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE13row_len_denseEj_param_0];
ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+24];
+ ld.u32 %r1, [%rd2+4];
st.param.b32 [func_retval0+0], %r1;
ret;
}
@@ -158,7 +475,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
+ ld.u64 %rd3, [%rd2+32];
mul.wide.u32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
st.param.b64 [func_retval0+0], %rd5;
@@ -175,7 +492,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10len_sparseEv_param_0];
ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+32];
+ ld.u32 %r1, [%rd2];
st.param.b32 [func_retval0+0], %r1;
ret;
}
@@ -192,7 +509,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+8];
+ ld.u64 %rd3, [%rd2+16];
mul.wide.u32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.u32 %r2, [%rd5];
@@ -212,8 +529,8 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+16];
- ld.u64 %rd4, [%rd2+8];
+ ld.u64 %rd3, [%rd2+24];
+ ld.u64 %rd4, [%rd2+16];
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.u32 %r2, [%rd6];
@@ -234,7 +551,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_0];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
+ ld.u64 %rd3, [%rd2+32];
st.param.b64 [func_retval0+0], %rd3;
ret;
}
@@ -251,8 +568,8 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
- ld.u64 %rd4, [%rd2+8];
+ ld.u64 %rd3, [%rd2+32];
+ ld.u64 %rd4, [%rd2+16];
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.u32 %r2, [%rd6];
@@ -274,7 +591,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE14row_len_sparseEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE14row_len_sparseEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+8];
+ ld.u64 %rd3, [%rd2+16];
add.s32 %r2, %r1, 1;
mul.wide.u32 %rd4, %r2, 4;
add.s64 %rd5, %rd3, %rd4;
@@ -299,7 +616,7 @@
ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_0];
ld.param.u32 %r1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_1];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
+ ld.u64 %rd3, [%rd2+32];
mul.wide.u32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
st.param.b64 [func_retval0+0], %rd5;
@@ -323,102 +640,1314 @@
ld.param.u32 %r2, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_2];
ld.param.f64 %fd1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_3];
ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2];
+ ld.u64 %rd3, [%rd2+32];
mul.wide.u32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
st.f64 [%rd5], %fd1;
ld.u64 %rd6, [%rd1];
- ld.u64 %rd7, [%rd6+16];
+ ld.u64 %rd7, [%rd6+24];
mul.wide.u32 %rd8, %r1, 4;
add.s64 %rd9, %rd7, %rd8;
st.u32 [%rd9], %r2;
ret;
}
- // .globl double2float_f
-.visible .entry double2float_f(
- .param .u64 double2float_f_param_0,
- .param .u64 double2float_f_param_1,
- .param .u32 double2float_f_param_2
-)
-{
- .reg .pred %p<2>;
- .reg .f32 %f<2>;
- .reg .b32 %r<6>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<9>;
+ // .globl double2float_f
+.visible .entry double2float_f(
+ .param .u64 double2float_f_param_0,
+ .param .u64 double2float_f_param_1,
+ .param .u32 double2float_f_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .f32 %f<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [double2float_f_param_0];
+ ld.param.u64 %rd2, [double2float_f_param_1];
+ ld.param.u32 %r2, [double2float_f_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.s32 %p1, %r1, %r2;
+ @%p1 bra BB30_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ cvt.rn.f32.f64 %f1, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.s32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f1;
+
+BB30_2:
+ ret;
+}
+
+ // .globl float2double_f
+.visible .entry float2double_f(
+ .param .u64 float2double_f_param_0,
+ .param .u64 float2double_f_param_1,
+ .param .u32 float2double_f_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .f32 %f<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [float2double_f_param_0];
+ ld.param.u64 %rd2, [float2double_f_param_1];
+ ld.param.u32 %r2, [float2double_f_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.s32 %p1, %r1, %r2;
+ @%p1 bra BB31_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f1, [%rd5];
+ cvt.f64.f32 %fd1, %f1;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.s32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd1;
+
+BB31_2:
+ ret;
+}
+
+ // .globl reduce_sum_f
+.visible .entry reduce_sum_f(
+ .param .u64 reduce_sum_f_param_0,
+ .param .u64 reduce_sum_f_param_1,
+ .param .u32 reduce_sum_f_param_2
+)
+{
+ .local .align 8 .b8 __local_depot32[272];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<25>;
+ .reg .f32 %f<60>;
+ .reg .b32 %r<44>;
+ .reg .b64 %rd<123>;
+
+
+ mov.u64 %SPL, __local_depot32;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd17, [reduce_sum_f_param_0];
+ ld.param.u64 %rd16, [reduce_sum_f_param_1];
+ ld.param.u32 %r5, [reduce_sum_f_param_2];
+ add.u64 %rd18, %SP, 0;
+ add.u64 %rd1, %SPL, 0;
+ st.local.u64 [%rd1], %rd17;
+ cvta.to.global.u64 %rd19, %rd17;
+ ld.global.u64 %rd20, [%rd19+16];
+ setp.eq.s64 %p1, %rd20, 0;
+ @%p1 bra BB32_2;
+
+ mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
+ st.local.u64 [%rd1+8], %rd21;
+ mov.u64 %rd23, 0;
+ st.local.u64 [%rd1+16], %rd23;
+ mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
+ st.local.u64 [%rd1+40], %rd24;
+ st.local.u64 [%rd1+48], %rd23;
+ mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
+ st.local.u64 [%rd1+56], %rd26;
+ st.local.u64 [%rd1+64], %rd23;
+ mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
+ st.local.u64 [%rd1+88], %rd28;
+ st.local.u64 [%rd1+96], %rd23;
+ mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
+ st.local.u64 [%rd1+104], %rd30;
+ st.local.u64 [%rd1+112], %rd23;
+ mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
+ st.local.u64 [%rd1+24], %rd32;
+ st.local.u64 [%rd1+32], %rd23;
+ mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
+ st.local.u64 [%rd1+72], %rd34;
+ st.local.u64 [%rd1+80], %rd23;
+ mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
+ st.local.u64 [%rd1+120], %rd36;
+ st.local.u64 [%rd1+128], %rd23;
+ bra.uni BB32_3;
+
+BB32_2:
+ mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
+ st.local.u64 [%rd1+8], %rd38;
+ mov.u64 %rd40, 0;
+ st.local.u64 [%rd1+16], %rd40;
+ mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
+ st.local.u64 [%rd1+40], %rd41;
+ st.local.u64 [%rd1+48], %rd40;
+ mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
+ st.local.u64 [%rd1+56], %rd43;
+ st.local.u64 [%rd1+64], %rd40;
+ mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
+ st.local.u64 [%rd1+88], %rd45;
+ st.local.u64 [%rd1+96], %rd40;
+ mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
+ st.local.u64 [%rd1+104], %rd47;
+ st.local.u64 [%rd1+112], %rd40;
+ mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
+ st.local.u64 [%rd1+24], %rd49;
+ st.local.u64 [%rd1+32], %rd40;
+ mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
+ st.local.u64 [%rd1+72], %rd51;
+ st.local.u64 [%rd1+80], %rd40;
+
+BB32_3:
+ add.u64 %rd53, %SP, 136;
+ add.u64 %rd2, %SPL, 136;
+ st.local.u64 [%rd2], %rd16;
+ cvta.to.global.u64 %rd54, %rd16;
+ ld.global.u64 %rd55, [%rd54+16];
+ setp.eq.s64 %p2, %rd55, 0;
+ @%p2 bra BB32_5;
+
+ mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
+ st.local.u64 [%rd2+8], %rd56;
+ mov.u64 %rd58, 0;
+ st.local.u64 [%rd2+16], %rd58;
+ mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
+ st.local.u64 [%rd2+40], %rd59;
+ st.local.u64 [%rd2+48], %rd58;
+ mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
+ st.local.u64 [%rd2+56], %rd61;
+ st.local.u64 [%rd2+64], %rd58;
+ mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
+ st.local.u64 [%rd2+88], %rd63;
+ st.local.u64 [%rd2+96], %rd58;
+ mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
+ st.local.u64 [%rd2+104], %rd65;
+ st.local.u64 [%rd2+112], %rd58;
+ mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
+ st.local.u64 [%rd2+24], %rd67;
+ st.local.u64 [%rd2+32], %rd58;
+ mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
+ st.local.u64 [%rd2+72], %rd69;
+ st.local.u64 [%rd2+80], %rd58;
+ mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
+ st.local.u64 [%rd2+120], %rd71;
+ st.local.u64 [%rd2+128], %rd58;
+ bra.uni BB32_6;
+
+BB32_5:
+ mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
+ st.local.u64 [%rd2+8], %rd73;
+ mov.u64 %rd75, 0;
+ st.local.u64 [%rd2+16], %rd75;
+ mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
+ st.local.u64 [%rd2+40], %rd76;
+ st.local.u64 [%rd2+48], %rd75;
+ mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
+ st.local.u64 [%rd2+56], %rd78;
+ st.local.u64 [%rd2+64], %rd75;
+ mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
+ st.local.u64 [%rd2+88], %rd80;
+ st.local.u64 [%rd2+96], %rd75;
+ mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
+ st.local.u64 [%rd2+104], %rd82;
+ st.local.u64 [%rd2+112], %rd75;
+ mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
+ st.local.u64 [%rd2+24], %rd84;
+ st.local.u64 [%rd2+32], %rd75;
+ mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
+ st.local.u64 [%rd2+72], %rd86;
+ st.local.u64 [%rd2+80], %rd75;
+
+BB32_6:
+ mov.u32 %r6, %tid.x;
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mad.lo.s32 %r43, %r8, %r9, %r6;
+ mov.f32 %f44, 0f00000000;
+ setp.ge.u32 %p3, %r43, %r5;
+ @%p3 bra BB32_15;
+
+ mov.f32 %f44, 0f00000000;
+
+BB32_8:
+ ld.local.u64 %rd3, [%rd1+112];
+ ld.local.u64 %rd120, [%rd1+104];
+ and.b64 %rd90, %rd120, 1;
+ setp.eq.b64 %p4, %rd90, 1;
+ @!%p4 bra BB32_10;
+ bra.uni BB32_9;
+
+BB32_9:
+ add.s64 %rd93, %rd1, %rd3;
+ ld.local.u64 %rd94, [%rd93];
+ add.s64 %rd95, %rd120, %rd94;
+ ld.u64 %rd120, [%rd95+-1];
+
+BB32_10:
+ add.s64 %rd97, %rd18, %rd3;
+ // Callseq Start 2
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd97;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r43;
+ .param .b64 retval0;
+ prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd120,
+ (
+ param0,
+ param1
+ )
+ , prototype_2;
+ ld.param.b64 %rd99, [retval0+0];
+
+ //{
+ }// Callseq End 2
+ ld.f32 %f31, [%rd99];
+ add.f32 %f44, %f44, %f31;
+ add.s32 %r16, %r43, %r9;
+ setp.ge.u32 %p5, %r16, %r5;
+ @%p5 bra BB32_14;
+
+ ld.local.u64 %rd121, [%rd1+104];
+ and.b64 %rd102, %rd121, 1;
+ setp.eq.b64 %p6, %rd102, 1;
+ ld.local.u64 %rd8, [%rd1+112];
+ @!%p6 bra BB32_13;
+ bra.uni BB32_12;
+
+BB32_12:
+ add.s64 %rd105, %rd1, %rd8;
+ ld.local.u64 %rd106, [%rd105];
+ add.s64 %rd107, %rd121, %rd106;
+ ld.u64 %rd121, [%rd107+-1];
+
+BB32_13:
+ add.s64 %rd109, %rd18, %rd8;
+ // Callseq Start 3
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd109;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r16;
+ .param .b64 retval0;
+ prototype_3 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd121,
+ (
+ param0,
+ param1
+ )
+ , prototype_3;
+ ld.param.b64 %rd111, [retval0+0];
+
+ //{
+ }// Callseq End 3
+ ld.f32 %f32, [%rd111];
+ add.f32 %f44, %f44, %f32;
+
+BB32_14:
+ shl.b32 %r20, %r9, 1;
+ mov.u32 %r21, %nctaid.x;
+ mad.lo.s32 %r43, %r20, %r21, %r43;
+ setp.lt.u32 %p7, %r43, %r5;
+ @%p7 bra BB32_8;
+
+BB32_15:
+ shl.b32 %r23, %r6, 2;
+ mov.u32 %r24, memory;
+ add.s32 %r4, %r24, %r23;
+ st.shared.f32 [%r4], %f44;
+ bar.sync 0;
+ setp.lt.u32 %p8, %r9, 1024;
+ @%p8 bra BB32_19;
+
+ setp.gt.u32 %p9, %r6, 511;
+ @%p9 bra BB32_18;
+
+ ld.shared.f32 %f33, [%r4+2048];
+ add.f32 %f44, %f44, %f33;
+ st.shared.f32 [%r4], %f44;
+
+BB32_18:
+ bar.sync 0;
+
+BB32_19:
+ setp.lt.u32 %p10, %r9, 512;
+ @%p10 bra BB32_23;
+
+ setp.gt.u32 %p11, %r6, 255;
+ @%p11 bra BB32_22;
+
+ ld.shared.f32 %f34, [%r4+1024];
+ add.f32 %f44, %f44, %f34;
+ st.shared.f32 [%r4], %f44;
+
+BB32_22:
+ bar.sync 0;
+
+BB32_23:
+ setp.lt.u32 %p12, %r9, 256;
+ @%p12 bra BB32_27;
+
+ setp.gt.u32 %p13, %r6, 127;
+ @%p13 bra BB32_26;
+
+ ld.shared.f32 %f35, [%r4+512];
+ add.f32 %f44, %f44, %f35;
+ st.shared.f32 [%r4], %f44;
+
+BB32_26:
+ bar.sync 0;
+
+BB32_27:
+ setp.lt.u32 %p14, %r9, 128;
+ @%p14 bra BB32_31;
+
+ setp.gt.u32 %p15, %r6, 63;
+ @%p15 bra BB32_30;
+
+ ld.shared.f32 %f36, [%r4+256];
+ add.f32 %f44, %f44, %f36;
+ st.shared.f32 [%r4], %f44;
+
+BB32_30:
+ bar.sync 0;
+
+BB32_31:
+ setp.gt.u32 %p16, %r6, 31;
+ @%p16 bra BB32_44;
+
+ setp.lt.u32 %p17, %r9, 64;
+ @%p17 bra BB32_34;
+
+ ld.volatile.shared.f32 %f37, [%r4+128];
+ add.f32 %f44, %f44, %f37;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB32_34:
+ setp.lt.u32 %p18, %r9, 32;
+ @%p18 bra BB32_36;
+
+ ld.volatile.shared.f32 %f38, [%r4+64];
+ add.f32 %f44, %f44, %f38;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB32_36:
+ setp.lt.u32 %p19, %r9, 16;
+ @%p19 bra BB32_38;
+
+ ld.volatile.shared.f32 %f39, [%r4+32];
+ add.f32 %f44, %f44, %f39;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB32_38:
+ setp.lt.u32 %p20, %r9, 8;
+ @%p20 bra BB32_40;
+
+ ld.volatile.shared.f32 %f40, [%r4+16];
+ add.f32 %f44, %f44, %f40;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB32_40:
+ setp.lt.u32 %p21, %r9, 4;
+ @%p21 bra BB32_42;
+
+ ld.volatile.shared.f32 %f41, [%r4+8];
+ add.f32 %f44, %f44, %f41;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB32_42:
+ setp.lt.u32 %p22, %r9, 2;
+ @%p22 bra BB32_44;
+
+ ld.volatile.shared.f32 %f42, [%r4+4];
+ add.f32 %f43, %f44, %f42;
+ st.volatile.shared.f32 [%r4], %f43;
+
+BB32_44:
+ setp.ne.s32 %p23, %r6, 0;
+ @%p23 bra BB32_48;
+
+ ld.shared.f32 %f28, [memory];
+ ld.local.u64 %rd114, [%rd2+96];
+ add.s64 %rd11, %rd2, %rd114;
+ add.s64 %rd12, %rd53, %rd114;
+ ld.local.u64 %rd122, [%rd2+88];
+ and.b64 %rd115, %rd122, 1;
+ setp.eq.b64 %p24, %rd115, 1;
+ @!%p24 bra BB32_47;
+ bra.uni BB32_46;
+
+BB32_46:
+ ld.local.u64 %rd116, [%rd11];
+ add.s64 %rd117, %rd122, %rd116;
+ ld.u64 %rd122, [%rd117+-1];
+
+BB32_47:
+ mov.u32 %r42, 0;
+ // Callseq Start 4
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r42;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r7;
+ .param .b64 retval0;
+ prototype_4 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+ call (retval0),
+ %rd122,
+ (
+ param0,
+ param1,
+ param2
+ )
+ , prototype_4;
+ ld.param.b64 %rd119, [retval0+0];
+
+ //{
+ }// Callseq End 4
+ st.f32 [%rd119], %f28;
+
+BB32_48:
+ ret;
+}
+
+ // .globl reduce_sum_d
+.visible .entry reduce_sum_d(
+ .param .u64 reduce_sum_d_param_0,
+ .param .u64 reduce_sum_d_param_1,
+ .param .u32 reduce_sum_d_param_2
+)
+{
+ .local .align 8 .b8 __local_depot33[272];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<25>;
+ .reg .b32 %r<44>;
+ .reg .f64 %fd<60>;
+ .reg .b64 %rd<123>;
+
+
+ mov.u64 %SPL, __local_depot33;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd17, [reduce_sum_d_param_0];
+ ld.param.u64 %rd16, [reduce_sum_d_param_1];
+ ld.param.u32 %r5, [reduce_sum_d_param_2];
+ add.u64 %rd18, %SP, 0;
+ add.u64 %rd1, %SPL, 0;
+ st.local.u64 [%rd1], %rd17;
+ cvta.to.global.u64 %rd19, %rd17;
+ ld.global.u64 %rd20, [%rd19+16];
+ setp.eq.s64 %p1, %rd20, 0;
+ @%p1 bra BB33_2;
+
+ mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
+ st.local.u64 [%rd1+8], %rd21;
+ mov.u64 %rd23, 0;
+ st.local.u64 [%rd1+16], %rd23;
+ mov.u64 %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
+ st.local.u64 [%rd1+40], %rd24;
+ st.local.u64 [%rd1+48], %rd23;
+ mov.u64 %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
+ st.local.u64 [%rd1+56], %rd26;
+ st.local.u64 [%rd1+64], %rd23;
+ mov.u64 %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
+ st.local.u64 [%rd1+88], %rd28;
+ st.local.u64 [%rd1+96], %rd23;
+ mov.u64 %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
+ st.local.u64 [%rd1+104], %rd30;
+ st.local.u64 [%rd1+112], %rd23;
+ mov.u64 %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
+ st.local.u64 [%rd1+24], %rd32;
+ st.local.u64 [%rd1+32], %rd23;
+ mov.u64 %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
+ st.local.u64 [%rd1+72], %rd34;
+ st.local.u64 [%rd1+80], %rd23;
+ mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
+ st.local.u64 [%rd1+120], %rd36;
+ st.local.u64 [%rd1+128], %rd23;
+ bra.uni BB33_3;
+
+BB33_2:
+ mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
+ st.local.u64 [%rd1+8], %rd38;
+ mov.u64 %rd40, 0;
+ st.local.u64 [%rd1+16], %rd40;
+ mov.u64 %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
+ st.local.u64 [%rd1+40], %rd41;
+ st.local.u64 [%rd1+48], %rd40;
+ mov.u64 %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
+ st.local.u64 [%rd1+56], %rd43;
+ st.local.u64 [%rd1+64], %rd40;
+ mov.u64 %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
+ st.local.u64 [%rd1+88], %rd45;
+ st.local.u64 [%rd1+96], %rd40;
+ mov.u64 %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
+ st.local.u64 [%rd1+104], %rd47;
+ st.local.u64 [%rd1+112], %rd40;
+ mov.u64 %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
+ st.local.u64 [%rd1+24], %rd49;
+ st.local.u64 [%rd1+32], %rd40;
+ mov.u64 %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
+ st.local.u64 [%rd1+72], %rd51;
+ st.local.u64 [%rd1+80], %rd40;
+
+BB33_3:
+ add.u64 %rd53, %SP, 136;
+ add.u64 %rd2, %SPL, 136;
+ st.local.u64 [%rd2], %rd16;
+ cvta.to.global.u64 %rd54, %rd16;
+ ld.global.u64 %rd55, [%rd54+16];
+ setp.eq.s64 %p2, %rd55, 0;
+ @%p2 bra BB33_5;
+
+ mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
+ st.local.u64 [%rd2+8], %rd56;
+ mov.u64 %rd58, 0;
+ st.local.u64 [%rd2+16], %rd58;
+ mov.u64 %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
+ st.local.u64 [%rd2+40], %rd59;
+ st.local.u64 [%rd2+48], %rd58;
+ mov.u64 %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
+ st.local.u64 [%rd2+56], %rd61;
+ st.local.u64 [%rd2+64], %rd58;
+ mov.u64 %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
+ st.local.u64 [%rd2+88], %rd63;
+ st.local.u64 [%rd2+96], %rd58;
+ mov.u64 %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
+ st.local.u64 [%rd2+104], %rd65;
+ st.local.u64 [%rd2+112], %rd58;
+ mov.u64 %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
+ st.local.u64 [%rd2+24], %rd67;
+ st.local.u64 [%rd2+32], %rd58;
+ mov.u64 %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
+ st.local.u64 [%rd2+72], %rd69;
+ st.local.u64 [%rd2+80], %rd58;
+ mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
+ st.local.u64 [%rd2+120], %rd71;
+ st.local.u64 [%rd2+128], %rd58;
+ bra.uni BB33_6;
+
+BB33_5:
+ mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
+ st.local.u64 [%rd2+8], %rd73;
+ mov.u64 %rd75, 0;
+ st.local.u64 [%rd2+16], %rd75;
+ mov.u64 %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
+ st.local.u64 [%rd2+40], %rd76;
+ st.local.u64 [%rd2+48], %rd75;
+ mov.u64 %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
+ st.local.u64 [%rd2+56], %rd78;
+ st.local.u64 [%rd2+64], %rd75;
+ mov.u64 %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
+ st.local.u64 [%rd2+88], %rd80;
+ st.local.u64 [%rd2+96], %rd75;
+ mov.u64 %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
+ st.local.u64 [%rd2+104], %rd82;
+ st.local.u64 [%rd2+112], %rd75;
+ mov.u64 %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
+ st.local.u64 [%rd2+24], %rd84;
+ st.local.u64 [%rd2+32], %rd75;
+ mov.u64 %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
+ st.local.u64 [%rd2+72], %rd86;
+ st.local.u64 [%rd2+80], %rd75;
+
+BB33_6:
+ mov.u32 %r6, %tid.x;
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mad.lo.s32 %r43, %r8, %r9, %r6;
+ mov.f64 %fd44, 0d0000000000000000;
+ setp.ge.u32 %p3, %r43, %r5;
+ @%p3 bra BB33_15;
+
+ mov.f64 %fd44, 0d0000000000000000;
+
+BB33_8:
+ ld.local.u64 %rd3, [%rd1+112];
+ ld.local.u64 %rd120, [%rd1+104];
+ and.b64 %rd90, %rd120, 1;
+ setp.eq.b64 %p4, %rd90, 1;
+ @!%p4 bra BB33_10;
+ bra.uni BB33_9;
+
+BB33_9:
+ add.s64 %rd93, %rd1, %rd3;
+ ld.local.u64 %rd94, [%rd93];
+ add.s64 %rd95, %rd120, %rd94;
+ ld.u64 %rd120, [%rd95+-1];
+
+BB33_10:
+ add.s64 %rd97, %rd18, %rd3;
+ // Callseq Start 5
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd97;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r43;
+ .param .b64 retval0;
+ prototype_5 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd120,
+ (
+ param0,
+ param1
+ )
+ , prototype_5;
+ ld.param.b64 %rd99, [retval0+0];
+
+ //{
+ }// Callseq End 5
+ ld.f64 %fd31, [%rd99];
+ add.f64 %fd44, %fd44, %fd31;
+ add.s32 %r16, %r43, %r9;
+ setp.ge.u32 %p5, %r16, %r5;
+ @%p5 bra BB33_14;
+
+ ld.local.u64 %rd121, [%rd1+104];
+ and.b64 %rd102, %rd121, 1;
+ setp.eq.b64 %p6, %rd102, 1;
+ ld.local.u64 %rd8, [%rd1+112];
+ @!%p6 bra BB33_13;
+ bra.uni BB33_12;
+
+BB33_12:
+ add.s64 %rd105, %rd1, %rd8;
+ ld.local.u64 %rd106, [%rd105];
+ add.s64 %rd107, %rd121, %rd106;
+ ld.u64 %rd121, [%rd107+-1];
+
+BB33_13:
+ add.s64 %rd109, %rd18, %rd8;
+ // Callseq Start 6
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd109;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r16;
+ .param .b64 retval0;
+ prototype_6 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd121,
+ (
+ param0,
+ param1
+ )
+ , prototype_6;
+ ld.param.b64 %rd111, [retval0+0];
+
+ //{
+ }// Callseq End 6
+ ld.f64 %fd32, [%rd111];
+ add.f64 %fd44, %fd44, %fd32;
+
+BB33_14:
+ shl.b32 %r20, %r9, 1;
+ mov.u32 %r21, %nctaid.x;
+ mad.lo.s32 %r43, %r20, %r21, %r43;
+ setp.lt.u32 %p7, %r43, %r5;
+ @%p7 bra BB33_8;
+
+BB33_15:
+ shl.b32 %r23, %r6, 3;
+ mov.u32 %r24, memory;
+ add.s32 %r4, %r24, %r23;
+ st.shared.f64 [%r4], %fd44;
+ bar.sync 0;
+ setp.lt.u32 %p8, %r9, 1024;
+ @%p8 bra BB33_19;
+
+ setp.gt.u32 %p9, %r6, 511;
+ @%p9 bra BB33_18;
+
+ ld.shared.f64 %fd33, [%r4+4096];
+ add.f64 %fd44, %fd44, %fd33;
+ st.shared.f64 [%r4], %fd44;
+
+BB33_18:
+ bar.sync 0;
+
+BB33_19:
+ setp.lt.u32 %p10, %r9, 512;
+ @%p10 bra BB33_23;
+
+ setp.gt.u32 %p11, %r6, 255;
+ @%p11 bra BB33_22;
+
+ ld.shared.f64 %fd34, [%r4+2048];
+ add.f64 %fd44, %fd44, %fd34;
+ st.shared.f64 [%r4], %fd44;
+
+BB33_22:
+ bar.sync 0;
+
+BB33_23:
+ setp.lt.u32 %p12, %r9, 256;
+ @%p12 bra BB33_27;
+
+ setp.gt.u32 %p13, %r6, 127;
+ @%p13 bra BB33_26;
+
+ ld.shared.f64 %fd35, [%r4+1024];
+ add.f64 %fd44, %fd44, %fd35;
+ st.shared.f64 [%r4], %fd44;
+
+BB33_26:
+ bar.sync 0;
+
+BB33_27:
+ setp.lt.u32 %p14, %r9, 128;
+ @%p14 bra BB33_31;
+
+ setp.gt.u32 %p15, %r6, 63;
+ @%p15 bra BB33_30;
+
+ ld.shared.f64 %fd36, [%r4+512];
+ add.f64 %fd44, %fd44, %fd36;
+ st.shared.f64 [%r4], %fd44;
+
+BB33_30:
+ bar.sync 0;
+
+BB33_31:
+ setp.gt.u32 %p16, %r6, 31;
+ @%p16 bra BB33_44;
+
+ setp.lt.u32 %p17, %r9, 64;
+ @%p17 bra BB33_34;
+
+ ld.volatile.shared.f64 %fd37, [%r4+256];
+ add.f64 %fd44, %fd44, %fd37;
+ st.volatile.shared.f64 [%r4], %fd44;
+
+BB33_34:
+ setp.lt.u32 %p18, %r9, 32;
+ @%p18 bra BB33_36;
+
+ ld.volatile.shared.f64 %fd38, [%r4+128];
+ add.f64 %fd44, %fd44, %fd38;
+ st.volatile.shared.f64 [%r4], %fd44;
+
+BB33_36:
+ setp.lt.u32 %p19, %r9, 16;
+ @%p19 bra BB33_38;
+
+ ld.volatile.shared.f64 %fd39, [%r4+64];
+ add.f64 %fd44, %fd44, %fd39;
+ st.volatile.shared.f64 [%r4], %fd44;
+
+BB33_38:
+ setp.lt.u32 %p20, %r9, 8;
+ @%p20 bra BB33_40;
+
+ ld.volatile.shared.f64 %fd40, [%r4+32];
+ add.f64 %fd44, %fd44, %fd40;
+ st.volatile.shared.f64 [%r4], %fd44;
+
+BB33_40:
+ setp.lt.u32 %p21, %r9, 4;
+ @%p21 bra BB33_42;
+
+ ld.volatile.shared.f64 %fd41, [%r4+16];
+ add.f64 %fd44, %fd44, %fd41;
+ st.volatile.shared.f64 [%r4], %fd44;
+
+BB33_42:
+ setp.lt.u32 %p22, %r9, 2;
+ @%p22 bra BB33_44;
+
+ ld.volatile.shared.f64 %fd42, [%r4+8];
+ add.f64 %fd43, %fd44, %fd42;
+ st.volatile.shared.f64 [%r4], %fd43;
+
+BB33_44:
+ setp.ne.s32 %p23, %r6, 0;
+ @%p23 bra BB33_48;
+
+ ld.shared.f64 %fd28, [memory];
+ ld.local.u64 %rd114, [%rd2+96];
+ add.s64 %rd11, %rd2, %rd114;
+ add.s64 %rd12, %rd53, %rd114;
+ ld.local.u64 %rd122, [%rd2+88];
+ and.b64 %rd115, %rd122, 1;
+ setp.eq.b64 %p24, %rd115, 1;
+ @!%p24 bra BB33_47;
+ bra.uni BB33_46;
+
+BB33_46:
+ ld.local.u64 %rd116, [%rd11];
+ add.s64 %rd117, %rd122, %rd116;
+ ld.u64 %rd122, [%rd117+-1];
+
+BB33_47:
+ mov.u32 %r42, 0;
+ // Callseq Start 7
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r42;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r7;
+ .param .b64 retval0;
+	prototype_7 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+ call (retval0),
+ %rd122,
+ (
+ param0,
+ param1,
+ param2
+ )
+ , prototype_7;
+ ld.param.b64 %rd119, [retval0+0];
+
+ //{
+ }// Callseq End 7
+ st.f64 [%rd119], %fd28;
+
+BB33_48:
+ ret;
+}
+
+ // .globl reduce_max_f
+.visible .entry reduce_max_f(
+ .param .u64 reduce_max_f_param_0,
+ .param .u64 reduce_max_f_param_1,
+ .param .u32 reduce_max_f_param_2
+)
+{
+ .local .align 8 .b8 __local_depot34[272];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<25>;
+ .reg .f32 %f<60>;
+ .reg .b32 %r<44>;
+ .reg .b64 %rd<123>;
+
+
+ mov.u64 %SPL, __local_depot34;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd17, [reduce_max_f_param_0];
+ ld.param.u64 %rd16, [reduce_max_f_param_1];
+ ld.param.u32 %r5, [reduce_max_f_param_2];
+ add.u64 %rd18, %SP, 0;
+ add.u64 %rd1, %SPL, 0;
+ st.local.u64 [%rd1], %rd17;
+ cvta.to.global.u64 %rd19, %rd17;
+ ld.global.u64 %rd20, [%rd19+16];
+ setp.eq.s64 %p1, %rd20, 0;
+ @%p1 bra BB34_2;
+
+ mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
+ st.local.u64 [%rd1+8], %rd21;
+ mov.u64 %rd23, 0;
+ st.local.u64 [%rd1+16], %rd23;
+ mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
+ st.local.u64 [%rd1+40], %rd24;
+ st.local.u64 [%rd1+48], %rd23;
+ mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
+ st.local.u64 [%rd1+56], %rd26;
+ st.local.u64 [%rd1+64], %rd23;
+ mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
+ st.local.u64 [%rd1+88], %rd28;
+ st.local.u64 [%rd1+96], %rd23;
+ mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
+ st.local.u64 [%rd1+104], %rd30;
+ st.local.u64 [%rd1+112], %rd23;
+ mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
+ st.local.u64 [%rd1+24], %rd32;
+ st.local.u64 [%rd1+32], %rd23;
+ mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
+ st.local.u64 [%rd1+72], %rd34;
+ st.local.u64 [%rd1+80], %rd23;
+ mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
+ st.local.u64 [%rd1+120], %rd36;
+ st.local.u64 [%rd1+128], %rd23;
+ bra.uni BB34_3;
+
+BB34_2:
+ mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
+ st.local.u64 [%rd1+8], %rd38;
+ mov.u64 %rd40, 0;
+ st.local.u64 [%rd1+16], %rd40;
+ mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
+ st.local.u64 [%rd1+40], %rd41;
+ st.local.u64 [%rd1+48], %rd40;
+ mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
+ st.local.u64 [%rd1+56], %rd43;
+ st.local.u64 [%rd1+64], %rd40;
+ mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
+ st.local.u64 [%rd1+88], %rd45;
+ st.local.u64 [%rd1+96], %rd40;
+ mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
+ st.local.u64 [%rd1+104], %rd47;
+ st.local.u64 [%rd1+112], %rd40;
+ mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
+ st.local.u64 [%rd1+24], %rd49;
+ st.local.u64 [%rd1+32], %rd40;
+ mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
+ st.local.u64 [%rd1+72], %rd51;
+ st.local.u64 [%rd1+80], %rd40;
+
+BB34_3:
+ add.u64 %rd53, %SP, 136;
+ add.u64 %rd2, %SPL, 136;
+ st.local.u64 [%rd2], %rd16;
+ cvta.to.global.u64 %rd54, %rd16;
+ ld.global.u64 %rd55, [%rd54+16];
+ setp.eq.s64 %p2, %rd55, 0;
+ @%p2 bra BB34_5;
+
+ mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
+ st.local.u64 [%rd2+8], %rd56;
+ mov.u64 %rd58, 0;
+ st.local.u64 [%rd2+16], %rd58;
+ mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
+ st.local.u64 [%rd2+40], %rd59;
+ st.local.u64 [%rd2+48], %rd58;
+ mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
+ st.local.u64 [%rd2+56], %rd61;
+ st.local.u64 [%rd2+64], %rd58;
+ mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
+ st.local.u64 [%rd2+88], %rd63;
+ st.local.u64 [%rd2+96], %rd58;
+ mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
+ st.local.u64 [%rd2+104], %rd65;
+ st.local.u64 [%rd2+112], %rd58;
+ mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
+ st.local.u64 [%rd2+24], %rd67;
+ st.local.u64 [%rd2+32], %rd58;
+ mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
+ st.local.u64 [%rd2+72], %rd69;
+ st.local.u64 [%rd2+80], %rd58;
+ mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
+ st.local.u64 [%rd2+120], %rd71;
+ st.local.u64 [%rd2+128], %rd58;
+ bra.uni BB34_6;
+
+BB34_5:
+ mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
+ st.local.u64 [%rd2+8], %rd73;
+ mov.u64 %rd75, 0;
+ st.local.u64 [%rd2+16], %rd75;
+ mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
+ st.local.u64 [%rd2+40], %rd76;
+ st.local.u64 [%rd2+48], %rd75;
+ mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
+ st.local.u64 [%rd2+56], %rd78;
+ st.local.u64 [%rd2+64], %rd75;
+ mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
+ st.local.u64 [%rd2+88], %rd80;
+ st.local.u64 [%rd2+96], %rd75;
+ mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
+ st.local.u64 [%rd2+104], %rd82;
+ st.local.u64 [%rd2+112], %rd75;
+ mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
+ st.local.u64 [%rd2+24], %rd84;
+ st.local.u64 [%rd2+32], %rd75;
+ mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
+ st.local.u64 [%rd2+72], %rd86;
+ st.local.u64 [%rd2+80], %rd75;
+
+BB34_6:
+ mov.u32 %r6, %tid.x;
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mad.lo.s32 %r43, %r8, %r9, %r6;
+ mov.f32 %f44, 0fFF800000;
+ setp.ge.u32 %p3, %r43, %r5;
+ @%p3 bra BB34_15;
+
+ mov.f32 %f44, 0fFF800000;
+
+BB34_8:
+ ld.local.u64 %rd3, [%rd1+112];
+ ld.local.u64 %rd120, [%rd1+104];
+ and.b64 %rd90, %rd120, 1;
+ setp.eq.b64 %p4, %rd90, 1;
+ @!%p4 bra BB34_10;
+ bra.uni BB34_9;
+
+BB34_9:
+ add.s64 %rd93, %rd1, %rd3;
+ ld.local.u64 %rd94, [%rd93];
+ add.s64 %rd95, %rd120, %rd94;
+ ld.u64 %rd120, [%rd95+-1];
+
+BB34_10:
+ add.s64 %rd97, %rd18, %rd3;
+ // Callseq Start 8
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd97;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r43;
+ .param .b64 retval0;
+	prototype_8 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd120,
+ (
+ param0,
+ param1
+ )
+ , prototype_8;
+ ld.param.b64 %rd99, [retval0+0];
+
+ //{
+ }// Callseq End 8
+ ld.f32 %f31, [%rd99];
+ max.f32 %f44, %f44, %f31;
+ add.s32 %r16, %r43, %r9;
+ setp.ge.u32 %p5, %r16, %r5;
+ @%p5 bra BB34_14;
+
+ ld.local.u64 %rd121, [%rd1+104];
+ and.b64 %rd102, %rd121, 1;
+ setp.eq.b64 %p6, %rd102, 1;
+ ld.local.u64 %rd8, [%rd1+112];
+ @!%p6 bra BB34_13;
+ bra.uni BB34_12;
+
+BB34_12:
+ add.s64 %rd105, %rd1, %rd8;
+ ld.local.u64 %rd106, [%rd105];
+ add.s64 %rd107, %rd121, %rd106;
+ ld.u64 %rd121, [%rd107+-1];
+
+BB34_13:
+ add.s64 %rd109, %rd18, %rd8;
+ // Callseq Start 9
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd109;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r16;
+ .param .b64 retval0;
+	prototype_9 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+ call (retval0),
+ %rd121,
+ (
+ param0,
+ param1
+ )
+ , prototype_9;
+ ld.param.b64 %rd111, [retval0+0];
+
+ //{
+ }// Callseq End 9
+ ld.f32 %f32, [%rd111];
+ max.f32 %f44, %f44, %f32;
+
+BB34_14:
+ shl.b32 %r20, %r9, 1;
+ mov.u32 %r21, %nctaid.x;
+ mad.lo.s32 %r43, %r20, %r21, %r43;
+ setp.lt.u32 %p7, %r43, %r5;
+ @%p7 bra BB34_8;
+
+BB34_15:
+ shl.b32 %r23, %r6, 2;
+ mov.u32 %r24, memory;
+ add.s32 %r4, %r24, %r23;
+ st.shared.f32 [%r4], %f44;
+ bar.sync 0;
+ setp.lt.u32 %p8, %r9, 1024;
+ @%p8 bra BB34_19;
+
+ setp.gt.u32 %p9, %r6, 511;
+ @%p9 bra BB34_18;
+
+ ld.shared.f32 %f33, [%r4+2048];
+ max.f32 %f44, %f44, %f33;
+ st.shared.f32 [%r4], %f44;
+
+BB34_18:
+ bar.sync 0;
+
+BB34_19:
+ setp.lt.u32 %p10, %r9, 512;
+ @%p10 bra BB34_23;
+
+ setp.gt.u32 %p11, %r6, 255;
+ @%p11 bra BB34_22;
+
+ ld.shared.f32 %f34, [%r4+1024];
+ max.f32 %f44, %f44, %f34;
+ st.shared.f32 [%r4], %f44;
+
+BB34_22:
+ bar.sync 0;
+
+BB34_23:
+ setp.lt.u32 %p12, %r9, 256;
+ @%p12 bra BB34_27;
+
+ setp.gt.u32 %p13, %r6, 127;
+ @%p13 bra BB34_26;
+
+ ld.shared.f32 %f35, [%r4+512];
+ max.f32 %f44, %f44, %f35;
+ st.shared.f32 [%r4], %f44;
+
+BB34_26:
+ bar.sync 0;
+
+BB34_27:
+ setp.lt.u32 %p14, %r9, 128;
+ @%p14 bra BB34_31;
+
+ setp.gt.u32 %p15, %r6, 63;
+ @%p15 bra BB34_30;
+
+ ld.shared.f32 %f36, [%r4+256];
+ max.f32 %f44, %f44, %f36;
+ st.shared.f32 [%r4], %f44;
+
+BB34_30:
+ bar.sync 0;
+
+BB34_31:
+ setp.gt.u32 %p16, %r6, 31;
+ @%p16 bra BB34_44;
+
+ setp.lt.u32 %p17, %r9, 64;
+ @%p17 bra BB34_34;
+ ld.volatile.shared.f32 %f37, [%r4+128];
+ max.f32 %f44, %f44, %f37;
+ st.volatile.shared.f32 [%r4], %f44;
- ld.param.u64 %rd1, [double2float_f_param_0];
- ld.param.u64 %rd2, [double2float_f_param_1];
- ld.param.u32 %r2, [double2float_f_param_2];
- mov.u32 %r3, %ctaid.x;
- mov.u32 %r4, %ntid.x;
- mov.u32 %r5, %tid.x;
- mad.lo.s32 %r1, %r4, %r3, %r5;
- setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB15_2;
+BB34_34:
+ setp.lt.u32 %p18, %r9, 32;
+ @%p18 bra BB34_36;
- cvta.to.global.u64 %rd3, %rd1;
- mul.wide.s32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- ld.global.f64 %fd1, [%rd5];
- cvt.rn.f32.f64 %f1, %fd1;
- cvta.to.global.u64 %rd6, %rd2;
- mul.wide.s32 %rd7, %r1, 4;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f32 [%rd8], %f1;
+ ld.volatile.shared.f32 %f38, [%r4+64];
+ max.f32 %f44, %f44, %f38;
+ st.volatile.shared.f32 [%r4], %f44;
-BB15_2:
- ret;
-}
+BB34_36:
+ setp.lt.u32 %p19, %r9, 16;
+ @%p19 bra BB34_38;
- // .globl float2double_f
-.visible .entry float2double_f(
- .param .u64 float2double_f_param_0,
- .param .u64 float2double_f_param_1,
- .param .u32 float2double_f_param_2
-)
-{
- .reg .pred %p<2>;
- .reg .f32 %f<2>;
- .reg .b32 %r<6>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<9>;
+ ld.volatile.shared.f32 %f39, [%r4+32];
+ max.f32 %f44, %f44, %f39;
+ st.volatile.shared.f32 [%r4], %f44;
+BB34_38:
+ setp.lt.u32 %p20, %r9, 8;
+ @%p20 bra BB34_40;
- ld.param.u64 %rd1, [float2double_f_param_0];
- ld.param.u64 %rd2, [float2double_f_param_1];
- ld.param.u32 %r2, [float2double_f_param_2];
- mov.u32 %r3, %ctaid.x;
- mov.u32 %r4, %ntid.x;
- mov.u32 %r5, %tid.x;
- mad.lo.s32 %r1, %r4, %r3, %r5;
- setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB16_2;
+ ld.volatile.shared.f32 %f40, [%r4+16];
+ max.f32 %f44, %f44, %f40;
+ st.volatile.shared.f32 [%r4], %f44;
- cvta.to.global.u64 %rd3, %rd1;
- mul.wide.s32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- ld.global.f32 %f1, [%rd5];
- cvt.f64.f32 %fd1, %f1;
- cvta.to.global.u64 %rd6, %rd2;
- mul.wide.s32 %rd7, %r1, 8;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f64 [%rd8], %fd1;
+BB34_40:
+ setp.lt.u32 %p21, %r9, 4;
+ @%p21 bra BB34_42;
+
+ ld.volatile.shared.f32 %f41, [%r4+8];
+ max.f32 %f44, %f44, %f41;
+ st.volatile.shared.f32 [%r4], %f44;
+
+BB34_42:
+ setp.lt.u32 %p22, %r9, 2;
+ @%p22 bra BB34_44;
+
+ ld.volatile.shared.f32 %f42, [%r4+4];
+ max.f32 %f43, %f44, %f42;
+ st.volatile.shared.f32 [%r4], %f43;
+
+BB34_44:
+ setp.ne.s32 %p23, %r6, 0;
+ @%p23 bra BB34_48;
+
+ ld.shared.f32 %f28, [memory];
+ ld.local.u64 %rd114, [%rd2+96];
+ add.s64 %rd11, %rd2, %rd114;
+ add.s64 %rd12, %rd53, %rd114;
+ ld.local.u64 %rd122, [%rd2+88];
+ and.b64 %rd115, %rd122, 1;
+ setp.eq.b64 %p24, %rd115, 1;
+ @!%p24 bra BB34_47;
+ bra.uni BB34_46;
+
+BB34_46:
+ ld.local.u64 %rd116, [%rd11];
+ add.s64 %rd117, %rd122, %rd116;
+ ld.u64 %rd122, [%rd117+-1];
+
+BB34_47:
+ mov.u32 %r42, 0;
+ // Callseq Start 10
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b32 param1;
+ st.param.b32 [param1+0], %r42;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r7;
+ .param .b64 retval0;
+	prototype_10 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+ call (retval0),
+ %rd122,
+ (
+ param0,
+ param1,
+ param2
+ )
+ , prototype_10;
+ ld.param.b64 %rd119, [retval0+0];
+
+ //{
+ }// Callseq End 10
+ st.f32 [%rd119], %f28;
-BB16_2:
+BB34_48:
ret;
}
- // .globl reduce_sum_d
-.visible .entry reduce_sum_d(
- .param .u64 reduce_sum_d_param_0,
- .param .u64 reduce_sum_d_param_1,
- .param .u32 reduce_sum_d_param_2
+ // .globl reduce_max_d
+.visible .entry reduce_max_d(
+ .param .u64 reduce_max_d_param_0,
+ .param .u64 reduce_max_d_param_1,
+ .param .u32 reduce_max_d_param_2
)
{
- .local .align 8 .b8 __local_depot17[272];
+ .local .align 8 .b8 __local_depot35[272];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<25>;
@@ -427,18 +1956,18 @@ BB16_2:
.reg .b64 %rd<123>;
- mov.u64 %SPL, __local_depot17;
+ mov.u64 %SPL, __local_depot35;
cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_sum_d_param_0];
- ld.param.u64 %rd16, [reduce_sum_d_param_1];
- ld.param.u32 %r5, [reduce_sum_d_param_2];
+ ld.param.u64 %rd17, [reduce_max_d_param_0];
+ ld.param.u64 %rd16, [reduce_max_d_param_1];
+ ld.param.u32 %r5, [reduce_max_d_param_2];
add.u64 %rd18, %SP, 0;
add.u64 %rd1, %SPL, 0;
st.local.u64 [%rd1], %rd17;
cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+8];
+ ld.global.u64 %rd20, [%rd19+16];
setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB17_2;
+ @%p1 bra BB35_2;
mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
st.local.u64 [%rd1+8], %rd21;
@@ -465,9 +1994,9 @@ BB16_2:
mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
st.local.u64 [%rd1+120], %rd36;
st.local.u64 [%rd1+128], %rd23;
- bra.uni BB17_3;
+ bra.uni BB35_3;
-BB17_2:
+BB35_2:
mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
st.local.u64 [%rd1+8], %rd38;
mov.u64 %rd40, 0;
@@ -491,14 +2020,14 @@ BB17_2:
st.local.u64 [%rd1+72], %rd51;
st.local.u64 [%rd1+80], %rd40;
-BB17_3:
+BB35_3:
add.u64 %rd53, %SP, 136;
add.u64 %rd2, %SPL, 136;
st.local.u64 [%rd2], %rd16;
cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+8];
+ ld.global.u64 %rd55, [%rd54+16];
setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB17_5;
+ @%p2 bra BB35_5;
mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
st.local.u64 [%rd2+8], %rd56;
@@ -525,9 +2054,9 @@ BB17_3:
mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
st.local.u64 [%rd2+120], %rd71;
st.local.u64 [%rd2+128], %rd58;
- bra.uni BB17_6;
+ bra.uni BB35_6;
-BB17_5:
+BB35_5:
mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
st.local.u64 [%rd2+8], %rd73;
mov.u64 %rd75, 0;
@@ -551,35 +2080,35 @@ BB17_5:
st.local.u64 [%rd2+72], %rd86;
st.local.u64 [%rd2+80], %rd75;
-BB17_6:
+BB35_6:
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f64 %fd44, 0d0000000000000000;
+ mov.f64 %fd44, 0dFFF0000000000000;
setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB17_15;
+ @%p3 bra BB35_15;
- mov.f64 %fd44, 0d0000000000000000;
+ mov.f64 %fd44, 0dFFF0000000000000;
-BB17_8:
+BB35_8:
ld.local.u64 %rd3, [%rd1+112];
ld.local.u64 %rd120, [%rd1+104];
and.b64 %rd90, %rd120, 1;
setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB17_10;
- bra.uni BB17_9;
+ @!%p4 bra BB35_10;
+ bra.uni BB35_9;
-BB17_9:
+BB35_9:
add.s64 %rd93, %rd1, %rd3;
ld.local.u64 %rd94, [%rd93];
add.s64 %rd95, %rd120, %rd94;
ld.u64 %rd120, [%rd95+-1];
-BB17_10:
+BB35_10:
add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 1
+ // Callseq Start 11
{
.reg .b32 temp_param_reg;
// <end>}
@@ -588,40 +2117,40 @@ BB17_10:
.param .b32 param1;
st.param.b32 [param1+0], %r43;
.param .b64 retval0;
-	prototype_1 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_11 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd120,
(
param0,
param1
)
- , prototype_1;
+ , prototype_11;
ld.param.b64 %rd99, [retval0+0];
//{
- }// Callseq End 1
+ }// Callseq End 11
ld.f64 %fd31, [%rd99];
- add.f64 %fd44, %fd44, %fd31;
+ max.f64 %fd44, %fd44, %fd31;
add.s32 %r16, %r43, %r9;
setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB17_14;
+ @%p5 bra BB35_14;
ld.local.u64 %rd121, [%rd1+104];
and.b64 %rd102, %rd121, 1;
setp.eq.b64 %p6, %rd102, 1;
ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB17_13;
- bra.uni BB17_12;
+ @!%p6 bra BB35_13;
+ bra.uni BB35_12;
-BB17_12:
+BB35_12:
add.s64 %rd105, %rd1, %rd8;
ld.local.u64 %rd106, [%rd105];
add.s64 %rd107, %rd121, %rd106;
ld.u64 %rd121, [%rd107+-1];
-BB17_13:
+BB35_13:
add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 2
+ // Callseq Start 12
{
.reg .b32 temp_param_reg;
// <end>}
@@ -630,143 +2159,143 @@ BB17_13:
.param .b32 param1;
st.param.b32 [param1+0], %r16;
.param .b64 retval0;
-	prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_12 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd121,
(
param0,
param1
)
- , prototype_2;
+ , prototype_12;
ld.param.b64 %rd111, [retval0+0];
//{
- }// Callseq End 2
+ }// Callseq End 12
ld.f64 %fd32, [%rd111];
- add.f64 %fd44, %fd44, %fd32;
+ max.f64 %fd44, %fd44, %fd32;
-BB17_14:
+BB35_14:
shl.b32 %r20, %r9, 1;
mov.u32 %r21, %nctaid.x;
mad.lo.s32 %r43, %r20, %r21, %r43;
setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB17_8;
+ @%p7 bra BB35_8;
-BB17_15:
+BB35_15:
shl.b32 %r23, %r6, 3;
mov.u32 %r24, memory;
add.s32 %r4, %r24, %r23;
st.shared.f64 [%r4], %fd44;
bar.sync 0;
setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB17_19;
+ @%p8 bra BB35_19;
setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB17_18;
+ @%p9 bra BB35_18;
ld.shared.f64 %fd33, [%r4+4096];
- add.f64 %fd44, %fd44, %fd33;
+ max.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r4], %fd44;
-BB17_18:
+BB35_18:
bar.sync 0;
-BB17_19:
+BB35_19:
setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB17_23;
+ @%p10 bra BB35_23;
setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB17_22;
+ @%p11 bra BB35_22;
ld.shared.f64 %fd34, [%r4+2048];
- add.f64 %fd44, %fd44, %fd34;
+ max.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r4], %fd44;
-BB17_22:
+BB35_22:
bar.sync 0;
-BB17_23:
+BB35_23:
setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB17_27;
+ @%p12 bra BB35_27;
setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB17_26;
+ @%p13 bra BB35_26;
ld.shared.f64 %fd35, [%r4+1024];
- add.f64 %fd44, %fd44, %fd35;
+ max.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r4], %fd44;
-BB17_26:
+BB35_26:
bar.sync 0;
-BB17_27:
+BB35_27:
setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB17_31;
+ @%p14 bra BB35_31;
setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB17_30;
+ @%p15 bra BB35_30;
ld.shared.f64 %fd36, [%r4+512];
- add.f64 %fd44, %fd44, %fd36;
+ max.f64 %fd44, %fd44, %fd36;
st.shared.f64 [%r4], %fd44;
-BB17_30:
+BB35_30:
bar.sync 0;
-BB17_31:
+BB35_31:
setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB17_44;
+ @%p16 bra BB35_44;
setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB17_34;
+ @%p17 bra BB35_34;
ld.volatile.shared.f64 %fd37, [%r4+256];
- add.f64 %fd44, %fd44, %fd37;
+ max.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r4], %fd44;
-BB17_34:
+BB35_34:
setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB17_36;
+ @%p18 bra BB35_36;
ld.volatile.shared.f64 %fd38, [%r4+128];
- add.f64 %fd44, %fd44, %fd38;
+ max.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r4], %fd44;
-BB17_36:
+BB35_36:
setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB17_38;
+ @%p19 bra BB35_38;
ld.volatile.shared.f64 %fd39, [%r4+64];
- add.f64 %fd44, %fd44, %fd39;
+ max.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r4], %fd44;
-BB17_38:
+BB35_38:
setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB17_40;
+ @%p20 bra BB35_40;
ld.volatile.shared.f64 %fd40, [%r4+32];
- add.f64 %fd44, %fd44, %fd40;
+ max.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r4], %fd44;
-BB17_40:
+BB35_40:
setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB17_42;
+ @%p21 bra BB35_42;
ld.volatile.shared.f64 %fd41, [%r4+16];
- add.f64 %fd44, %fd44, %fd41;
+ max.f64 %fd44, %fd44, %fd41;
st.volatile.shared.f64 [%r4], %fd44;
-BB17_42:
+BB35_42:
setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB17_44;
+ @%p22 bra BB35_44;
ld.volatile.shared.f64 %fd42, [%r4+8];
- add.f64 %fd43, %fd44, %fd42;
+ max.f64 %fd43, %fd44, %fd42;
st.volatile.shared.f64 [%r4], %fd43;
-BB17_44:
+BB35_44:
setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB17_48;
+ @%p23 bra BB35_48;
ld.shared.f64 %fd28, [memory];
ld.local.u64 %rd114, [%rd2+96];
@@ -775,17 +2304,17 @@ BB17_44:
ld.local.u64 %rd122, [%rd2+88];
and.b64 %rd115, %rd122, 1;
setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB17_47;
- bra.uni BB17_46;
+ @!%p24 bra BB35_47;
+ bra.uni BB35_46;
-BB17_46:
+BB35_46:
ld.local.u64 %rd116, [%rd11];
add.s64 %rd117, %rd122, %rd116;
ld.u64 %rd122, [%rd117+-1];
-BB17_47:
+BB35_47:
mov.u32 %r42, 0;
- // Callseq Start 3
+ // Callseq Start 13
{
.reg .b32 temp_param_reg;
// <end>}
@@ -796,7 +2325,7 @@ BB17_47:
.param .b32 param2;
st.param.b32 [param2+0], %r7;
.param .b64 retval0;
-	prototype_3 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+	prototype_13 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
call (retval0),
%rd122,
(
@@ -804,186 +2333,186 @@ BB17_47:
param1,
param2
)
- , prototype_3;
+ , prototype_13;
ld.param.b64 %rd119, [retval0+0];
//{
- }// Callseq End 3
+ }// Callseq End 13
st.f64 [%rd119], %fd28;
-BB17_48:
+BB35_48:
ret;
}
- // .globl reduce_max_d
-.visible .entry reduce_max_d(
- .param .u64 reduce_max_d_param_0,
- .param .u64 reduce_max_d_param_1,
- .param .u32 reduce_max_d_param_2
+ // .globl reduce_min_f
+.visible .entry reduce_min_f(
+ .param .u64 reduce_min_f_param_0,
+ .param .u64 reduce_min_f_param_1,
+ .param .u32 reduce_min_f_param_2
)
{
- .local .align 8 .b8 __local_depot18[272];
+ .local .align 8 .b8 __local_depot36[272];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<25>;
+ .reg .f32 %f<60>;
.reg .b32 %r<44>;
- .reg .f64 %fd<60>;
.reg .b64 %rd<123>;
- mov.u64 %SPL, __local_depot18;
+ mov.u64 %SPL, __local_depot36;
cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_max_d_param_0];
- ld.param.u64 %rd16, [reduce_max_d_param_1];
- ld.param.u32 %r5, [reduce_max_d_param_2];
+ ld.param.u64 %rd17, [reduce_min_f_param_0];
+ ld.param.u64 %rd16, [reduce_min_f_param_1];
+ ld.param.u32 %r5, [reduce_min_f_param_2];
add.u64 %rd18, %SP, 0;
add.u64 %rd1, %SPL, 0;
st.local.u64 [%rd1], %rd17;
cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+8];
+ ld.global.u64 %rd20, [%rd19+16];
setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB18_2;
+ @%p1 bra BB36_2;
- mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
+ mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
st.local.u64 [%rd1+8], %rd21;
mov.u64 %rd23, 0;
st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
+ mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
st.local.u64 [%rd1+40], %rd24;
st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
+ mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
st.local.u64 [%rd1+56], %rd26;
st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
+ mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
st.local.u64 [%rd1+88], %rd28;
st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
+ mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
st.local.u64 [%rd1+104], %rd30;
st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
+ mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
st.local.u64 [%rd1+24], %rd32;
st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
+ mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
st.local.u64 [%rd1+72], %rd34;
st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
+ mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
st.local.u64 [%rd1+120], %rd36;
st.local.u64 [%rd1+128], %rd23;
- bra.uni BB18_3;
+ bra.uni BB36_3;
-BB18_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
+BB36_2:
+ mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
st.local.u64 [%rd1+8], %rd38;
mov.u64 %rd40, 0;
st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
+ mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
st.local.u64 [%rd1+40], %rd41;
st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
+ mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
st.local.u64 [%rd1+56], %rd43;
st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
+ mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
st.local.u64 [%rd1+88], %rd45;
st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
+ mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
st.local.u64 [%rd1+104], %rd47;
st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
+ mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
st.local.u64 [%rd1+24], %rd49;
st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
+ mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
st.local.u64 [%rd1+72], %rd51;
st.local.u64 [%rd1+80], %rd40;
-BB18_3:
+BB36_3:
add.u64 %rd53, %SP, 136;
add.u64 %rd2, %SPL, 136;
st.local.u64 [%rd2], %rd16;
cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+8];
+ ld.global.u64 %rd55, [%rd54+16];
setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB18_5;
+ @%p2 bra BB36_5;
- mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
+ mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
st.local.u64 [%rd2+8], %rd56;
mov.u64 %rd58, 0;
st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
+ mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
st.local.u64 [%rd2+40], %rd59;
st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
+ mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
st.local.u64 [%rd2+56], %rd61;
st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
+ mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
st.local.u64 [%rd2+88], %rd63;
st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
+ mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
st.local.u64 [%rd2+104], %rd65;
st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
+ mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
st.local.u64 [%rd2+24], %rd67;
st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
+ mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
st.local.u64 [%rd2+72], %rd69;
st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
+ mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
st.local.u64 [%rd2+120], %rd71;
st.local.u64 [%rd2+128], %rd58;
- bra.uni BB18_6;
+ bra.uni BB36_6;
-BB18_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
+BB36_5:
+ mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
st.local.u64 [%rd2+8], %rd73;
mov.u64 %rd75, 0;
st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
+ mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
st.local.u64 [%rd2+40], %rd76;
st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
+ mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
st.local.u64 [%rd2+56], %rd78;
st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
+ mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
st.local.u64 [%rd2+88], %rd80;
st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
+ mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
st.local.u64 [%rd2+104], %rd82;
st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
+ mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
st.local.u64 [%rd2+24], %rd84;
st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
+ mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
st.local.u64 [%rd2+72], %rd86;
st.local.u64 [%rd2+80], %rd75;
-BB18_6:
+BB36_6:
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f64 %fd44, 0dFFF0000000000000;
+ mov.f32 %f44, 0f7F800000;
setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB18_15;
+ @%p3 bra BB36_15;
- mov.f64 %fd44, 0dFFF0000000000000;
+ mov.f32 %f44, 0f7F800000;
-BB18_8:
+BB36_8:
ld.local.u64 %rd3, [%rd1+112];
ld.local.u64 %rd120, [%rd1+104];
and.b64 %rd90, %rd120, 1;
setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB18_10;
- bra.uni BB18_9;
+ @!%p4 bra BB36_10;
+ bra.uni BB36_9;
-BB18_9:
+BB36_9:
add.s64 %rd93, %rd1, %rd3;
ld.local.u64 %rd94, [%rd93];
add.s64 %rd95, %rd120, %rd94;
ld.u64 %rd120, [%rd95+-1];
-BB18_10:
+BB36_10:
add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 4
+ // Callseq Start 14
{
.reg .b32 temp_param_reg;
// <end>}
@@ -992,40 +2521,40 @@ BB18_10:
.param .b32 param1;
st.param.b32 [param1+0], %r43;
.param .b64 retval0;
-	prototype_4 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_14 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd120,
(
param0,
param1
)
- , prototype_4;
+ , prototype_14;
ld.param.b64 %rd99, [retval0+0];
//{
- }// Callseq End 4
- ld.f64 %fd31, [%rd99];
- max.f64 %fd44, %fd44, %fd31;
+ }// Callseq End 14
+ ld.f32 %f31, [%rd99];
+ min.f32 %f44, %f44, %f31;
add.s32 %r16, %r43, %r9;
setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB18_14;
+ @%p5 bra BB36_14;
ld.local.u64 %rd121, [%rd1+104];
and.b64 %rd102, %rd121, 1;
setp.eq.b64 %p6, %rd102, 1;
ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB18_13;
- bra.uni BB18_12;
+ @!%p6 bra BB36_13;
+ bra.uni BB36_12;
-BB18_12:
+BB36_12:
add.s64 %rd105, %rd1, %rd8;
ld.local.u64 %rd106, [%rd105];
add.s64 %rd107, %rd121, %rd106;
ld.u64 %rd121, [%rd107+-1];
-BB18_13:
+BB36_13:
add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 5
+ // Callseq Start 15
{
.reg .b32 temp_param_reg;
// <end>}
@@ -1034,162 +2563,162 @@ BB18_13:
.param .b32 param1;
st.param.b32 [param1+0], %r16;
.param .b64 retval0;
-	prototype_5 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_15 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd121,
(
param0,
param1
)
- , prototype_5;
+ , prototype_15;
ld.param.b64 %rd111, [retval0+0];
//{
- }// Callseq End 5
- ld.f64 %fd32, [%rd111];
- max.f64 %fd44, %fd44, %fd32;
+ }// Callseq End 15
+ ld.f32 %f32, [%rd111];
+ min.f32 %f44, %f44, %f32;
-BB18_14:
+BB36_14:
shl.b32 %r20, %r9, 1;
mov.u32 %r21, %nctaid.x;
mad.lo.s32 %r43, %r20, %r21, %r43;
setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB18_8;
+ @%p7 bra BB36_8;
-BB18_15:
- shl.b32 %r23, %r6, 3;
+BB36_15:
+ shl.b32 %r23, %r6, 2;
mov.u32 %r24, memory;
add.s32 %r4, %r24, %r23;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f32 [%r4], %f44;
bar.sync 0;
setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB18_19;
+ @%p8 bra BB36_19;
setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB18_18;
+ @%p9 bra BB36_18;
- ld.shared.f64 %fd33, [%r4+4096];
- max.f64 %fd44, %fd44, %fd33;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f32 %f33, [%r4+2048];
+ min.f32 %f44, %f44, %f33;
+ st.shared.f32 [%r4], %f44;
-BB18_18:
+BB36_18:
bar.sync 0;
-BB18_19:
+BB36_19:
setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB18_23;
+ @%p10 bra BB36_23;
setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB18_22;
+ @%p11 bra BB36_22;
- ld.shared.f64 %fd34, [%r4+2048];
- max.f64 %fd44, %fd44, %fd34;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f32 %f34, [%r4+1024];
+ min.f32 %f44, %f44, %f34;
+ st.shared.f32 [%r4], %f44;
-BB18_22:
+BB36_22:
bar.sync 0;
-BB18_23:
+BB36_23:
setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB18_27;
+ @%p12 bra BB36_27;
setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB18_26;
+ @%p13 bra BB36_26;
- ld.shared.f64 %fd35, [%r4+1024];
- max.f64 %fd44, %fd44, %fd35;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f32 %f35, [%r4+512];
+ min.f32 %f44, %f44, %f35;
+ st.shared.f32 [%r4], %f44;
-BB18_26:
+BB36_26:
bar.sync 0;
-BB18_27:
+BB36_27:
setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB18_31;
+ @%p14 bra BB36_31;
setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB18_30;
+ @%p15 bra BB36_30;
- ld.shared.f64 %fd36, [%r4+512];
- max.f64 %fd44, %fd44, %fd36;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f32 %f36, [%r4+256];
+ min.f32 %f44, %f44, %f36;
+ st.shared.f32 [%r4], %f44;
-BB18_30:
+BB36_30:
bar.sync 0;
-BB18_31:
+BB36_31:
setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB18_44;
+ @%p16 bra BB36_44;
setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB18_34;
+ @%p17 bra BB36_34;
- ld.volatile.shared.f64 %fd37, [%r4+256];
- max.f64 %fd44, %fd44, %fd37;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f32 %f37, [%r4+128];
+ min.f32 %f44, %f44, %f37;
+ st.volatile.shared.f32 [%r4], %f44;
-BB18_34:
+BB36_34:
setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB18_36;
+ @%p18 bra BB36_36;
- ld.volatile.shared.f64 %fd38, [%r4+128];
- max.f64 %fd44, %fd44, %fd38;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f32 %f38, [%r4+64];
+ min.f32 %f44, %f44, %f38;
+ st.volatile.shared.f32 [%r4], %f44;
-BB18_36:
+BB36_36:
setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB18_38;
+ @%p19 bra BB36_38;
- ld.volatile.shared.f64 %fd39, [%r4+64];
- max.f64 %fd44, %fd44, %fd39;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f32 %f39, [%r4+32];
+ min.f32 %f44, %f44, %f39;
+ st.volatile.shared.f32 [%r4], %f44;
-BB18_38:
+BB36_38:
setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB18_40;
+ @%p20 bra BB36_40;
- ld.volatile.shared.f64 %fd40, [%r4+32];
- max.f64 %fd44, %fd44, %fd40;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f32 %f40, [%r4+16];
+ min.f32 %f44, %f44, %f40;
+ st.volatile.shared.f32 [%r4], %f44;
-BB18_40:
+BB36_40:
setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB18_42;
+ @%p21 bra BB36_42;
- ld.volatile.shared.f64 %fd41, [%r4+16];
- max.f64 %fd44, %fd44, %fd41;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f32 %f41, [%r4+8];
+ min.f32 %f44, %f44, %f41;
+ st.volatile.shared.f32 [%r4], %f44;
-BB18_42:
+BB36_42:
setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB18_44;
+ @%p22 bra BB36_44;
- ld.volatile.shared.f64 %fd42, [%r4+8];
- max.f64 %fd43, %fd44, %fd42;
- st.volatile.shared.f64 [%r4], %fd43;
+ ld.volatile.shared.f32 %f42, [%r4+4];
+ min.f32 %f43, %f44, %f42;
+ st.volatile.shared.f32 [%r4], %f43;
-BB18_44:
+BB36_44:
setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB18_48;
+ @%p23 bra BB36_48;
- ld.shared.f64 %fd28, [memory];
+ ld.shared.f32 %f28, [memory];
ld.local.u64 %rd114, [%rd2+96];
add.s64 %rd11, %rd2, %rd114;
add.s64 %rd12, %rd53, %rd114;
ld.local.u64 %rd122, [%rd2+88];
and.b64 %rd115, %rd122, 1;
setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB18_47;
- bra.uni BB18_46;
+ @!%p24 bra BB36_47;
+ bra.uni BB36_46;
-BB18_46:
+BB36_46:
ld.local.u64 %rd116, [%rd11];
add.s64 %rd117, %rd122, %rd116;
ld.u64 %rd122, [%rd117+-1];
-BB18_47:
+BB36_47:
mov.u32 %r42, 0;
- // Callseq Start 6
+ // Callseq Start 16
{
.reg .b32 temp_param_reg;
// <end>}
@@ -1200,7 +2729,7 @@ BB18_47:
.param .b32 param2;
st.param.b32 [param2+0], %r7;
.param .b64 retval0;
-	prototype_6 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+	prototype_16 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
call (retval0),
%rd122,
(
@@ -1208,14 +2737,14 @@ BB18_47:
param1,
param2
)
- , prototype_6;
+ , prototype_16;
ld.param.b64 %rd119, [retval0+0];
//{
- }// Callseq End 6
- st.f64 [%rd119], %fd28;
+ }// Callseq End 16
+ st.f32 [%rd119], %f28;
-BB18_48:
+BB36_48:
ret;
}
@@ -1226,7 +2755,7 @@ BB18_48:
.param .u32 reduce_min_d_param_2
)
{
- .local .align 8 .b8 __local_depot19[272];
+ .local .align 8 .b8 __local_depot37[272];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<25>;
@@ -1235,7 +2764,7 @@ BB18_48:
.reg .b64 %rd<123>;
- mov.u64 %SPL, __local_depot19;
+ mov.u64 %SPL, __local_depot37;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd17, [reduce_min_d_param_0];
ld.param.u64 %rd16, [reduce_min_d_param_1];
@@ -1244,9 +2773,9 @@ BB18_48:
add.u64 %rd1, %SPL, 0;
st.local.u64 [%rd1], %rd17;
cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+8];
+ ld.global.u64 %rd20, [%rd19+16];
setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB19_2;
+ @%p1 bra BB37_2;
mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
st.local.u64 [%rd1+8], %rd21;
@@ -1273,9 +2802,9 @@ BB18_48:
mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
st.local.u64 [%rd1+120], %rd36;
st.local.u64 [%rd1+128], %rd23;
- bra.uni BB19_3;
+ bra.uni BB37_3;
-BB19_2:
+BB37_2:
mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
st.local.u64 [%rd1+8], %rd38;
mov.u64 %rd40, 0;
@@ -1299,14 +2828,14 @@ BB19_2:
st.local.u64 [%rd1+72], %rd51;
st.local.u64 [%rd1+80], %rd40;
-BB19_3:
+BB37_3:
add.u64 %rd53, %SP, 136;
add.u64 %rd2, %SPL, 136;
st.local.u64 [%rd2], %rd16;
cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+8];
+ ld.global.u64 %rd55, [%rd54+16];
setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB19_5;
+ @%p2 bra BB37_5;
mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
st.local.u64 [%rd2+8], %rd56;
@@ -1333,9 +2862,9 @@ BB19_3:
mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
st.local.u64 [%rd2+120], %rd71;
st.local.u64 [%rd2+128], %rd58;
- bra.uni BB19_6;
+ bra.uni BB37_6;
-BB19_5:
+BB37_5:
mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
st.local.u64 [%rd2+8], %rd73;
mov.u64 %rd75, 0;
@@ -1359,7 +2888,7 @@ BB19_5:
st.local.u64 [%rd2+72], %rd86;
st.local.u64 [%rd2+80], %rd75;
-BB19_6:
+BB37_6:
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
@@ -1367,27 +2896,27 @@ BB19_6:
mad.lo.s32 %r43, %r8, %r9, %r6;
mov.f64 %fd44, 0d7FF0000000000000;
setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB19_15;
+ @%p3 bra BB37_15;
mov.f64 %fd44, 0d7FF0000000000000;
-BB19_8:
+BB37_8:
ld.local.u64 %rd3, [%rd1+112];
ld.local.u64 %rd120, [%rd1+104];
and.b64 %rd90, %rd120, 1;
setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB19_10;
- bra.uni BB19_9;
+ @!%p4 bra BB37_10;
+ bra.uni BB37_9;
-BB19_9:
+BB37_9:
add.s64 %rd93, %rd1, %rd3;
ld.local.u64 %rd94, [%rd93];
add.s64 %rd95, %rd120, %rd94;
ld.u64 %rd120, [%rd95+-1];
-BB19_10:
+BB37_10:
add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 7
+ // Callseq Start 17
{
.reg .b32 temp_param_reg;
// <end>}
@@ -1396,40 +2925,40 @@ BB19_10:
.param .b32 param1;
st.param.b32 [param1+0], %r43;
.param .b64 retval0;
-	prototype_7 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_17 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd120,
(
param0,
param1
)
- , prototype_7;
+ , prototype_17;
ld.param.b64 %rd99, [retval0+0];
//{
- }// Callseq End 7
+ }// Callseq End 17
ld.f64 %fd31, [%rd99];
min.f64 %fd44, %fd44, %fd31;
add.s32 %r16, %r43, %r9;
setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB19_14;
+ @%p5 bra BB37_14;
ld.local.u64 %rd121, [%rd1+104];
and.b64 %rd102, %rd121, 1;
setp.eq.b64 %p6, %rd102, 1;
ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB19_13;
- bra.uni BB19_12;
+ @!%p6 bra BB37_13;
+ bra.uni BB37_12;
-BB19_12:
+BB37_12:
add.s64 %rd105, %rd1, %rd8;
ld.local.u64 %rd106, [%rd105];
add.s64 %rd107, %rd121, %rd106;
ld.u64 %rd121, [%rd107+-1];
-BB19_13:
+BB37_13:
add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 8
+ // Callseq Start 18
{
.reg .b32 temp_param_reg;
// <end>}
@@ -1438,143 +2967,143 @@ BB19_13:
.param .b32 param1;
st.param.b32 [param1+0], %r16;
.param .b64 retval0;
-	prototype_8 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
+	prototype_18 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
call (retval0),
%rd121,
(
param0,
param1
)
- , prototype_8;
+ , prototype_18;
ld.param.b64 %rd111, [retval0+0];
//{
- }// Callseq End 8
+ }// Callseq End 18
ld.f64 %fd32, [%rd111];
min.f64 %fd44, %fd44, %fd32;
-BB19_14:
+BB37_14:
shl.b32 %r20, %r9, 1;
mov.u32 %r21, %nctaid.x;
mad.lo.s32 %r43, %r20, %r21, %r43;
setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB19_8;
+ @%p7 bra BB37_8;
-BB19_15:
+BB37_15:
shl.b32 %r23, %r6, 3;
mov.u32 %r24, memory;
add.s32 %r4, %r24, %r23;
st.shared.f64 [%r4], %fd44;
bar.sync 0;
setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB19_19;
+ @%p8 bra BB37_19;
setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB19_18;
+ @%p9 bra BB37_18;
ld.shared.f64 %fd33, [%r4+4096];
min.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r4], %fd44;
-BB19_18:
+BB37_18:
bar.sync 0;
-BB19_19:
+BB37_19:
setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB19_23;
+ @%p10 bra BB37_23;
setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB19_22;
+ @%p11 bra BB37_22;
ld.shared.f64 %fd34, [%r4+2048];
min.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r4], %fd44;
-BB19_22:
+BB37_22:
bar.sync 0;
-BB19_23:
+BB37_23:
setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB19_27;
+ @%p12 bra BB37_27;
setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB19_26;
+ @%p13 bra BB37_26;
ld.shared.f64 %fd35, [%r4+1024];
min.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r4], %fd44;
-BB19_26:
+BB37_26:
bar.sync 0;
-BB19_27:
+BB37_27:
setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB19_31;
+ @%p14 bra BB37_31;
setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB19_30;
+ @%p15 bra BB37_30;
ld.shared.f64 %fd36, [%r4+512];
min.f64 %fd44, %fd44, %fd36;
st.shared.f64 [%r4], %fd44;
-BB19_30:
+BB37_30:
bar.sync 0;
-BB19_31:
+BB37_31:
setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB19_44;
+ @%p16 bra BB37_44;
setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB19_34;
+ @%p17 bra BB37_34;
ld.volatile.shared.f64 %fd37, [%r4+256];
min.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r4], %fd44;
-BB19_34:
+BB37_34:
setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB19_36;
+ @%p18 bra BB37_36;
ld.volatile.shared.f64 %fd38, [%r4+128];
min.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r4], %fd44;
-BB19_36:
+BB37_36:
setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB19_38;
+ @%p19 bra BB37_38;
ld.volatile.shared.f64 %fd39, [%r4+64];
min.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r4], %fd44;
-BB19_38:
+BB37_38:
setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB19_40;
+ @%p20 bra BB37_40;
ld.volatile.shared.f64 %fd40, [%r4+32];
min.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r4], %fd44;
-BB19_40:
+BB37_40:
setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB19_42;
+ @%p21 bra BB37_42;
ld.volatile.shared.f64 %fd41, [%r4+16];
min.f64 %fd44, %fd44, %fd41;
st.volatile.shared.f64 [%r4], %fd44;
-BB19_42:
+BB37_42:
setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB19_44;
+ @%p22 bra BB37_44;
ld.volatile.shared.f64 %fd42, [%r4+8];
min.f64 %fd43, %fd44, %fd42;
st.volatile.shared.f64 [%r4], %fd43;
-BB19_44:
+BB37_44:
setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB19_48;
+ @%p23 bra BB37_48;
ld.shared.f64 %fd28, [memory];
ld.local.u64 %rd114, [%rd2+96];
@@ -1583,17 +3112,17 @@ BB19_44:
ld.local.u64 %rd122, [%rd2+88];
and.b64 %rd115, %rd122, 1;
setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB19_47;
- bra.uni BB19_46;
+ @!%p24 bra BB37_47;
+ bra.uni BB37_46;
-BB19_46:
+BB37_46:
ld.local.u64 %rd116, [%rd11];
add.s64 %rd117, %rd122, %rd116;
ld.u64 %rd122, [%rd117+-1];
-BB19_47:
+BB37_47:
mov.u32 %r42, 0;
- // Callseq Start 9
+ // Callseq Start 19
{
.reg .b32 temp_param_reg;
// <end>}
@@ -1604,7 +3133,7 @@ BB19_47:
.param .b32 param2;
st.param.b32 [param2+0], %r7;
.param .b64 retval0;
-	prototype_9 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
+	prototype_19 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
call (retval0),
%rd122,
(
@@ -1612,14 +3141,14 @@ BB19_47:
param1,
param2
)
- , prototype_9;
+ , prototype_19;
ld.param.b64 %rd119, [retval0+0];
//{
- }// Callseq End 9
+ }// Callseq End 19
st.f64 [%rd119], %fd28;
-BB19_48:
+BB37_48:
ret;
}
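
A note for readers skimming the generated PTX above: the reduce_{sum,max,min}_{f,d} entry points all follow the same CUDA block-reduction pattern and differ only in the folding instruction (add/max/min), the element width (f32 vs f64, hence the "shl.b32 %r23, %r6, 2" versus "..., 3" shared-memory indexing) and the identity value loaded into %f44/%fd44 (0 for sum, -inf = 0dFFF0000000000000 for max, +inf = 0d7FF0000000000000 for min). Below is a minimal hand-written sketch of that pattern -- not the actual generated code, which is templated over MatrixAccessor<T> and reads its values through the function pointers stored in the local depot; all names here are illustrative. It assumes blockDim.x is a power of two with 64 <= blockDim.x <= 1024 and blockDim.x * sizeof(double) bytes of dynamic shared memory.

	struct MaxOp {
		__device__ static double exec(double a, double b) { return fmax(a, b); }
		// identity = -inf, the 0dFFF0000000000000 loaded into %fd44 above
		__device__ static double identity() { return -HUGE_VAL; }
	};

	// launch: reduce<MaxOp><<<NB, NT, NT * sizeof(double)>>>(d_in, d_partials, n);
	template<typename ReduceOp>
	__global__ void reduce(const double* in, double* out, unsigned int n) {
		extern __shared__ double sdata[];
		unsigned int tid = threadIdx.x;
		// each block consumes 2 * blockDim.x elements per pass (shl.b32 %r8, %r7, 1)
		unsigned int i = blockIdx.x * blockDim.x * 2 + tid;
		double v = ReduceOp::identity();
		while (i < n) {                              // grid-stride loop, BB*_8..BB*_14
			v = ReduceOp::exec(v, in[i]);
			if (i + blockDim.x < n)
				v = ReduceOp::exec(v, in[i + blockDim.x]);
			i += blockDim.x * gridDim.x * 2;
		}
		sdata[tid] = v;
		__syncthreads();
		// barrier-synchronized tree: the unrolled 1024/512/256/128 stages (BB*_15..31)
		for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
			if (tid < s)
				sdata[tid] = ReduceOp::exec(sdata[tid], sdata[tid + s]);
			__syncthreads();
		}
		// warp-synchronous tail: volatile accesses instead of barriers (BB*_31..44)
		if (tid < 32) {
			volatile double* vs = sdata;
			for (unsigned int s = 32; s > 0; s >>= 1)
				if (blockDim.x >= 2 * s)
					vs[tid] = ReduceOp::exec(vs[tid], vs[tid + s]);
		}
		if (tid == 0)                                // BB*_44..48: thread 0 writes the block result
			out[blockIdx.x] = sdata[0];
	}

A full aggregation runs such a kernel twice, once over the input and once over the per-block partials, which matches the second launch over dp.out in SpoofCellwiseFullAgg further down. Newer code would use __shfl_down_sync() for the warp tail; the volatile form is the legacy pattern this PTX was compiled from.
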
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 29dc46b..6f449e9 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -29,7 +29,7 @@ using sec = std::chrono::duration<double, std::ratio<1>>;
size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resource_path) {
-#ifdef _DEBUG
+#ifndef NDEBUG
std::cout << "initializing cuda device " << device_id << std::endl;
#endif
std::string cuda_include_path;
@@ -90,7 +90,7 @@ void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id) {
}
int SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::string &src) {
-#ifdef _DEBUG
+#ifndef NDEBUG
//	std::cout << "---=== START source listing of spoof cuda kernel [ " << name << " ]: " << std::endl;
// uint32_t line_num = 0;
// std::istringstream src_stream(src);
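
A side note on the recurring guard change in this and the following launcher files: the old #ifdef _DEBUG only fires under MSVC, which defines _DEBUG for debug-CRT builds (/MTd, /MDd), so the diagnostic output never compiled in on Linux. #ifndef NDEBUG keys off the standard release macro instead -- CMake adds -DNDEBUG to Release builds on every toolchain -- so debug builds of both the .so and the .dll now print. A small hand-written illustration of the two guards:

	#include <iostream>

	void trace_example() {
	#ifdef _DEBUG          // old: defined only by the MSVC debug runtime
		std::cout << "visible in Windows debug builds only" << std::endl;
	#endif
	#ifndef NDEBUG         // new: NDEBUG is absent unless building Release
		std::cout << "visible in debug builds with MSVC, gcc and clang" << std::endl;
	#endif
	}
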
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index ab0f098..7f74337 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -25,10 +25,7 @@
#define NOMINMAX
#endif
-#ifndef NDEBUG
- #define _DEBUG
-#endif
-//#ifdef _DEBUG
+//#ifndef NDEBUG
// #define JITIFY_PRINT_ALL 1
//#endif
@@ -88,7 +85,7 @@ public:
				CHECK_CUDART(cudaMemcpy(output.row_ptr, input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t), cudaMemcpyDeviceToDevice));
}
-#ifdef _DEBUG
+#ifndef NDEBUG
			std::cout << "output rows: " << output.rows << " cols: " << output.cols << " nnz: " << output.nnz << " format: " <<
					(output.row_ptr == nullptr ? "dense" : "sparse") << std::endl;
#endif
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index f1735eb..85449a2 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -36,7 +36,7 @@ struct SpoofCellwiseFullAgg {
dim3 grid(NB, 1, 1);
dim3 block(NT, 1, 1);
uint32_t shared_mem_size = NT * sizeof(T);
-#ifdef _DEBUG
+#ifndef NDEBUG
// ToDo: connect output to SystemDS logging facilities
		std::cout << "launching spoof cellwise kernel " << op_name << " with "
				<< NT * NB << " threads in " << NB << " blocks and "
@@ -46,7 +46,7 @@ struct SpoofCellwiseFullAgg {
<< std::endl;
#endif
CHECK_CUDA(op->program.get()->kernel(op_name)
-					.instantiate(type_of(value_type), std::max(1ul, sides.size()))
+					.instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
					.configure(grid, block, shared_mem_size)
					.launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
@@ -56,7 +56,7 @@ struct SpoofCellwiseFullAgg {
void* args[3] = { &dp.out, &dp.out, &N};
NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
-#ifdef _DEBUG
+#ifndef NDEBUG
			std::cout << " launching spoof cellwise kernel " << op_name << " with "
					<< NT * NB << " threads in " << NB << " blocks and " << shared_mem_size
@@ -83,14 +83,14 @@ struct SpoofCellwiseRowAgg {
dim3 grid(NB, 1, 1);
dim3 block(NT, 1, 1);
uint32_t shared_mem_size = NT * sizeof(T);
-#ifdef _DEBUG
+#ifndef NDEBUG
		std::cout << " launching spoof cellwise kernel " << op_name << " with "
				<< NT * NB << " threads in " << NB << " blocks and "
				<< shared_mem_size << " bytes of shared memory for row aggregation of "
				<< N << " elements" << std::endl;
#endif
CHECK_CUDA(op->program->kernel(op_name)
-					.instantiate(type_of(value_type), std::max(1ul, sides.size()))
+					.instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
					.configure(grid, block, shared_mem_size)
					.launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
@@ -110,13 +110,13 @@ struct SpoofCellwiseColAgg {
dim3 grid(NB,1, 1);
dim3 block(NT,1, 1);
uint32_t shared_mem_size = 0;
-#ifdef _DEBUG
+#ifndef NDEBUG
		std::cout << " launching spoof cellwise kernel " << op_name << " with "
				<< NT * NB << " threads in " << NB << " blocks for column aggregation of "
				<< N << " elements" << std::endl;
#endif
CHECK_CUDA(op->program->kernel(op_name)
-					.instantiate(type_of(value_type), std::max(1ul, sides.size()))
+					.instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
					.configure(grid, block, shared_mem_size)
					.launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
@@ -141,7 +141,7 @@ struct SpoofCellwiseNoAgg {
dim3 block(NT, 1, 1);
uint32_t shared_mem_size = 0;
-#ifdef _DEBUG
+#ifndef NDEBUG
if(sparse_input) {
				std::cout << "launching sparse spoof cellwise kernel " << op_name << " with " << NT * NB
						<< " threads in " << NB << " blocks without aggregation for " << N << " elements"
@@ -155,7 +155,7 @@ struct SpoofCellwiseNoAgg {
#endif
CHECK_CUDA(op->program->kernel(op_name)
-					.instantiate(type_of(value_type), std::max(1ul, sides.size()))
+					.instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
					.configure(grid, block, shared_mem_size)
					.launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
}
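
The other recurring change in this file, std::max(1ul, sides.size()) -> std::max(static_cast<size_t>(1), sides.size()), is the second platform fix: std::max is templated on a single type, and on 64-bit Windows (LLP64) unsigned long is 32 bits while size_t -- the type of sides.size() -- is 64 bits, so template argument deduction fails; on Linux x86_64 (LP64) both are 64-bit and 1ul happens to compile. Casting the literal to size_t deduces std::max<size_t> on both platforms. A minimal hand-written illustration (variable names are hypothetical):

	#include <algorithm>
	#include <cstddef>
	#include <vector>

	int main() {
		std::vector<double*> sides(3);  // stand-in for the real sides vector
		// std::max(1ul, sides.size()); // OK on Linux x86_64, fails to deduce on
		//                              // Windows x64: unsigned long is 32 bit there
		std::size_t n = std::max(static_cast<std::size_t>(1), sides.size());  // portable
		return static_cast<int>(n);
	}
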
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h b/src/main/cuda/spoof-launcher/SpoofOperator.h
index f9fc5ee..0ccc633 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -74,7 +74,7 @@ struct DevMatPtrs {
T* scalars{};
~DevMatPtrs() {
-#ifdef _DEBUG
+#ifndef NDEBUG
std::cout << "~DevMatPtrs() before cudaFree:\n";
int i = 0;
for (auto& p : ptrs) {
@@ -89,7 +89,7 @@ struct DevMatPtrs {
p = nullptr;
}
}
-#ifdef _DEBUG
+#ifndef NDEBUG
std::cout << "~DevMatPtrs() after cudaFree:\n";
i = 0;
for (auto& p : ptrs) {
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index fb919b7..1295314 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -43,7 +43,7 @@ struct SpoofRowwise {
if(op->num_temp_vectors > 0) {
			tmp_len = std::max(input.front().cols, op->const_dim2 < 0 ? 0 : static_cast<uint32_t>(op->const_dim2));
			temp_buf_size = op->num_temp_vectors * tmp_len * input.front().rows * sizeof(T);
-#ifdef _DEBUG
+#ifndef NDEBUG
			std::cout << "num_temp_vect: " << op->num_temp_vectors << " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
#endif
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
@@ -54,7 +54,7 @@ struct SpoofRowwise {
if(sparse_input)
op_name = std::string(op->name + "_SPARSE");
-#ifdef _DEBUG
+#ifndef NDEBUG
// ToDo: connect output to SystemDS logging facilities
		std::cout << "launching spoof rowwise kernel " << op_name << " with " << NT * input.front().rows << " threads in "
				<< input.front().rows << " blocks and " << shared_mem_size << " bytes of shared memory for "
@@ -62,7 +62,7 @@ struct SpoofRowwise {
				<< temp_buf_size / 1024 << " kb of temp buffer in global memory." << std::endl;
#endif
CHECK_CUDA(op->program->kernel(op_name)
-					.instantiate(type_of(value_type), std::max(1ul, sides.size()), op->num_temp_vectors, tmp_len)
+					.instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()), op->num_temp_vectors, tmp_len)
					.configure(grid, block, shared_mem_size)
					.launch(dp.in, dp.sides, dp.out, dp.scalars, d_temp, grix));