This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 6df1ea3c13613e122b4dad5742d2f6db4513f572
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Apr 14 17:13:53 2021 +0200

    [SYSTEMDS-2930] Remove function pointers from SPOOF CUDA's MatrixAccessor 
class
    
    The use of function pointers to abstract dense and sparse matrices is a 
major performance bottleneck. Relying on a conditional if sparse/dense for now.
    
    Also sets values per thread to 1 in cellwise template (better performance).
---
 src/main/cuda/headers/Matrix.h                     | 85 +++++-----------------
 src/main/cuda/spoof-launcher/SpoofCellwise.h       |  2 +-
 .../apache/sysds/hops/codegen/cplan/CNodeCell.java |  2 +-
 3 files changed, 20 insertions(+), 69 deletions(-)

diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
index 808590b..755e764 100644
--- a/src/main/cuda/headers/Matrix.h
+++ b/src/main/cuda/headers/Matrix.h
@@ -41,7 +41,6 @@ struct Matrix {
                        col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), 
data(reinterpret_cast<T*>(jvals[5])) {}
 };
 
-//#ifdef __CUDACC_RTC__
 #ifdef __CUDACC__
 
 template<typename T>
@@ -65,91 +64,48 @@ template<typename T>
 class MatrixAccessor {
        
        Matrix<T>* _mat;
-       
-       // Member function pointers
-       uint32_t (MatrixAccessor::*_len)();
-       uint32_t (MatrixAccessor::*_row_len)(uint32_t);
-       uint32_t (MatrixAccessor::*_pos)(uint32_t);
-       uint32_t* (MatrixAccessor::*_col_idxs)(uint32_t);
-       
-       T& (MatrixAccessor::*_val_i)(uint32_t);
-       T& (MatrixAccessor::*_val_rc)(uint32_t, uint32_t);
-       T* (MatrixAccessor::*_vals)(uint32_t);
-       void (MatrixAccessor::*_set)(uint32_t, uint32_t, T);
 
 public:
        MatrixAccessor() = default;
-
-       __device__ MatrixAccessor(Matrix<T>* mat) { init(mat); }
-
-       __device__ void init(Matrix<T>* mat) {
-               _mat = mat;
-               
-               if (_mat->row_ptr == nullptr) {
-                       _len = &MatrixAccessor::len_dense;
-                       _pos = &MatrixAccessor::pos_dense;
-                       _col_idxs = &MatrixAccessor::cols_dense;
-                       _val_rc = &MatrixAccessor::val_dense_rc;
-                       _vals = &MatrixAccessor::vals_dense;
-                       _row_len = &MatrixAccessor::row_len_dense;
-                       _val_i = &MatrixAccessor::val_dense_i;
-               } else {
-                       _len = &MatrixAccessor::len_sparse;
-                       _pos = &MatrixAccessor::pos_sparse;
-                       _col_idxs = &MatrixAccessor::cols_sparse;
-                       _val_rc = &MatrixAccessor::val_sparse_rc;
-                       _vals = &MatrixAccessor::vals_sparse;
-                       _row_len = &MatrixAccessor::row_len_sparse;
-                       _val_i = &MatrixAccessor::val_sparse_i;
-                       _set = &MatrixAccessor::set_sparse;
-               }
-       }
+       
+       __device__ MatrixAccessor(Matrix<T>* mat) : _mat(mat) {}
+       
+       __device__ void init(Matrix<T>* mat) { _mat = mat; }
        
        __device__ uint32_t& nnz() { return _mat->nnz; }
        __device__ uint32_t cols() { return _mat->cols; }
        __device__ uint32_t rows() { return _mat->rows; }
        
-       __device__ uint32_t len() { return (this->*_len)(); }
+       __device__ uint32_t len() { return _mat->data == nullptr ? len_sparse() 
: len_dense(); }
        
        __device__ uint32_t pos(uint32_t rix) {
-               return (this->*_pos)(rix);
+               return _mat->data == nullptr ? pos_sparse(rix) : pos_dense(rix);
        }
        
        __device__ T& val(uint32_t r, uint32_t c) {
-               return (this->*_val_rc)(r, c);
+               return _mat->data == nullptr ? val_sparse_rc(r, c) : 
val_dense_rc(r,c);
        }
        
        __device__ T& val(uint32_t i) {
-               return (this->*_val_i)(i);
+               return _mat->data == nullptr ? val_sparse_i(i) : val_dense_i(i);
        }
-
+       __device__ T& operator[](uint32_t i) { return val(i); }
+       
        __device__ T* vals(uint32_t rix) {
-               return (this->*_vals)(rix);
+               return _mat->data == nullptr ? vals_sparse(rix) : 
vals_dense(rix);
        }
        
-    __device__ T& operator[](uint32_t i) {
-               return (this->*_val_i)(i);
-    }
-       
        __device__ uint32_t row_len(uint32_t rix) {
-               return (this->*_row_len)(rix);
+               return _mat->data == nullptr ? row_len_sparse(rix) : 
row_len_dense(rix);
        }
        
-       __device__ uint32_t* col_idxs(uint32_t rix) {
-               return (this->*_col_idxs)(rix);
-       }
+       __device__ uint32_t* col_idxs(uint32_t rix) { return cols_sparse(rix); }
 
-       __device__ void set(uint32_t r, uint32_t c, T v) {
-               (this->*_set)(r,c,v);
-       }
+       __device__ void set(uint32_t r, uint32_t c, T v) { set_sparse(r,c,v); }
        
-       __device__ uint32_t* indexes() {
-               return _mat->row_ptr;
-       }
+       __device__ uint32_t* indexes() {  return _mat->row_ptr; }
        
-       __device__ bool hasData() {
-               return _mat->data != nullptr;
-       }
+       __device__ bool hasData() { return _mat->data != nullptr; }
 private:
        __device__ uint32_t len_dense() {
                return _mat->rows * _mat->cols;
@@ -159,11 +115,6 @@ private:
                return _mat->cols * rix;
        }
        
-       __device__ uint32_t* cols_dense(uint32_t rix) {
-               printf("ERROR: no column indices array in a dense matrix! This 
will likely crash :-/\n");
-               return nullptr;
-       }
-       
        __device__ T& val_dense_rc(uint32_t r, uint32_t c) {
                return _mat->data[_mat->cols * r + c];
        }
@@ -240,8 +191,8 @@ struct Vector {
        __device__ T* vals(uint32_t idx) { return &data[idx]; }
 
        __device__ T& operator[](uint32_t idx) {
-           return data[idx];
-    }
+               return data[idx];
+       }
        
        __device__ void print(const char* name, uint32_t end_ = 0, uint32_t 
start = 0, uint32_t bID = 0, uint32_t tID = 0) {
                if(blockIdx.x == bID && threadIdx.x==tID) {
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h 
b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index 85449a2..fe7e9d6 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -133,7 +133,7 @@ struct SpoofCellwiseNoAgg {
                
                // num ctas
                // ToDo: adaptive VT
-               const uint32_t VT = 4;
+               const uint32_t VT = 1;
                uint32_t NB = std::ceil((N + NT * VT - 1) / (NT * VT));
                if(sparse_input)
                        NB = input.front().rows;
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index 070fc9e..4e79fea 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -158,7 +158,7 @@ public class CNodeCell extends CNodeTpl
                if(api == GeneratorAPI.CUDA) {
                        // ToDo: initial_value is misused to pass VT (values 
per thread) to no_agg operator
                        String agg_op = "IdentityOp";
-                       String initial_value = "(T)4.0";
+                       String initial_value = "(T)1.0";
                        if(_aggOp != null)
                        switch(_aggOp) {
                                case SUM:

Reply via email to