This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 14d095efe5bc98d120bc0dd34270c3f12747b3cc
Author: Mark Dokter <[email protected]>
AuthorDate: Thu Apr 28 14:06:53 2022 +0200

    [SYSTEMDS-3362] CUDA code gen stream synchronization (bugfix)
    
    The CUDA code generation launcher currently handles streams per operator. This is wrong, since a read-before-write hazard can occur on a given device allocation. Switching to a central stream object for now.
    
    Closes #1600
---
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h |  9 ++++++--
 src/main/cuda/spoof-launcher/SpoofCellwise.h    | 30 ++++++++++++-------------
 src/main/cuda/spoof-launcher/SpoofOperator.h    |  6 ++---
 src/main/cuda/spoof-launcher/SpoofRowwise.h     |  6 ++---
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index e4b80c5e40..c902c38382 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -55,9 +55,14 @@ public:
        size_t current_mem_size = 0; // the actual staging buffer size (should 
be default unless there was a resize)
        std::byte* staging_buffer{}; // pinned host mem for async transfers
        std::byte* device_buffer{};  // this buffer holds the pointers to the 
data buffers
+       cudaStream_t stream{};
 
        explicit SpoofCUDAContext(const char* resource_path_, 
std::vector<std::string>  include_paths_) : reductions(nullptr),
-                       resource_path(resource_path_), 
include_paths(std::move(include_paths_)) { }
+                       resource_path(resource_path_), 
include_paths(std::move(include_paths_)) {
+                           CHECK_CUDART(cudaStreamCreate(&stream));
+            }
+
+    virtual ~SpoofCUDAContext() { CHECK_CUDART(cudaStreamDestroy(stream)); }
 
        static size_t initialize_cuda(uint32_t device_id, const char* 
resource_path_);
 
@@ -70,7 +75,7 @@ public:
 
                DataBufferWrapper dbw(staging_buffer, device_buffer);
                SpoofOperator* op = compiled_ops[dbw.op_id()].get();
-               dbw.toDevice(op->stream);
+               dbw.toDevice(stream);
 
                CALL::exec(this, op, &dbw);
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h 
b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index 9077840020..68b176b6f2 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,7 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
        
-       static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw) {
+       static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
                T value_type;
                
                // num ctas
@@ -46,7 +46,7 @@ struct SpoofCellwiseFullAgg {
 #endif
                CHECK_CUDA(op->program.get()->kernel(op_name)
                                                   
.instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), 
dbw->num_sides()))
-                                                  .configure(grid, block, 
shared_mem_size, op->stream)
+                                                  .configure(grid, block, 
shared_mem_size, ctx->stream)
                                                   .launch(dbw->d_in<T>(0), 
dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
                if(NB > 1) {
@@ -64,7 +64,7 @@ struct SpoofCellwiseFullAgg {
                     << N << " elements"
                     << std::endl;
 #endif
-                               CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 
1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
+                               CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 
1, NT, 1, 1, shared_mem_size, ctx->stream, args, nullptr));
                                N = NB;
                        }
                }
@@ -74,7 +74,7 @@ struct SpoofCellwiseFullAgg {
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw) {
+       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
                T value_type;
                
                // num ctas
@@ -90,7 +90,7 @@ struct SpoofCellwiseRowAgg {
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
                                                   
.instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), 
dbw->num_sides()))
-                                                  .configure(grid, block, 
shared_mem_size, op->stream)
+                                                  .configure(grid, block, 
shared_mem_size, ctx->stream)
                                                   .launch(dbw->d_in<T>(0), 
dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
        }
@@ -99,7 +99,7 @@ struct SpoofCellwiseRowAgg {
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-       static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw) {
+       static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
                T value_type;
                
                // num ctas
@@ -115,7 +115,7 @@ struct SpoofCellwiseColAgg {
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
                                                   
.instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), 
dbw->num_sides()))
-                                                  .configure(grid, block, 
shared_mem_size, op->stream)
+                                                  .configure(grid, block, 
shared_mem_size, ctx->stream)
                                                   .launch(dbw->d_in<T>(0), 
dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
        }
@@ -124,7 +124,7 @@ struct SpoofCellwiseColAgg {
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw) {
+       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
                T value_type;
                bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
                
@@ -155,16 +155,16 @@ struct SpoofCellwiseNoAgg {
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
                                                   
.instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), 
dbw->num_sides()))
-                                                  .configure(grid, block, 
shared_mem_size, op->stream)
+                                                  .configure(grid, block, 
shared_mem_size, ctx->stream)
                                                   .launch(dbw->d_in<T>(0), 
dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 
                // copy over row indices from input to output if appropriate
                if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
                        // src/dst information (pointer address) is stored in 
*host* buffer!
                        CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, 
dbw->h_in<T>(0)->row_ptr,
-                               (dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), 
cudaMemcpyDeviceToDevice, op->stream));
+                               (dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), 
cudaMemcpyDeviceToDevice, ctx->stream));
                        CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, 
dbw->h_in<T>(0)->col_idx,
-                                                                               
 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, 
op->stream));
+                                                                               
 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, 
ctx->stream));
                }
        }
 };
@@ -186,16 +186,16 @@ struct SpoofCellwise {
                switch(op->agg_type) {
                        case SpoofOperator::AggType::FULL_AGG:
                                op->agg_kernel = ctx->template 
getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-                               SpoofCellwiseFullAgg<T>::exec(op, NT, N, 
op_name, dbw);
+                               SpoofCellwiseFullAgg<T>::exec(op, NT, N, 
op_name, dbw, ctx);
                                break;
                        case SpoofOperator::AggType::ROW_AGG:
-                               SpoofCellwiseRowAgg<T>::exec(op, NT, N, 
op_name, dbw);
+                               SpoofCellwiseRowAgg<T>::exec(op, NT, N, 
op_name, dbw, ctx);
                                break;
                        case SpoofOperator::AggType::COL_AGG:
-                               SpoofCellwiseColAgg<T>::exec(op, NT, N, 
op_name, dbw);
+                               SpoofCellwiseColAgg<T>::exec(op, NT, N, 
op_name, dbw, ctx);
                                break;
                        case SpoofOperator::AggType::NO_AGG:
-                               SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, 
dbw);
+                               SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, 
dbw, ctx);
                                break;
                        default:
                                throw std::runtime_error("unknown cellwise agg 
type" + std::to_string(static_cast<int>(op->agg_type)));
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h 
b/src/main/cuda/spoof-launcher/SpoofOperator.h
index f256e817db..045dcfdb80 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -42,10 +42,8 @@ struct SpoofOperator {
        
        [[nodiscard]] virtual bool isSparseSafe() const = 0;
 
-       cudaStream_t stream{};
-       
-       SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
-       virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
+       SpoofOperator() = default;
+       virtual ~SpoofOperator() = default;
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h 
b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 01ec5206aa..a9a656fbb7 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -39,7 +39,7 @@ struct SpoofRowwise {
                        if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
                                out_num_elements = dbw->h_out<T>()->nnz;
                //ToDo: only memset output when there is an output operation 
that *adds* to the buffer
-               CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, 
out_num_elements * sizeof(T), op->stream));
+               CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, 
out_num_elements * sizeof(T), ctx->stream));
 
                //ToDo: handle this in JVM
                uint32_t tmp_len = 0;
@@ -52,7 +52,7 @@ struct SpoofRowwise {
                        std::cout << "num_temp_vect: " << op->num_temp_vectors 
<< " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
                        
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-                       CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, 
op->stream));
+                       CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, 
ctx->stream));
                }
 
                std::string op_name(op->name + "_DENSE");
@@ -68,7 +68,7 @@ struct SpoofRowwise {
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
                                                   
.instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1), 
dbw->num_sides()), op->num_temp_vectors, tmp_len)
-                                                  .configure(grid, block, 
shared_mem_size, op->stream)
+                                                  .configure(grid, block, 
shared_mem_size, ctx->stream)
                                                   .launch(dbw->d_in<T>(0), 
dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
                
                if(op->num_temp_vectors > 0)

Reply via email to