[systemds] branch master updated: [SYSTEMDS-2888] Fix incomplete cbind support in codegen row templates (CUDA)

markd Mon, 12 Apr 2021 14:43:43 -0700

This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new 769f0e3  [SYSTEMDS-2888] Fix incomplete cbind support in codegen row templates (CUDA)
769f0e3 is described below

commit 769f0e3db1646acdb7212bbe0c2275d69013a2c2
Author: Mark Dokter <[email protected]>
AuthorDate: Mon Apr 12 23:06:32 2021 +0200

    [SYSTEMDS-2888] Fix incomplete cbind support in codegen row templates (CUDA)
    
    This patch is the CUDA version of the original bugfix (commit 1ec292a932c6e732bbac835a81cdb59371002114)
---
 src/main/cuda/headers/spoof_utils.cuh              | 28 ++++++++++++------
 .../sysds/hops/codegen/cplan/cuda/Binary.java      | 33 ++++++++++------------
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/src/main/cuda/headers/spoof_utils.cuh 
b/src/main/cuda/headers/spoof_utils.cuh
index 271f0c2..9bcaef5 100644
--- a/src/main/cuda/headers/spoof_utils.cuh
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -337,18 +337,30 @@ __device__ void vectDivAdd(T* a, T b, T* c, int ai, int 
ci, int len) {
 }
 
 template<typename T>
+__device__ Vector<T>& vectCbindWrite(T* a, T* b, uint32_t ai, uint32_t bi, 
uint32_t alen, uint32_t blen, TempStorage<T>* fop) {
+       Vector<T>& c = fop->getTempStorage(alen+blen);
+       auto i = threadIdx.x;
+       while(i < alen) {
+               c[i] = a[ai + i];
+               i+=gridDim.x;
+       }
+       while(i < blen) {
+               c[alen + i] = b[bi + i];
+       }
+       return c;
+}
+
+template<typename T>
 __device__ Vector<T>& vectCbindWrite(T* a, T b, uint32_t ai, uint32_t len, 
TempStorage<T>* fop) {
 
        Vector<T>& c = fop->getTempStorage(len+1);
-
-       if(threadIdx.x < len) {
-//              if(blockIdx.x==1 && threadIdx.x ==0)
-//                     printf("vecCbindWrite: bid=%d, tid=%d, ai=%d, len=%d, 
a[%d]=%f\n", blockIdx.x, threadIdx.x, ai, len, ai * len + threadIdx.x, a[ai * 
len + threadIdx.x]);
-               c[threadIdx.x] = a[ai + threadIdx.x];
+       auto i = threadIdx.x;
+       while(i < len) {
+               c[i] = a[ai + i];
+               i += gridDim.x;
        }
-       if(threadIdx.x == len) {
-//             printf("---> block %d thread %d, b=%f,, len=%d, 
a[%d]=%f\n",blockIdx.x, threadIdx.x, b, len, ai, a[ai]);
-        c[threadIdx.x] = b;
+       if(i == len) {
+        c[i] = b;
        }
        return c;
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
index 7d9655f..6d826b1 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
@@ -30,6 +30,19 @@ public class Binary extends CodeTemplate
        public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, 
boolean sparseRhs,
                boolean scalarVector, boolean scalarInput, boolean vectorVector)
        {
+               if(type == CNodeBinary.BinType.VECT_CBIND) {
+                       if(scalarInput)
+                               return "\t\tVector<T>& %TMP% = 
vectCbindWrite(%IN1%, %IN2%, this);\n";
+                       else if (!vectorVector)
+                               return sparseLhs ? 
+                                       "\t\tVector<T>& %TMP% = 
vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%, this);\n" :
+                                       "\t\tVector<T>& %TMP% = 
vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%, this);\n";
+                       else //vect/vect
+                               return sparseLhs ?
+                                       "\t\tVector<T>& %TMP% = 
vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN1%, %LEN2%, 
this);\n" :
+                                       "\t\tVector<T>& %TMP% = 
vectCbindWrite(%IN1%, %IN2%, %POS1%, %POS2%, %LEN1%, %LEN2%, this);\n";
+               }
+               
                if(isSinglePrecision()) {
                        switch(type) {
                                case DOT_PRODUCT:
@@ -84,15 +97,7 @@ public class Binary extends CodeTemplate
                                        else
                                                return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, 
%POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName 
+ "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
                                }
-
-                               case VECT_CBIND:
-                                       if(scalarInput)
-                                               return "                
Vector<T>& %TMP% = vectCbindWrite(%IN1%, %IN2%, this);\n";
-                                       else
-//                                             return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, 
%LEN%);\n" : "       T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, 
%IN2%, %POS1%, %LEN%);\n";
-                                               return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, 
%LEN%);\n" : "               Vector<T>& %TMP% = vectCbindWrite(%IN1%, %IN2%, 
%POS1%, %LEN%, this);\n";
-
-                                       //vector-vector operations
+                               //vector-vector operations
                                case VECT_MULT:
                                case VECT_DIV:
                                case VECT_MINUS:
@@ -222,16 +227,8 @@ public class Binary extends CodeTemplate
 //                                             return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, 
%POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName 
+ "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
                                                return sparseLhs ? "            
Vector<T>& %TMP% = vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, 
alen, %LEN%, this);\n" : "          Vector<T>& %TMP% = vect" + vectName + 
"Write(%IN1%, %IN2%, static_cast<uint32_t>(%POS1%), %LEN%, this);\n";
                                }
-
-                               case VECT_CBIND:
-                                       if(scalarInput)
-                                               return "                
Vector<T>& %TMP% = vectCbindWrite(%IN1%, %IN2%, this);\n";
-                                       else
-//                                             return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, 
%LEN%);\n" : "       T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, 
%IN2%, %POS1%, %LEN%);\n";
-//                                             return sparseLhs ? "    T[] 
%TMP% = vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : "      
    T* %TMP% = vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
-                                               return sparseLhs ? "    T[] 
%TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, 
%LEN%);\n" : "               Vector<T>& %TMP% = vectCbindWrite(%IN1%, %IN2%, 
%POS1%, %LEN%, this);\n";
                                        
-                                       //vector-vector operations
+                               //vector-vector operations
                                case VECT_MULT:
                                case VECT_DIV:
                                case VECT_MINUS:

[systemds] branch master updated: [SYSTEMDS-2888] Fix incomplete cbind support in codegen row templates (CUDA)

Reply via email to