[SYSTEMML-2261] Performance sparse tsmm (CSR with empty/dense rows)

This patch significantly improves performance for sparse tsmm over
special sparse matrices in CSR format, where all rows are either empty
or completely dense. The trick is that we can simply take the CSR values
array, count the number of dense rows, and use it as a "dense" block for
existing dense tsmm operations. This greatly improves performance
because existing dense operations usually perform much close at peak
performance due to better cache-conscious implementations and avoiding
unnecessary gather and scatter operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba06d053
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba06d053
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba06d053

Branch: refs/heads/master
Commit: ba06d0534ba07a03ee1a9ed15c2034043b28c928
Parents: 2b8161d
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Fri Apr 20 00:02:12 2018 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Fri Apr 20 00:02:12 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 65 +++++++++++-----
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 11 ++-
 ...llMatrixMultiplicationTransposeSelfTest.java | 80 ++++++--------------
 3 files changed, 76 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 3a2c58e..622f45c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -26,6 +26,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
+import java.util.stream.IntStream;
 
 import org.apache.commons.math3.util.FastMath;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -337,7 +338,6 @@ public class LibMatrixMult
                //Timing time = new Timing(true);
                
                //pre-processing
-               m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
                ret.sparse = false;
                ret.allocateDenseBlock();
 
@@ -370,7 +370,6 @@ public class LibMatrixMult
                //Timing time = new Timing(true);
                
                //pre-processing (no need to check isThreadSafe)
-               m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
                ret.sparse = false;
                ret.allocateDenseBlock();
        
@@ -1839,28 +1838,36 @@ public class LibMatrixMult
                SparseBlock a = m1.sparseBlock;
                DenseBlock c = ret.getDenseBlock();
                int m = m1.rlen;
-
+               
                if( leftTranspose ) // t(X)%*%X 
                {
                        //only general case (because vectors always dense)
                        //algorithm: scan rows, foreach row self join (KIJ)
-                       if( LOW_LEVEL_OPTIMIZATION )
-                       {
-                               int arlen = a.numRows();
+                       if( LOW_LEVEL_OPTIMIZATION ) {
+                               final int n = m1.clen;
+                               final int arlen = a.numRows();
                                for( int r=0; r<arlen; r++ ) {
                                        if( a.isEmpty(r) ) continue;
-                                       int apos = a.pos(r);
                                        int alen = a.size(r);
-                                       int[] aix = a.indexes(r);
                                        double[] avals = a.values(r);
-                                       int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
-                                       rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                       int len = apos + alen;
-                                       for(int i = rlix; i < len && aix[i]<ru; 
i++) {
-                                               double val = avals[i];
-                                               if( val != 0 )
-                                                       vectMultiplyAdd(val, 
avals, c.values(aix[i]),
-                                                               aix, i, 
c.pos(aix[i]), len-i);
+                                       if( alen == n ) { //dense row
+                                               for( int i=rl; i<ru; i++ ) {
+                                                       
vectMultiplyAdd(avals[i], avals,
+                                                               c.values(i), i, 
c.pos(i), n-i);
+                                               }
+                                       }
+                                       else { //non-full sparse row
+                                               int apos = a.pos(r);
+                                               int[] aix = a.indexes(r);
+                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
+                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
+                                               int len = apos + alen;
+                                               for(int i = rlix; i < len && 
aix[i]<ru; i++) {
+                                                       double val = avals[i];
+                                                       if( val != 0 )
+                                                               
vectMultiplyAdd(val, avals, c.values(aix[i]),
+                                                                       aix, i, 
c.pos(aix[i]), len-i);
+                                               }
                                        }
                                }
                        }
@@ -3666,16 +3673,34 @@ public class LibMatrixMult
                return nnz;
        }
 
-       private static MatrixBlock prepMatrixMultTransposeSelfInput( 
MatrixBlock m1, boolean leftTranspose ) {
+       public static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock 
m1, boolean leftTranspose, boolean par ) {
                MatrixBlock ret = m1;
+               final int rlen = m1.rlen;
+               final int clen = m1.clen;
                
-               if( !leftTranspose && m1.sparse && m1.rlen > 1) //X%*%t(X) 
SPARSE MATRIX
-               {       
+               if( !leftTranspose && m1.sparse && rlen > 1) { //X%*%t(X) 
SPARSE MATRIX
                        //directly via LibMatrixReorg in order to prevent 
sparsity change
-                       MatrixBlock tmpBlock = new MatrixBlock(m1.clen, 
m1.rlen, m1.sparse);
+                       MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, 
m1.sparse);
                        LibMatrixReorg.reorg(m1, tmpBlock, new 
ReorgOperator(SwapIndex.getSwapIndexFnObject()));
                        ret = tmpBlock;
                }
+               else if( leftTranspose && m1.sparse && m1.sparseBlock 
instanceof SparseBlockCSR ) {
+                       //for a special case of CSR inputs where all non-empty 
rows are dense, we can
+                       //create a shallow copy of the values arrays to a 
"dense" block and perform
+                       //tsmm with the existing dense block operations w/o 
unnecessary gather/scatter
+                       SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock;
+                       boolean convertDense = (par ?
+                               IntStream.range(0, rlen).parallel() : 
IntStream.range(0, rlen))
+                               .allMatch(i -> sblock.isEmpty(i) || 
sblock.size(i)==clen );
+                       if( convertDense ) {
+                               int rows = (int) sblock.size() / clen;
+                               MatrixBlock tmpBlock = new MatrixBlock(rows, 
clen, false);
+                               tmpBlock.denseBlock = DenseBlockFactory
+                                       .createDenseBlock(sblock.values(), 
rows, clen);
+                               tmpBlock.setNonZeros(m1.nonZeros);
+                               ret = tmpBlock;
+                       }
+               }
                
                return ret;
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 97f883b..f738227 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3433,13 +3433,18 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                else
                        out.reset(dim, dim, false);
                
+               //pre=processing (outside LibMatrixMult for seamless integration
+               //with native BLAS library, e.g., for sparse-dense conversion)
+               MatrixBlock m1 = LibMatrixMult
+                       .prepMatrixMultTransposeSelfInput(this, leftTranspose, 
k > 1);
+               
                //compute matrix mult
                if( NativeHelper.isNativeLibraryLoaded() )
-                       LibMatrixNative.tsmm(this, out, leftTranspose, k);
+                       LibMatrixNative.tsmm(m1, out, leftTranspose, k);
                else if( k > 1 )
-                       LibMatrixMult.matrixMultTransposeSelf(this, out, 
leftTranspose, k);
+                       LibMatrixMult.matrixMultTransposeSelf(m1, out, 
leftTranspose, k);
                else
-                       LibMatrixMult.matrixMultTransposeSelf(this, out, 
leftTranspose);
+                       LibMatrixMult.matrixMultTransposeSelf(m1, out, 
leftTranspose);
                
                return out;
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
index db0d0e0..5ae480e 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
@@ -35,7 +35,6 @@ import org.apache.sysml.test.utils.TestUtils;
 
 public class FullMatrixMultiplicationTransposeSelfTest extends 
AutomatedTestBase 
 {
-
        private final static String TEST_NAME1 = 
"TransposeSelfMatrixMultiplication1";
        private final static String TEST_NAME2 = 
"TransposeSelfMatrixMultiplication2";
        private final static String TEST_DIR = 
"functions/binary/matrix_full_other/";
@@ -46,7 +45,7 @@ public class FullMatrixMultiplicationTransposeSelfTest 
extends AutomatedTestBase
        private final static int rows1 = 3500;
        private final static int cols1 = 1500;
        //for MR
-       private final static int rows2 = 7000;//7000;  
+       private final static int rows2 = 7000;//7000;
        private final static int cols2 = 750;//750; 
        
        private final static double sparsity1 = 0.7;
@@ -70,121 +69,97 @@ public class FullMatrixMultiplicationTransposeSelfTest 
extends AutomatedTestBase
        }
 
        @BeforeClass
-       public static void init()
-       {
+       public static void init() {
                TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
        }
 
        @AfterClass
-       public static void cleanUp()
-       {
+       public static void cleanUp() {
                if (TEST_CACHE_ENABLED) {
                        TestUtils.clearDirectory(TEST_DATA_DIR + 
TEST_CLASS_DIR);
                }
        }
 
        @Test
-       public void testMMLeftDenseCP() 
-       {
+       public void testMMLeftDenseCP() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, 
ExecType.CP, false);
        }
        
        @Test
-       public void testMMRightDenseCP() 
-       {
+       public void testMMRightDenseCP() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, 
ExecType.CP, false);
        }
 
        @Test
-       public void testMMLeftSparseCP() 
-       {
+       public void testMMLeftSparseCP() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, 
ExecType.CP, true);
        }
        
        @Test
-       public void testMMRightSparseCP() 
-       {
+       public void testMMRightSparseCP() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, 
ExecType.CP, true);
        }
        
        @Test
-       public void testMMLeftDenseMR() 
-       {
+       public void testMMLeftDenseMR() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, 
ExecType.MR, false);
        }
        
        @Test
-       public void testMMRightDenseMR() 
-       {
+       public void testMMRightDenseMR() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, 
ExecType.MR, false);
        }
 
        @Test
-       public void testMMLeftSparseMR() 
-       {
+       public void testMMLeftSparseMR() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, 
ExecType.MR, true);
        }
        
        @Test
-       public void testMMRightSparseMR() 
-       {
+       public void testMMRightSparseMR() {
                runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, 
ExecType.MR, true);
        }       
        
        @Test
-       public void testVVLeftDenseCP() 
-       {
+       public void testVVLeftDenseCP() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, 
ExecType.CP, false);
        }
        
        @Test
-       public void testVVRightDenseCP() 
-       {
+       public void testVVRightDenseCP() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, 
ExecType.CP, false);
        }
 
        @Test
-       public void testVVLeftSparseCP() 
-       {
+       public void testVVLeftSparseCP() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, 
ExecType.CP, true);
        }
        
        @Test
-       public void testVVRightSparseCP() 
-       {
+       public void testVVRightSparseCP() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, 
ExecType.CP, true);
        }
        
        @Test
-       public void testVVLeftDenseMR() 
-       {
+       public void testVVLeftDenseMR() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, 
ExecType.MR, false);
        }
        
        @Test
-       public void testVVRightDenseMR() 
-       {
+       public void testVVRightDenseMR() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, 
ExecType.MR, false);
        }
 
        @Test
-       public void testVVLeftSparseMR() 
-       {
+       public void testVVLeftSparseMR() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, 
ExecType.MR, true);
        }
        
        @Test
-       public void testVVRightSparseMR() 
-       {
+       public void testVVRightSparseMR() {
                runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, 
ExecType.MR, true);
        }
 
-       /**
-        * 
-        * @param type
-        * @param instType
-        * @param sparse
-        */
        private void runTransposeSelfMatrixMultiplicationTest( MMTSJType type, 
ExecType instType, boolean sparse )
        {
                //setup exec type, rows, cols
@@ -251,18 +226,11 @@ public class FullMatrixMultiplicationTransposeSelfTest 
extends AutomatedTestBase
                        HashMap<CellIndex, Double> rfile  = 
readRMatrixFromFS("B");
                        TestUtils.compareMatrices(dmlfile, rfile, eps, 
"Stat-DML", "Stat-R");
                }
-               finally
-               {
+               finally {
                        rtplatform = platformOld;
                }
        }
        
-       /**
-        * 
-        * @param type
-        * @param instType
-        * @param sparse
-        */
        private void runTransposeSelfVectorMultiplicationTest( MMTSJType type, 
ExecType instType, boolean sparse )
        {
                //setup exec type, rows, cols
@@ -329,10 +297,8 @@ public class FullMatrixMultiplicationTransposeSelfTest 
extends AutomatedTestBase
                        HashMap<CellIndex, Double> rfile  = 
readRMatrixFromFS("B");
                        TestUtils.compareMatrices(dmlfile, rfile, eps, 
"Stat-DML", "Stat-R");
                }
-               finally
-               {
+               finally {
                        rtplatform = platformOld;
                }
-       }       
-       
-}
\ No newline at end of file
+       }
+}

Reply via email to