[SYSTEMML-2261] Performance sparse tsmm (CSR with empty/dense rows) This patch significantly improves performance for sparse tsmm over special sparse matrices in CSR format, where all rows are either empty or completely dense. The trick is that we can simply take the CSR values array, count the number of dense rows, and use it as a "dense" block for existing dense tsmm operations. This greatly improves performance because existing dense operations usually perform much close at peak performance due to better cache-conscious implementations and avoiding unnecessary gather and scatter operations.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba06d053 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba06d053 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba06d053 Branch: refs/heads/master Commit: ba06d0534ba07a03ee1a9ed15c2034043b28c928 Parents: 2b8161d Author: Matthias Boehm <mboe...@gmail.com> Authored: Fri Apr 20 00:02:12 2018 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Fri Apr 20 00:02:12 2018 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixMult.java | 65 +++++++++++----- .../sysml/runtime/matrix/data/MatrixBlock.java | 11 ++- ...llMatrixMultiplicationTransposeSelfTest.java | 80 ++++++-------------- 3 files changed, 76 insertions(+), 80 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 3a2c58e..622f45c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -26,6 +26,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import java.util.stream.IntStream; import org.apache.commons.math3.util.FastMath; import org.apache.sysml.hops.OptimizerUtils; @@ -337,7 +338,6 @@ public class LibMatrixMult //Timing time = new Timing(true); //pre-processing - m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose); ret.sparse = false; ret.allocateDenseBlock(); @@ -370,7 +370,6 @@ public class LibMatrixMult //Timing time = new Timing(true); //pre-processing (no need to check isThreadSafe) - m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose); ret.sparse = false; ret.allocateDenseBlock(); @@ -1839,28 +1838,36 @@ public class LibMatrixMult SparseBlock a = m1.sparseBlock; DenseBlock c = ret.getDenseBlock(); int m = m1.rlen; - + if( leftTranspose ) // t(X)%*%X { //only general case (because vectors always dense) //algorithm: scan rows, foreach row self join (KIJ) - if( LOW_LEVEL_OPTIMIZATION ) - { - int arlen = a.numRows(); + if( LOW_LEVEL_OPTIMIZATION ) { + final int n = m1.clen; + final int arlen = a.numRows(); for( int r=0; r<arlen; r++ ) { if( a.isEmpty(r) ) continue; - int apos = a.pos(r); int alen = a.size(r); - int[] aix = a.indexes(r); double[] avals = a.values(r); - int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl); - rlix = (rlix>=0) ? apos+rlix : apos+alen; - int len = apos + alen; - for(int i = rlix; i < len && aix[i]<ru; i++) { - double val = avals[i]; - if( val != 0 ) - vectMultiplyAdd(val, avals, c.values(aix[i]), - aix, i, c.pos(aix[i]), len-i); + if( alen == n ) { //dense row + for( int i=rl; i<ru; i++ ) { + vectMultiplyAdd(avals[i], avals, + c.values(i), i, c.pos(i), n-i); + } + } + else { //non-full sparse row + int apos = a.pos(r); + int[] aix = a.indexes(r); + int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl); + rlix = (rlix>=0) ? apos+rlix : apos+alen; + int len = apos + alen; + for(int i = rlix; i < len && aix[i]<ru; i++) { + double val = avals[i]; + if( val != 0 ) + vectMultiplyAdd(val, avals, c.values(aix[i]), + aix, i, c.pos(aix[i]), len-i); + } } } } @@ -3666,16 +3673,34 @@ public class LibMatrixMult return nnz; } - private static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose ) { + public static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose, boolean par ) { MatrixBlock ret = m1; + final int rlen = m1.rlen; + final int clen = m1.clen; - if( !leftTranspose && m1.sparse && m1.rlen > 1) //X%*%t(X) SPARSE MATRIX - { + if( !leftTranspose && m1.sparse && rlen > 1) { //X%*%t(X) SPARSE MATRIX //directly via LibMatrixReorg in order to prevent sparsity change - MatrixBlock tmpBlock = new MatrixBlock(m1.clen, m1.rlen, m1.sparse); + MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse); LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject())); ret = tmpBlock; } + else if( leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR ) { + //for a special case of CSR inputs where all non-empty rows are dense, we can + //create a shallow copy of the values arrays to a "dense" block and perform + //tsmm with the existing dense block operations w/o unnecessary gather/scatter + SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock; + boolean convertDense = (par ? + IntStream.range(0, rlen).parallel() : IntStream.range(0, rlen)) + .allMatch(i -> sblock.isEmpty(i) || sblock.size(i)==clen ); + if( convertDense ) { + int rows = (int) sblock.size() / clen; + MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false); + tmpBlock.denseBlock = DenseBlockFactory + .createDenseBlock(sblock.values(), rows, clen); + tmpBlock.setNonZeros(m1.nonZeros); + ret = tmpBlock; + } + } return ret; } http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index 97f883b..f738227 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -3433,13 +3433,18 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab else out.reset(dim, dim, false); + //pre=processing (outside LibMatrixMult for seamless integration + //with native BLAS library, e.g., for sparse-dense conversion) + MatrixBlock m1 = LibMatrixMult + .prepMatrixMultTransposeSelfInput(this, leftTranspose, k > 1); + //compute matrix mult if( NativeHelper.isNativeLibraryLoaded() ) - LibMatrixNative.tsmm(this, out, leftTranspose, k); + LibMatrixNative.tsmm(m1, out, leftTranspose, k); else if( k > 1 ) - LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose, k); + LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose, k); else - LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose); + LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose); return out; } http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java index db0d0e0..5ae480e 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java @@ -35,7 +35,6 @@ import org.apache.sysml.test.utils.TestUtils; public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase { - private final static String TEST_NAME1 = "TransposeSelfMatrixMultiplication1"; private final static String TEST_NAME2 = "TransposeSelfMatrixMultiplication2"; private final static String TEST_DIR = "functions/binary/matrix_full_other/"; @@ -46,7 +45,7 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase private final static int rows1 = 3500; private final static int cols1 = 1500; //for MR - private final static int rows2 = 7000;//7000; + private final static int rows2 = 7000;//7000; private final static int cols2 = 750;//750; private final static double sparsity1 = 0.7; @@ -70,121 +69,97 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase } @BeforeClass - public static void init() - { + public static void init() { TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR); } @AfterClass - public static void cleanUp() - { + public static void cleanUp() { if (TEST_CACHE_ENABLED) { TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR); } } @Test - public void testMMLeftDenseCP() - { + public void testMMLeftDenseCP() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.CP, false); } @Test - public void testMMRightDenseCP() - { + public void testMMRightDenseCP() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, false); } @Test - public void testMMLeftSparseCP() - { + public void testMMLeftSparseCP() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.CP, true); } @Test - public void testMMRightSparseCP() - { + public void testMMRightSparseCP() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, true); } @Test - public void testMMLeftDenseMR() - { + public void testMMLeftDenseMR() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.MR, false); } @Test - public void testMMRightDenseMR() - { + public void testMMRightDenseMR() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, false); } @Test - public void testMMLeftSparseMR() - { + public void testMMLeftSparseMR() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.MR, true); } @Test - public void testMMRightSparseMR() - { + public void testMMRightSparseMR() { runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, true); } @Test - public void testVVLeftDenseCP() - { + public void testVVLeftDenseCP() { runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.CP, false); } @Test - public void testVVRightDenseCP() - { + public void testVVRightDenseCP() { runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, false); } @Test - public void testVVLeftSparseCP() - { + public void testVVLeftSparseCP() { runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.CP, true); } @Test - public void testVVRightSparseCP() - { + public void testVVRightSparseCP() { runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, true); } @Test - public void testVVLeftDenseMR() - { + public void testVVLeftDenseMR() { runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.MR, false); } @Test - public void testVVRightDenseMR() - { + public void testVVRightDenseMR() { runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, false); } @Test - public void testVVLeftSparseMR() - { + public void testVVLeftSparseMR() { runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.MR, true); } @Test - public void testVVRightSparseMR() - { + public void testVVRightSparseMR() { runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, true); } - /** - * - * @param type - * @param instType - * @param sparse - */ private void runTransposeSelfMatrixMultiplicationTest( MMTSJType type, ExecType instType, boolean sparse ) { //setup exec type, rows, cols @@ -251,18 +226,11 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase HashMap<CellIndex, Double> rfile = readRMatrixFromFS("B"); TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R"); } - finally - { + finally { rtplatform = platformOld; } } - /** - * - * @param type - * @param instType - * @param sparse - */ private void runTransposeSelfVectorMultiplicationTest( MMTSJType type, ExecType instType, boolean sparse ) { //setup exec type, rows, cols @@ -329,10 +297,8 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase HashMap<CellIndex, Double> rfile = readRMatrixFromFS("B"); TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R"); } - finally - { + finally { rtplatform = platformOld; } - } - -} \ No newline at end of file + } +}