This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 8bff270e02 [MINOR] Sparse MM NonZero fix
8bff270e02 is described below

commit 8bff270e02264459cd396d195fea0176555d713e
Author: Sebastian Baunsgaard <[email protected]>
AuthorDate: Mon Oct 16 15:33:03 2023 +0200

    [MINOR] Sparse MM NonZero fix
    
    This commit fixes a bug in the sparse matrix multiply that counts the
    non-zero incorrectly. then fixes it via reallocation as dense while
    the output should have been sparse. The bug happens because nnz was
    counted twice because some edge case kernels call quickSet which increments
    the output matrices nnz while some other cases do not.
    
    Furthermore, this commit adds a specialized kernel for sparse-ultra-sparse
    matrix multiplication that improves the runtime of our decision tree from 
100
    to 30 sec and out new matrix multiplication tests from 67 to 31 sec.
    
    Also included is sparse handling of Binary cell operations and a small
    optimization to Commons math Eigen to skip an indirect call.
    
    Closes #1922
---
 .../sysds/runtime/matrix/data/LibCommonsMath.java  |   2 +-
 .../runtime/matrix/data/LibMatrixBincell.java      | 263 ++++++++++++++-------
 .../matrix/data/LibMatrixDenseToSparse.java        | 187 +++++++++++++++
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 243 +++++++++++++++----
 .../sysds/runtime/matrix/data/MatrixBlock.java     | 175 ++++++--------
 src/test/java/org/apache/sysds/test/TestUtils.java |   2 +-
 .../test/component/matrix/MatrixMultiplyTest.java  |   4 +-
 .../component/matrix/MatrixToSparseOrDense.java    | 105 ++++++++
 8 files changed, 747 insertions(+), 234 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibCommonsMath.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibCommonsMath.java
index c71838cd01..cc399bac79 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibCommonsMath.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibCommonsMath.java
@@ -384,7 +384,7 @@ public class LibCommonsMath
                        T.setValue(i, i, alpha.getValue(0, 0));
                }
 
-               MatrixBlock[] e = multiReturnOperations(T, "eigen");
+               MatrixBlock[] e = computeEigen(T);
                TV.setNonZeros((long) m*m);
                e[1] = TV.aggregateBinaryOperations(TV, e[1], op_mul_agg);
                return e;
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
index ca3ecf583d..e53f09a7f4 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
@@ -249,7 +249,7 @@ public class LibMatrixBincell {
        public static void bincellOp(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, BinaryOperator op, int k) {
                BinaryAccessType atype = getBinaryAccessType(m1, m2);
                
-               //fallback to sequential computation for specialized operations
+               // fallback to sequential computation for specialized operations
                if( m1.isEmpty() || m2.isEmpty()
                        || ret.getLength() < PAR_NUMCELL_THRESHOLD2
                        || ((op.sparseSafe || isSparseSafeDivide(op, m2))
@@ -666,94 +666,141 @@ public class LibMatrixBincell {
                return ret.getNonZeros();
        }
 
-       private static long safeBinaryMVDense(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, BinaryOperator op, int rl, int ru) {
-               boolean isMultiply = (op.fn instanceof Multiply);
-               boolean skipEmpty = (isMultiply);
-               BinaryAccessType atype = getBinaryAccessType(m1, m2);
-               int clen = m1.clen;
-               
-               //early abort on skip and empy
-               if( skipEmpty && (m1.isEmptyBlock(false) || 
m2.isEmptyBlock(false) ) )
+       private static long safeBinaryMVDense(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, BinaryOperator op, int rl,
+               int ru) {
+               final boolean isMultiply = (op.fn instanceof Multiply);
+               final boolean skipEmpty = (isMultiply);
+
+               // early abort on skip and empy
+               if(skipEmpty && (m1.isEmptyBlock(false) || 
m2.isEmptyBlock(false)))
                        return 0; // skip entire empty block
-               
-               //guard for postponed allocation in single-threaded exec
-               if( !ret.isAllocated() )
+
+               // guard for postponed allocation in single-threaded exec
+               if(!ret.isAllocated())
                        ret.allocateDenseBlock();
-               
-               DenseBlock da = m1.getDenseBlock();
-               DenseBlock dc = ret.getDenseBlock();
+
+               final BinaryAccessType atype = getBinaryAccessType(m1, m2);
+
+               if(atype == BinaryAccessType.MATRIX_COL_VECTOR)
+                       return safeBinaryMVDenseColVector(m1, m2, ret, op, rl, 
ru);
+               else // if( atype == BinaryAccessType.MATRIX_ROW_VECTOR )
+                       return safeBinaryMVDenseRowVector(m1, m2, ret, op, rl, 
ru);
+       }
+
+       private static long safeBinaryMVDenseColVector(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, BinaryOperator op,
+               int rl, int ru) {
+               final boolean multiply = (op.fn instanceof Multiply);
+               final int clen = m1.clen;
+
+               final DenseBlock da = m1.getDenseBlock();
+               if(da.values(0) == null)
+                       throw new RuntimeException("Invalid input with empty 
input");
+               final DenseBlock dc = ret.getDenseBlock();
                long nnz = 0;
-               
-               if( atype == BinaryAccessType.MATRIX_COL_VECTOR )
-               {
-                       double[] b = m2.getDenseBlockValues(); // always single 
block
-                       
-                       for( int i=rl; i<ru; i++ ) {
-                               double[] a = da.values(i);
-                               double[] c = dc.values(i);
-                               int ix = da.pos(i);
-                               
-                               //replicate vector value
-                               double v2 = (b==null) ? 0 : b[i];
-                               if( skipEmpty && v2 == 0 ) //skip empty rows
+               final double[] b = m2.getDenseBlockValues(); // always single 
block
+
+               if(b == null) {
+                       if(multiply)
+                               return 0;
+                       else {
+                               for(int i = rl; i < ru; i++) {
+                                       final double[] a = da.values(i);
+                                       final double[] c = dc.values(i);
+                                       final int ix = da.pos(i);
+                                       // GENERAL CASE
+                                       for(int j = 0; j < clen; j++) {
+                                               double val = op.fn.execute(a[ix 
+ j], 0);
+                                               nnz += ((c[ix + j] = val) != 0) 
? 1 : 0;
+                                       }
+                               }
+                       }
+               }
+               else if(multiply){
+                       for(int i = rl; i < ru; i++) {
+                               final double[] a = da.values(i);
+                               final double[] c = dc.values(i);
+                               final int ix = da.pos(i);
+
+                               // replicate vector value
+                               double v2 = b[i];
+                               if(v2 == 0) // skip empty rows
                                        continue;
-                                       
-                               if( isMultiply && v2 == 1 ) { //ROW COPY
-                                       //a guaranteed to be non-null (see 
early abort)
+                               else if(v2 == 1) { // ROW COPY
+                                       // a guaranteed to be non-null (see 
early abort)
                                        System.arraycopy(a, ix, c, ix, clen);
-                                       nnz += m1.recomputeNonZeros(i, i, 0, 
clen-1);
+                                       nnz += m1.recomputeNonZeros(i, i, 0, 
clen - 1);
                                }
-                               else { //GENERAL CASE
-                                       if( a != null )
-                                               for( int j=0; j<clen; j++ ) {
-                                                       double val = 
op.fn.execute( a[ix+j], v2 );
-                                                       nnz += ((c[ix+j] = val) 
!= 0) ? 1 : 0;
-                                               }
-                                       else {
-                                               double val = op.fn.execute( 0, 
v2 );
-                                               Arrays.fill(c, ix, ix+clen, 
val);
-                                               nnz += (val != 0) ? clen : 0;
+                               else {
+                                       // GENERAL CASE
+                                       for(int j = 0; j < clen; j++) {
+                                               double val = op.fn.execute(a[ix 
+ j], v2);
+                                               nnz += ((c[ix + j] = val) != 0) 
? 1 : 0;
                                        }
                                }
+
                        }
                }
-               else if( atype == BinaryAccessType.MATRIX_ROW_VECTOR )
-               {
-                       double[] b = m2.getDenseBlockValues(); // always single 
block
-                       
-                       if( da==null && b==null ) { //both empty
-                               double val = op.fn.execute( 0, 0 );
-                               dc.set(rl, ru, 0, clen, val);
-                               nnz += (val != 0) ? (long)(ru-rl)*clen : 0;
-                       }
-                       else if( da==null ) //left empty
-                       {
-                               //compute first row
-                               double[] c = dc.values(rl);
-                               for( int j=0; j<clen; j++ ) {
-                                       double val = op.fn.execute( 0, b[j] );
-                                       nnz += ((c[j]=val) != 0) ? (ru-rl) : 0;
+               else{
+                       for(int i = rl; i < ru; i++) {
+                               final double[] a = da.values(i);
+                               final double[] c = dc.values(i);
+                               final int ix = da.pos(i);
+
+                               // replicate vector value
+                               double v2 = b[i];
+
+                               // GENERAL CASE
+                               for(int j = 0; j < clen; j++) {
+                                       double val = op.fn.execute(a[ix + j], 
v2);
+                                       nnz += ((c[ix + j] = val) != 0) ? 1 : 0;
                                }
-                               //copy first to all other rows
-                               for( int i=rl+1; i<ru; i++ )
-                                       dc.set(i, c);
+
                        }
-                       else //default case (incl right empty) 
-                       {
-                               for( int i=rl; i<ru; i++ ) {
-                                       double[] a = da.values(i);
-                                       double[] c = dc.values(i);
-                                       int ix = da.pos(i);
-                                       for( int j=0; j<clen; j++ ) {
-                                               double val = op.fn.execute( 
a[ix+j], ((b!=null) ? b[j] : 0) );
-                                               nnz += ((c[ix+j]=val) != 0) ? 1 
: 0;
-                                       }
+               }
+               return nnz;
+       }
+
+       private static long safeBinaryMVDenseRowVector(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, BinaryOperator op,
+               int rl, int ru) {
+               final int clen = m1.clen;
+
+               final DenseBlock da = m1.getDenseBlock();
+               final DenseBlock dc = ret.getDenseBlock();
+               long nnz = 0;
+               final double[] b = m2.getDenseBlockValues(); // always single 
block
+
+               if(da == null && b == null) { // both empty
+                       double val = op.fn.execute(0, 0);
+                       dc.set(rl, ru, 0, clen, val);
+                       nnz += (val != 0) ? (long) (ru - rl) * clen : 0;
+               }
+               else if(da == null) // left empty
+               {
+                       // compute first row
+                       double[] c = dc.values(rl);
+                       for(int j = 0; j < clen; j++) {
+                               double val = op.fn.execute(0, b[j]);
+                               nnz += ((c[j] = val) != 0) ? (ru - rl) : 0;
+                       }
+                       // copy first to all other rows
+                       for(int i = rl + 1; i < ru; i++)
+                               dc.set(i, c);
+               }
+               else // default case (incl right empty)
+               {
+                       for(int i = rl; i < ru; i++) {
+                               double[] a = da.values(i);
+                               double[] c = dc.values(i);
+                               int ix = da.pos(i);
+                               for(int j = 0; j < clen; j++) {
+                                       double val = op.fn.execute(a[ix + j], 
((b != null) ? b[j] : 0));
+                                       nnz += ((c[ix + j] = val) != 0) ? 1 : 0;
                                }
                        }
                }
-               
                return nnz;
        }
+       
 
        private static void safeBinaryMVSparseDenseRow(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, BinaryOperator op) {
                boolean isMultiply = (op.fn instanceof Multiply);
@@ -885,22 +932,75 @@ public class LibMatrixBincell {
                //no need to recomputeNonZeros since maintained in append value
        }
        
-       private static void fillZeroValues(BinaryOperator op, double v2, 
MatrixBlock ret, boolean skipEmpty, int rpos, int cpos, int len) {
+       private static final void fillZeroValues(BinaryOperator op, double v2, 
MatrixBlock ret, boolean skipEmpty, int rpos, int cpos, int len) {
                if(skipEmpty)
                        return;
-               for( int k=cpos; k<len; k++ ){
-                       double v = op.fn.execute(0, v2);
-                       ret.appendValue(rpos, k, v);
+               
+               final double v = op.fn.execute(0, v2);
+               if(v != 0){
+                       for( int k=cpos; k<len; k++ )
+                               // TODO change this to not do append but 
directly allocate the filled sparse row.
+                               ret.appendValue(rpos, k, v);
                }
        }
        
-       private static void fillZeroValues(BinaryOperator op, MatrixBlock m2, 
MatrixBlock ret, boolean skipEmpty, int rpos, int cpos, int len) {
+       private static void fillZeroValues(BinaryOperator op, MatrixBlock m2, 
MatrixBlock ret, boolean skipEmpty, int rpos,
+               int cpos, int len) {
                if(skipEmpty)
                        return;
-               for( int k=cpos; k<len; k++ ){
-                       double v2 = m2.quickGetValue(0, k);
-                       double v = op.fn.execute(0, v2);
-                       ret.appendValue(rpos, k, v);
+
+               final double zero =  op.fn.execute(0.0, 0.0);
+               final boolean zeroIsZero = zero == 0.0;
+               if(m2.isEmpty()){
+                               if(!zeroIsZero){
+                                       while(cpos < len)
+                                               // TODO change this to a fill 
operation.
+                                               ret.appendValue(rpos, cpos++, 
zero);
+                               }
+               }
+               else if(m2.isInSparseFormat()) {
+                       final SparseBlock sb = m2.getSparseBlock();
+                       if(sb.isEmpty(0)){
+                               if(!zeroIsZero){
+                                       while(cpos < len)
+                                               ret.appendValue(rpos, cpos++, 
zero);
+                               }
+                       }
+                       else{
+                               int apos = sb.pos(0);
+                               final int alen = sb.size(0) + apos;
+                               final int[] aix = sb.indexes(0);
+                               final double[] vals = sb.values(0);
+                               // skip aix pos until inside range of cpos and 
len
+                               while( apos < alen && aix[apos] < len && cpos > 
aix[apos]){
+                                       apos++;
+                               }
+                               // for each point in the sparse range
+                               for(; apos < alen && aix[apos] < len; apos++){
+                                       if(!zeroIsZero){
+                                               while(cpos < len  && cpos < 
aix[apos]){
+                                                       ret.appendValue(rpos, 
cpos++, zero);
+                                               }
+                                       }
+                                       cpos = aix[apos];
+                                       final double v = op.fn.execute(0, 
vals[apos]);
+                                       ret.appendValue(rpos, aix[apos], v);
+                                       // cpos++;
+                               }
+                               // process tail.
+                               if(!zeroIsZero) {
+                                       while(cpos < len) {
+                                               ret.appendValue(rpos, cpos++, 
zero);
+                                       }
+                               }
+                       }
+               }
+               else {
+                       final DenseBlock db = m2.getDenseBlock();
+                       final double[] vals = db.values(0);
+                       for(int k = cpos; k < len; k++){
+                               ret.appendValue(rpos, k, op.fn.execute(0, 
vals[k]));
+                       }
                }
        }
 
@@ -1596,7 +1696,6 @@ public class LibMatrixBincell {
                                }
                        }
                }
-               
                return ret.nonZeros = nnz;
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDenseToSparse.java
 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDenseToSparse.java
new file mode 100644
index 0000000000..5280aa5f9b
--- /dev/null
+++ 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDenseToSparse.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.matrix.data;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlockCSR;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.data.SparseRowVector;
+import org.apache.sysds.runtime.util.CommonThreadPool;
+import org.apache.sysds.runtime.util.UtilFunctions;
+
+public interface LibMatrixDenseToSparse {
+       public static final Log LOG = 
LogFactory.getLog(LibMatrixDenseToSparse.class.getName());
+
+       /**
+        * Convert the given matrix block to a sparse allocation.
+        * 
+        * @param r        The matrix block to modify, and return the sparse 
block in.
+        * @param allowCSR If CSR is allowed.
+        */
+       public static void denseToSparse(MatrixBlock r, boolean allowCSR) {
+               final DenseBlock a = r.getDenseBlock();
+
+               // set target representation, early abort on empty blocks
+               r.sparse = true;
+               if(a == null)
+                       return;
+
+               final int k = InfrastructureAnalyzer.getLocalParallelism();
+
+               if(k > 1 && r.getNumRows() > 1000)
+                       denseToSparseParallel(r, k, allowCSR);
+               else if(allowCSR && r.nonZeros <= Integer.MAX_VALUE)
+                       denseToSparseCSR(r);
+               else
+                       denseToSparseMCSR(r);
+
+               // cleanup dense block
+               r.denseBlock = null;
+       }
+
+       private static void denseToSparseCSR(MatrixBlock r) {
+               final DenseBlock a = r.getDenseBlock();
+               final int m = r.rlen;
+               final int n = r.clen;
+               try {
+                       // allocate target in memory-efficient CSR format
+                       int lnnz = (int) r.nonZeros;
+                       int[] rptr = new int[m + 1];
+                       int[] indexes = new int[lnnz];
+                       double[] values = new double[lnnz];
+                       for(int i = 0, pos = 0; i < m; i++) {
+                               double[] avals = a.values(i);
+                               int aix = a.pos(i);
+                               for(int j = 0; j < n; j++) {
+                                       double aval = avals[aix + j];
+                                       if(aval != 0) {
+                                               indexes[pos] = j;
+                                               values[pos] = aval;
+                                               pos++;
+                                       }
+                               }
+                               rptr[i + 1] = pos;
+                       }
+                       r.sparseBlock = new SparseBlockCSR(rptr, indexes, 
values, lnnz);
+               }
+               catch(ArrayIndexOutOfBoundsException ioobe) {
+                       r.sparse = false;
+                       // this means something was wrong with the sparse count.
+                       final long nnzBefore = r.nonZeros;
+                       final long nnzNew = r.recomputeNonZeros();
+
+                       // try again.
+                       if(nnzBefore != nnzNew)
+                               denseToSparse(r, true);
+                       else
+                               denseToSparse(r, false);
+
+               }
+       }
+
+       private static void denseToSparseMCSR(MatrixBlock r) {
+               final DenseBlock a = r.getDenseBlock();
+
+               final int m = r.rlen;
+               final int n = r.clen;
+               // remember number non zeros.
+               long nnzTemp = r.getNonZeros();
+               // fallback to less-memory efficient MCSR format,
+               // which however allows much larger sparse matrices
+               if(!r.allocateSparseRowsBlock())
+                       r.reset(); // reset if not allocated
+               SparseBlockMCSR sblock = (SparseBlockMCSR) r.sparseBlock;
+               toSparseMCSRRange(a, sblock, n, 0, m);
+               r.nonZeros = nnzTemp;
+       }
+
+       private static void toSparseMCSRRange(DenseBlock a, SparseBlockMCSR b, 
int n, int rl, int ru) {
+               for(int i = rl; i < ru; i++)
+                       toSparseMCSRRow(a, b, n, i);
+       }
+
+       private static void toSparseMCSRRow(DenseBlock a, SparseBlockMCSR b, 
int n, int i) {
+               final double[] avals = a.values(i);
+               final int aix = a.pos(i);
+               // compute nnz per row (not via recomputeNonZeros as sparse 
allocated)
+               final int lnnz = UtilFunctions.computeNnz(avals, aix, n);
+               if(lnnz <= 0)
+                       return;
+
+               // allocate sparse row and append non-zero values
+               final double[] vals = new double[lnnz];
+               final int[] idx = new int[lnnz];
+
+               for(int j = 0, o = 0; j < n; j++) {
+                       double v = avals[aix + j];
+                       if(v != 0.0) {
+                               vals[o] = v;
+                               idx[o] = j;
+                               o++;
+                       }
+               }
+               b.set(i, new SparseRowVector(vals, idx), false);
+       }
+
+       private static void denseToSparseParallel(MatrixBlock r, int k, boolean 
allowCSR) {
+               final DenseBlock a = r.getDenseBlock();
+               r.denseBlock = null;
+               r.sparseBlock = null;
+               final int m = r.rlen;
+               final int n = r.clen;
+               // remember number non zeros.
+               final long nnzTemp = r.getNonZeros();
+               r.reset(r.getNumRows(), r.getNumColumns(), nnzTemp);
+               // fallback to less-memory efficient MCSR format, for efficient 
parallel conversion.
+               r.sparseBlock = new SparseBlockMCSR(r.getNumRows());
+               r.sparse = true;
+               final SparseBlockMCSR b = (SparseBlockMCSR) r.sparseBlock;
+               final int blockSize = Math.max(250, m / k);
+               ExecutorService pool = CommonThreadPool.get(k);
+               try {
+
+                       List<Future<?>> tasks = new ArrayList<>();
+                       for(int i = 0; i < m; i += blockSize) {
+                               final int start = i;
+                               final int end = Math.min(m, i + blockSize);
+                               tasks.add(pool.submit(() -> 
toSparseMCSRRange(a, b, n, start, end)));
+                       }
+
+                       for(Future<?> t : tasks)
+                               t.get();
+               }
+               catch(Exception e) {
+                       throw new RuntimeException(e);
+               }
+               finally {
+                       pool.shutdown();
+               }
+
+               r.nonZeros = nnzTemp;
+       }
+}
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 51a26f3539..abf6621a78 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -49,6 +49,7 @@ import org.apache.sysds.runtime.data.SparseBlock.Type;
 import org.apache.sysds.runtime.data.SparseBlockCSR;
 import org.apache.sysds.runtime.data.SparseBlockFactory;
 import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.data.SparseRow;
 import org.apache.sysds.runtime.data.SparseRowScalar;
 import org.apache.sysds.runtime.functionobjects.SwapIndex;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
@@ -199,6 +200,11 @@ public class LibMatrixMult
                        ret = new MatrixBlock(m1.rlen, m2.clen, ultraSparse | 
sparse);
                else 
                        ret.reset(m1.rlen, m2.clen, ultraSparse | sparse);
+               if(ret.isInSparseFormat() && ret.getSparseBlock() instanceof 
SparseBlockMCSR){
+                       // we set the estimated number of non zeros per row to 
number of columns
+                       // to make the allocation of cells more aggressive.
+                       
((SparseBlockMCSR)ret.getSparseBlock()).setNnzEstimatePerRow(m2.clen, m2.clen);
+               }
                if(m1.denseBlock instanceof DenseBlockFP64DEDUP)
                        ret.allocateDenseBlock(true, true);
                else
@@ -260,8 +266,8 @@ public class LibMatrixMult
 
                // core multi-threaded matrix mult computation
                // (currently: always parallelization over number of rows)
+               final ExecutorService pool = CommonThreadPool.get(k);
                try {
-                       ExecutorService pool = CommonThreadPool.get(k);
                        ArrayList<MatrixMultTask> tasks = new ArrayList<>();
                        ArrayList<Integer> blklens = 
UtilFunctions.getBalancedBlockSizesDefault(num, k,
                                (pm2r || pm2c || ret.denseBlock instanceof 
DenseBlockFP64DEDUP));
@@ -269,25 +275,34 @@ public class LibMatrixMult
                        for(int i = 0, lb = 0; i < blklens.size(); lb += 
blklens.get(i), i++)
                                tasks.add(new MatrixMultTask(m1, m2, ret, tm2, 
pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i), cache));
                        // execute tasks
-                       List<Future<Object>> taskret = pool.invokeAll(tasks);
-                       pool.shutdown();
+                       
                        // aggregate partial results (nnz, ret for 
vector/matrix)
-                       ret.nonZeros = 0; // reset after execute
-                       for(Future<Object> task : taskret) {
+                       // reset nonZero before execution.
+                       // nonZero count cannot be trusted since it is not 
atomic
+                       // and some of the matrix multiplication kernels call 
quick set value modifying the count.
+                       ret.nonZeros = 0; 
+                       long nnzCount = 0;
+                       for(Future<Object> task : pool.invokeAll(tasks)) {
                                if(pm2r) // guaranteed single block
                                        vectAdd((double[]) task.get(), 
ret.getDenseBlockValues(), 0, 0, ret.rlen * ret.clen);
-                               else
-                                       ret.nonZeros += (Long) task.get();
+                               else // or count non zeros of the block
+                                       nnzCount += (Long) task.get();
                        }
                        if(pm2r)
-                               ret.recomputeNonZeros();
+                               ret.recomputeNonZeros(k);
+                       else // set the non zeros to the counted values.
+                               ret.nonZeros = nnzCount;
+
+                       // post-processing (nnz maintained in parallel)
+                       ret.examSparsity();
                }
                catch(Exception ex) {
                        throw new DMLRuntimeException(ex);
                }
+               finally{
+                       pool.shutdown();
+               }
 
-               // post-processing (nnz maintained in parallel)
-               ret.examSparsity();
        }
 
        public static MatrixBlock emptyMatrixMult(MatrixBlock m1, MatrixBlock 
m2, MatrixBlock ret){
@@ -1689,7 +1704,6 @@ public class LibMatrixMult
                final boolean leftUS = m1.isUltraSparse()
                        || (m1.isUltraSparse(false) && !m2.isUltraSparse())
                        || (m1.sparse && !m2.sparse);
-               
                if( m1 == m2 ) //self-product
                        matrixMultUltraSparseSelf(m1, ret, rl, ru);
                else if( leftUS || m1Perm )
@@ -1749,14 +1763,14 @@ public class LibMatrixMult
                        }
                }
                //recompute non-zero for single-threaded
-               if( rl == 0 && ru == m1.rlen )
+               if( rl == 0 && ru == m1.rlen ){
                        ret.recomputeNonZeros();
+               }
        }
        
        private static void matrixMultUltraSparseLeft(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
                final int m  = m1.rlen;
                final int n  = m2.clen;
-               
                //left is ultra-sparse (IKJ)
                SparseBlock a = m1.sparseBlock;
                SparseBlock c = ret.sparseBlock;
@@ -1818,24 +1832,90 @@ public class LibMatrixMult
        }
        
        private static void matrixMultUltraSparseRight(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
+               if(!ret.isInSparseFormat() && 
ret.getDenseBlock().isContiguous())
+                       matrixMultUltraSparseRightDenseOut(m1, m2, ret, rl, ru);
+               else if(m1.isInSparseFormat() && ret.isInSparseFormat())
+                       matrixMultUltraSparseRightSparseMCSRLeftSparseOut(m1, 
m2, ret, rl, ru);
+               else
+                       matrixMultUltraSparseRightGeneric(m1, m2, ret, rl, ru);
+       }
+
+       private static void matrixMultUltraSparseRightDenseOut(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
+               final int cd = m1.clen;
+               final int kd = m2.clen;
+               double[] retV = ret.getDenseBlockValues();
+
+               // right is ultra-sparse (KJI)
+               final SparseBlock b = m2.sparseBlock;
+               for(int k = 0; k < cd; k++) {
+                       if(b.isEmpty(k))
+                               continue;
+                       final int bpos = b.pos(k);
+                       final int blen = b.size(k);
+                       final int[] bixs = b.indexes(k);
+                       final double[] bvals = b.values(k);
+                       for(int j = bpos; j < bpos + blen; j++) {
+                               double bval = bvals[j];
+                               int bix = bixs[j];
+                               for(int i = rl; i < ru; i++) 
+                                       retV[i *kd + bix] += bval * 
m1.quickGetValue(i, k);
+                       }
+               }
+       }
+
+       private static void 
matrixMultUltraSparseRightSparseMCSRLeftSparseOut(MatrixBlock m1, MatrixBlock 
m2, MatrixBlock ret, int rl, int ru) {
                final int cd = m1.clen;
+
+               // right is ultra-sparse (KJI)
+               final SparseBlock a = m1.sparseBlock;
+               final SparseBlock b = m2.sparseBlock;
+               final SparseBlockMCSR r = (SparseBlockMCSR) ret.sparseBlock;
                
-               //right is ultra-sparse (KJI)
-               SparseBlock b = m2.sparseBlock;
-               for(int k = 0; k < cd; k++ ) {
-                       if( b.isEmpty(k) ) continue; 
-                       int bpos = b.pos(k);
-                       int blen = b.size(k);
-                       int[] bixs = b.indexes(k);
-                       double[] bvals = b.values(k);
-                       for( int j=bpos; j<bpos+blen; j++ ) {
+               for(int k = 0; k < cd; k++) {
+                       if(b.isEmpty(k))
+                               continue;
+                       final int bpos = b.pos(k);
+                       final int blen = b.size(k);
+                       final int[] bixs = b.indexes(k);
+                       final double[] bvals = b.values(k);
+                       for(int i = rl; i < ru; i++) {
+                               if(a.isEmpty(i))
+                                       continue;
+                               final double cvald = a.get(i, k);
+                               // since the left side is sparse as well, it is 
likely that this value is zero.
+                               // therefore we reorder the loop to access the 
value here.
+                               if(cvald != 0) {
+                                       for(int j = bpos; j < bpos + blen; j++) 
{
+                                               final int bix = bixs[j];
+                                               final double bval = bvals[j];
+                                               final double cval = r.get(i, 
bix);
+                                               r.set(i, bix, cval + bval * 
cvald);
+                                       }
+                               }
+                       }
+               }
+       }
+
+       private static void matrixMultUltraSparseRightGeneric(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
+               final int cd = m1.clen;
+
+               // right is ultra-sparse (KJI)
+               final SparseBlock b = m2.sparseBlock;
+               for(int k = 0; k < cd; k++) {
+                       if(b.isEmpty(k))
+                               continue;
+                       final int bpos = b.pos(k);
+                       final int blen = b.size(k);
+                       final int[] bixs = b.indexes(k);
+                       final double[] bvals = b.values(k);
+                       for(int j = bpos; j < bpos + blen; j++) {
                                double bval = bvals[j];
                                int bix = bixs[j];
-                               for( int i=rl; i<ru; i++ ) {
-                                       double cvald = bval*m1.quickGetValue(i, 
k);
-                                       if( cvald != 0 ){
+                               for(int i = rl; i < ru; i++) {
+                                       double cvald = bval * 
m1.quickGetValue(i, k);
+                                       if(cvald != 0) {
                                                double cval = 
ret.quickGetValue(i, bix);
-                                               ret.quickSetValue(i, bix, 
cval+cvald);
+                                               ret.quickSetValue(i, bix, cval 
+ cvald);
                                        }
                                }
                        }
@@ -1908,36 +1988,101 @@ public class LibMatrixMult
 
        private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock 
mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) 
        {
-               SparseBlock a = mX.sparseBlock;
-               double[] b = mV.getDenseBlockValues();
-               double[] w = (mW!=null) ? mW.getDenseBlockValues() : null;
-               double[] c = ret.getDenseBlockValues();
-               boolean weights = (ct == ChainType.XtwXv);
-               boolean weights2 = (ct == ChainType.XtXvy);
+               final SparseBlock a = mX.sparseBlock;
+               final double[] b = mV.getDenseBlockValues();
+               final double[] w = (mW != null) ? mW.getDenseBlockValues() : 
null;
+               final double[] c = ret.getDenseBlockValues();
                
                //row-wise mmchain computation
-               for( int i=rl; i < ru; i++ ) {
-                       if( a.isEmpty(i) || (weights && w[i]==0) )
+               if(ct == ChainType.XtXvy)
+                       matrixMultChainSparseXtXvy(a, b, w, c, rl, ru);
+               else if(b != null) {
+                       if(ct == ChainType.XtwXv)
+                               matrixMultChainSparseXtwXv(a, b, w, c, rl, ru);
+                       else // XtXv
+                               matrixMultChainSparseXtXv(a, b, c, rl, ru);
+               }
+       }
+
+       private static final void matrixMultChainSparseXtXv(SparseBlock a, 
double[] b,  double[] c, int rl,
+               int ru) {
+               for(int i = rl; i < ru; i++) {
+                       if(a.isEmpty(i))
                                continue;
-                       int apos = a.pos(i);
-                       int alen = a.size(i);
-                       int[] aix = a.indexes(i);
-                       double[] avals = a.values(i);
-                       
-                       //compute 1st matrix-vector dot product
-                       double val = (b == null) ? 0 :
-                               dotProduct(avals, b, aix, apos, 0, alen);
-                       
-                       //multiply/subtract weights, if required
-                       val *= (weights) ? w[i] : 1;
-                       val -= (weights2) ? w[i] : 0;
-                       
-                       //compute 2nd matrix vector and aggregate
-                       if( val != 0 )
+                       final int apos = a.pos(i);
+                       final int alen = a.size(i);
+                       final int[] aix = a.indexes(i);
+                       final double[] avals = a.values(i);
+
+                       // compute 1st matrix-vector dot product
+                       final double val = dotProduct(avals, b, aix, apos, 0, 
alen);
+
+                       // compute 2nd matrix vector and aggregate
+                       if(val != 0)
                                vectMultiplyAdd(val, avals, c, aix, apos, 0, 
alen);
                }
        }
 
+       private static final void matrixMultChainSparseXtwXv(SparseBlock a, 
double[] b, double[] w, double[] c, int rl,
+               int ru) {
+               for(int i = rl; i < ru; i++) {
+                       if(w[i] == 0 || a.isEmpty(i))
+                               continue;
+                       final int apos = a.pos(i);
+                       final int alen = a.size(i);
+                       final int[] aix = a.indexes(i);
+                       final double[] avals = a.values(i);
+                       // compute 1st matrix-vector dot product
+                       double val = dotProduct(avals, b, aix, apos, 0, alen) * 
w[i];
+
+                       // compute 2nd matrix vector and aggregate
+                       if(val != 0)
+                               vectMultiplyAdd(val, avals, c, aix, apos, 0, 
alen);
+               }
+       }
+
+
+       private static final void matrixMultChainSparseXtXvy(SparseBlock a, 
double[] b, double[] w, double[] c, int rl,
+               int ru) {
+               if(b == null && w == null) {// early abort
+                       return;
+               }
+               else if(b == null && w != null) { // short case with empty B.
+                       for(int i = rl; i < ru; i++) {
+                               final double val = -w[i];
+                               if(val != 0 && !a.isEmpty(i)) {
+                                       final int apos = a.pos(i);
+                                       final int alen = a.size(i);
+                                       final int[] aix = a.indexes(i);
+                                       final double[] avals = a.values(i);
+                                       vectMultiplyAdd(val, avals, c, aix, 
apos, 0, alen);
+                               }
+                       }
+               }
+               else { // case XtXvy
+                       // row-wise mmchain computation
+                       for(int i = rl; i < ru; i++) {
+                               if(a.isEmpty(i))
+                                       continue;
+                               final int apos = a.pos(i);
+                               final int alen = a.size(i);
+                               final int[] aix = a.indexes(i);
+                               final double[] avals = a.values(i);
+
+                               // compute 1st matrix-vector dot product
+                               double val = dotProduct(avals, b, aix, apos, 0, 
alen);
+
+                               // multiply/subtract weights, if required
+                               if(w != null)
+                                       val -= w[i];
+
+                               // compute 2nd matrix vector and aggregate
+                               if(val != 0)
+                                       vectMultiplyAdd(val, avals, c, aix, 
apos, 0, alen);
+                       }
+               }
+       }
+
        private static void matrixMultTransposeSelfDense( MatrixBlock m1, 
MatrixBlock ret, boolean leftTranspose, int rl, int ru ) {
                //2) transpose self matrix multiply dense
                // (compute only upper-triangular matrix due to symmetry)
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index bf39bd6601..46dc10a3e4 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -124,7 +124,7 @@ import org.apache.sysds.utils.NativeHelper;
 
 
 public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>, Externalizable {
-       private static final Log LOG = 
LogFactory.getLog(MatrixBlock.class.getName());
+       protected static final Log LOG = 
LogFactory.getLog(MatrixBlock.class.getName());
 
        private static final long serialVersionUID = 7319972089143154056L;
        
@@ -459,7 +459,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
        public boolean allocateSparseRowsBlock(boolean clearNNZ) {
                //allocate block if non-existing or too small (guaranteed to be 
0-initialized)
                //but do not replace existing block even if not in default type
-               boolean reset = sparseBlock == null || 
sparseBlock.numRows()<rlen;
+               boolean reset = sparseBlock == null || sparseBlock.numRows() < 
rlen;
                if( reset ) {
                        sparseBlock = SparseBlockFactory
                                .createSparseBlock(DEFAULT_SPARSEBLOCK, rlen);
@@ -761,7 +761,8 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
        }
        
        /**
-        * Append value is only used when values are appended at the end of 
each row for the sparse representation
+        * <p>Append value is only used when values are appended at the end of 
each row for the sparse representation</p>
+        * 
         * This can only be called, when the caller knows the access pattern of 
the block
         *       
         * @param r row
@@ -966,21 +967,29 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
         * @return the mean value of all values in the matrix
         */
        public double mean() {
+               return mean(1);
+       }
+       
+       public double mean(int k ) {
                MatrixBlock out = new MatrixBlock(1, 3, false);
                LibMatrixAgg.aggregateUnaryMatrix(this, out,
-                       
InstructionUtils.parseBasicAggregateUnaryOperator("uamean", 1));
+                       
InstructionUtils.parseBasicAggregateUnaryOperator("uamean", k));
                return out.quickGetValue(0, 0);
        }
-       
+
        /**
         * Wrapper method for reduceall-min of a matrix.
         * 
         * @return the minimum value of all values in the matrix
         */
        public double min() {
+               return min(1);
+       }
+
+       public double min(int k) {
                MatrixBlock out = new MatrixBlock(1, 1, false);
                LibMatrixAgg.aggregateUnaryMatrix(this, out,
-                       
InstructionUtils.parseBasicAggregateUnaryOperator("uamin", 1));
+                       
InstructionUtils.parseBasicAggregateUnaryOperator("uamin", k));
                return out.quickGetValue(0, 0);
        }
 
@@ -989,8 +998,12 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
         *
         * @return A new MatrixBlock containing the column mins of this matrix
         */
-       public MatrixBlock colMin() {
-               AggregateUnaryOperator op = 
InstructionUtils.parseBasicAggregateUnaryOperator("uacmin", 1);
+       public final MatrixBlock colMin() {
+               return colMin(1);
+       }
+
+       public final MatrixBlock colMin(int k) {
+               AggregateUnaryOperator op = 
InstructionUtils.parseBasicAggregateUnaryOperator("uacmin", k);
                return aggregateUnaryOperations(op, null, 1000, null, true);
        }
 
@@ -999,8 +1012,12 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
         *
         * @return A new MatrixBlock containing the column mins of this matrix
         */
-       public MatrixBlock colMax() {
-               AggregateUnaryOperator op = 
InstructionUtils.parseBasicAggregateUnaryOperator("uacmax", 1);
+       public final MatrixBlock colMax() {
+               return colMax(1);
+       }
+
+       public final MatrixBlock colMax(int k) {
+               AggregateUnaryOperator op = 
InstructionUtils.parseBasicAggregateUnaryOperator("uacmax", k);
                return aggregateUnaryOperations(op, null, 1000, null, true);
        }
        
@@ -1010,9 +1027,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
         * @return the maximum value of all values in the matrix
         */
        public double max() {
-               AggregateUnaryOperator op 
=InstructionUtils.parseBasicAggregateUnaryOperator("uamax", 1);
-               MatrixBlock out = aggregateUnaryOperations(op, null, 1000, 
null, true);
-               return out.quickGetValue(0, 0);
+               return max(1).quickGetValue(0,0);
        }
 
        /**
@@ -1286,79 +1301,8 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
                denseToSparse(true);
        }
        
-       public void denseToSparse(boolean allowCSR)
-       {
-               DenseBlock a = getDenseBlock();
-               
-               //set target representation, early abort on empty blocks
-               sparse = true;
-               if( a == null )
-                       return;
-               
-               final int m = rlen;
-               final int n = clen;
-               
-               if( allowCSR && nonZeros <= Integer.MAX_VALUE ) {
-                       try{
-
-                               //allocate target in memory-efficient CSR format
-                               int lnnz = (int) nonZeros;
-                               int[] rptr = new int[m+1];
-                               int[] indexes = new int[lnnz];
-                               double[] values = new double[lnnz];
-                               for( int i=0, pos=0; i<m; i++ ) {
-                                       double[] avals = a.values(i);
-                                       int aix = a.pos(i);
-                                       for(int j=0; j<n; j++) {
-                                               double aval = avals[aix+j];
-                                               if( aval != 0 ) {
-                                                       indexes[pos] = j;
-                                                       values[pos] = aval;
-                                                       pos++;
-                                               }
-                                       }
-                                       rptr[i+1]=pos;
-                               }
-                               sparseBlock = new SparseBlockCSR(
-                                       rptr, indexes, values, lnnz);
-                       } catch(ArrayIndexOutOfBoundsException ioobe){
-                               sparse = false;
-                               long nnzBefore = nonZeros;
-                               long nnzNew = recomputeNonZeros();
-                               if(nnzBefore != nnzNew){
-                                       LOG.error("Error in dense to sparse 
because nonZeros was set incorrectly\nTrying again with correction");
-                                       denseToSparse(true);
-                               }
-                               else{
-                                       LOG.error("Failed construction of 
SparseCSR block", ioobe);
-                                       denseToSparse(false);
-                               }
-                       }
-               }
-               else {
-                       // remember number non zeros.
-                       long nnzTemp = getNonZeros();
-                       //fallback to less-memory efficient MCSR format,
-                       //which however allows much larger sparse matrices
-                       if( !allocateSparseRowsBlock() )
-                               reset(); //reset if not allocated
-                       SparseBlock sblock = sparseBlock;
-                       for( int i=0; i<m; i++ ) {
-                               double[] avals = a.values(i);
-                               int aix = a.pos(i);
-                               //compute nnz per row (not via 
recomputeNonZeros as sparse allocated)
-                               int lnnz = UtilFunctions.computeNnz(avals, aix, 
clen);
-                               if( lnnz <= 0 ) continue;
-                               //allocate sparse row and append non-zero values
-                               sblock.allocate(i, lnnz);
-                               for( int j=0; j<n; j++ )
-                                       sblock.append(i, j, avals[aix+j]);
-                       }
-                       nonZeros = nnzTemp;
-               }
-               
-               //update nnz and cleanup dense block
-               denseBlock = null;
+       public void denseToSparse(boolean allowCSR){
+               LibMatrixDenseToSparse.denseToSparse(this, allowCSR);
        }
        
        public void sparseToDense() {
@@ -1447,7 +1391,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
 
                        }
                        catch(Exception e) {
-                               LOG.warn("Failed Parallel non zero count 
fallback to singlethread");
+                               // LOG.warn("Failed Parallel non zero count 
fallback to singlethread");
                                return recomputeNonZeros();
                        }
                        finally {
@@ -3025,6 +2969,9 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
                return ret;
        }
 
+       public final MatrixBlock unaryOperations(UnaryOperator op){
+               return unaryOperations(op, null);
+       }
 
        @Override
        public MatrixBlock unaryOperations(UnaryOperator op, MatrixValue 
result) {
@@ -3778,6 +3725,34 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
                return append(new MatrixBlock[]{that}, ret, cbind);
        }
        
+       private final long calculateCombinedNNz(MatrixBlock[] that){
+               long nnz = nonZeros;
+               for(MatrixBlock b : that)
+                       nnz += b.nonZeros;
+               return nnz;
+       }
+
+       private final int combinedRows(MatrixBlock[] that){
+               int r =  rlen;
+               for(MatrixBlock b : that)
+                       r += b.rlen;
+               return r;
+       }
+
+       private final int combinedCols(MatrixBlock[] that){
+               int c =  clen;
+               for(MatrixBlock b : that)
+                       c += b.clen;
+               return c;
+       }
+
+       private final int computeNNzRow(MatrixBlock[] that, int row) {
+               int lnnz = (int) this.recomputeNonZeros(row, row, 0, this.clen 
- 1);
+               for(MatrixBlock b : that)
+                       lnnz += b.recomputeNonZeros(row, row, 0, b.clen - 1);
+               return lnnz;
+       }
+
        /**
         * Append that list of matrixes to this matrix.
         * 
@@ -3788,12 +3763,13 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
         * @param cbind if binding on columns or rows
         * @return the ret MatrixBlock object with the appended result
         */
-       public MatrixBlock append( MatrixBlock[] that, MatrixBlock result, 
boolean cbind) {
+       public MatrixBlock append(MatrixBlock[] that, MatrixBlock result, 
boolean cbind) {
                checkDimensionsForAppend(that, cbind);
 
-               final int m = cbind ? rlen : rlen + 
Arrays.stream(that).mapToInt(mb -> mb.rlen).sum();
-               final int n = cbind ? clen + Arrays.stream(that).mapToInt(mb -> 
mb.clen).sum() : clen;
-               final long nnz = nonZeros + Arrays.stream(that).mapToLong(mb -> 
mb.nonZeros).sum();
+               final int m = cbind ? rlen : combinedRows(that);
+               final int n = cbind ? combinedCols(that) : clen;
+               final long nnz = calculateCombinedNNz(that);
+
                boolean shallowCopy = (nonZeros == nnz);
                boolean sp = evalSparseFormatInMemory(m, n, nnz);
                
@@ -3846,22 +3822,21 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
                                }
                        }
                }
-               else if(nnz != 0) //SPARSE
-               {
+               //SPARSE
+               else if(nnz != 0) {
                        //adjust sparse rows if required
                        result.allocateSparseRowsBlock();
                        //allocate sparse rows once for cbind
                        if( cbind && nnz > rlen && !shallowCopy && 
result.getSparseBlock() instanceof SparseBlockMCSR ) {
-                               SparseBlock sblock = result.getSparseBlock();
-                               for( int i=0; i<result.rlen; i++ ) {
-                                       final int row = i; //workaround for 
lambda compile issue
-                                       int lnnz = (int) 
(this.recomputeNonZeros(i, i, 0, this.clen-1) + Arrays.stream(that)
-                                               .mapToLong(mb -> 
mb.recomputeNonZeros(row, row, 0, mb.clen-1)).sum());
-                                       sblock.allocate(i, lnnz);
-                               }
+                               final SparseBlock sblock = 
result.getSparseBlock();
+                               // for each row calculate how many non zeros 
are pressent.
+                               for( int i=0; i<result.rlen; i++ ) 
+                                       sblock.allocate(i, computeNNzRow(that, 
i));
+                               
                        }
                        
                        //core append operation
+                       // we can always append this directly to offset 0.0 in 
both cbind and rbind.
                        result.appendToSparse(this, 0, 0, !shallowCopy);
                        if( cbind ) {
                                for(int i=0, off=clen; i<that.length; i++) {
diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java 
b/src/test/java/org/apache/sysds/test/TestUtils.java
index 9e866e5b33..45fe79a4a3 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -1343,7 +1343,7 @@ public class TestUtils {
                        double avgDistance = sumPercentDistance / (rows * cols);
                        if(countErrors != 0)
                                fail(message + "\n" + countErrors + " values 
are not in equal of total: " + (rows * cols));
-                       if(avgDistance <= maxAveragePercentDistance)
+                       if(avgDistance < maxAveragePercentDistance)
                                fail(message + "\nThe avg distance: " + 
avgDistance + " was lower than threshold "
                                        + maxAveragePercentDistance);
                }
diff --git 
a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java 
b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
index d35b47a4c6..c5df98a24b 100644
--- 
a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
+++ 
b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysds.test.component.matrix;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 
 import java.util.ArrayList;
@@ -154,11 +155,12 @@ public class MatrixMultiplyTest {
 
                        String totalMessage = "\n\n" + sizeErrMessage + "\n" + 
sparseErrMessage;
 
-                       if(ret.getNumRows() * ret.getNumColumns() < 1000) {
+                       if(ret.getNumRows() * ret.getNumColumns() < 1000 || 
ret.getNonZeros() < 100) {
                                totalMessage += "\n\nExp" + exp;
                                totalMessage += "\n\nAct" + ret;
                        }
 
+                       assertEquals(totalMessage,exp.getNonZeros(), 
ret.getNonZeros());
                        TestUtils.compareMatricesPercentageDistance(exp, ret, 
0.999, 0.99999, totalMessage, false);
                }
                catch(Exception e) {
diff --git 
a/src/test/java/org/apache/sysds/test/component/matrix/MatrixToSparseOrDense.java
 
b/src/test/java/org/apache/sysds/test/component/matrix/MatrixToSparseOrDense.java
new file mode 100644
index 0000000000..e3cd74466e
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/component/matrix/MatrixToSparseOrDense.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.matrix;
+
+import static org.junit.Assert.fail;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class MatrixToSparseOrDense {
+       protected static final Log LOG = 
LogFactory.getLog(MatrixMultiplyTest.class.getName());
+
+       private final MatrixBlock a;
+
+       public MatrixToSparseOrDense(int rows, int cols, double sparsity) {
+
+               this.a = TestUtils.generateTestMatrixBlock(rows, cols, -10, 10, 
sparsity, 42151);
+       }
+
+       @Parameters
+       public static Collection<Object[]> data() {
+
+               List<Object[]> tests = new ArrayList<>();
+               try {
+                       double[] sparsities = new double[] {0.0001, 0.1, 0.5};
+                       int[] is = new int[] {1, 2, 10, 1302};
+                       int[] js = new int[] {1, 2, 5, 1203};
+
+                       for(int s = 0; s < sparsities.length; s++) {
+                               for(int i = 0; i < is.length; i++) {
+                                       for(int j = 0; j < js.length; j++) {
+                                               tests.add(new Object[] {is[i], 
js[j], sparsities[s]});
+
+                                       }
+                               }
+
+                       }
+               }
+               catch(Exception e) {
+                       e.printStackTrace();
+                       fail("failed constructing tests");
+               }
+
+               return tests;
+       }
+
+       @Test
+       public void testA() {
+               MatrixBlock c = new MatrixBlock();
+               c.copy(a);
+               TestUtils.compareMatricesPercentageDistance(a, c, 1.0, 1.0, "");
+       }
+
+       @Test
+       public void testB_MCSR() {
+               MatrixBlock c = new MatrixBlock();
+               c.copy(a);
+               c.denseToSparse(false);
+               TestUtils.compareMatricesPercentageDistance(a, c, 1.0, 1.0, "");
+       }
+
+       @Test
+       public void testB_CSR() {
+               MatrixBlock c = new MatrixBlock();
+               c.copy(a);
+               c.denseToSparse(true);
+               TestUtils.compareMatricesPercentageDistance(a, c, 1.0, 1.0, "");
+       }
+
+       @Test
+       public void testCDense() {
+               MatrixBlock c = new MatrixBlock();
+               c.copy(a);
+               c.sparseToDense();
+               TestUtils.compareMatricesPercentageDistance(a, c, 1.0, 1.0, "");
+       }
+}


Reply via email to