This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit de91f669b87589d390a4c2804658210bb54ce1d7
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Thu Jul 4 12:35:12 2024 +0200

    [MINOR] Remove outdated flags and dead code from matmult and bufferpool
---
 .../hops/cost/CostEstimatorStaticRuntime.java      |   7 +-
 .../runtime/controlprogram/caching/ByteBuffer.java |   4 -
 .../controlprogram/caching/CacheableData.java      |   9 +-
 .../controlprogram/caching/LazyWriteBuffer.java    |   4 -
 .../caching/UnifiedMemoryManager.java              |   4 -
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 611 ++++++++-------------
 6 files changed, 237 insertions(+), 402 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/hops/cost/CostEstimatorStaticRuntime.java 
b/src/main/java/org/apache/sysds/hops/cost/CostEstimatorStaticRuntime.java
index 3f291326c6..61c2048836 100644
--- a/src/main/java/org/apache/sysds/hops/cost/CostEstimatorStaticRuntime.java
+++ b/src/main/java/org/apache/sysds/hops/cost/CostEstimatorStaticRuntime.java
@@ -26,7 +26,6 @@ import org.apache.sysds.lops.RightIndex;
 import org.apache.sysds.common.Types.ExecType;
 import org.apache.sysds.lops.MMTSJ.MMTSJType;
 import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysds.runtime.controlprogram.caching.LazyWriteBuffer;
 import org.apache.sysds.runtime.instructions.CPInstructionParser;
 import org.apache.sysds.runtime.instructions.Instruction;
@@ -73,8 +72,7 @@ public class CostEstimatorStaticRuntime extends CostEstimator
                if( !vs[0]._inmem ){
                        ltime += getHDFSReadTime( vs[0].getRows(), 
vs[0].getCols(), vs[0].getSparsity() );
                        //eviction costs
-                       if( CacheableData.CACHING_WRITE_CACHE_ON_READ &&
-                               
LazyWriteBuffer.getWriteBufferLimit()<MatrixBlock.estimateSizeOnDisk(vs[0].getRows(),
 vs[0].getCols(), 
+                       if( 
LazyWriteBuffer.getWriteBufferLimit()<MatrixBlock.estimateSizeOnDisk(vs[0].getRows(),
 vs[0].getCols(), 
                                        (vs[0]._dc.getNonZeros()<0)? 
vs[0].getRows()*vs[0].getCols():vs[0]._dc.getNonZeros()) )
                        {
                                ltime += Math.abs( getFSWriteTime( 
vs[0].getRows(), vs[0].getCols(), vs[0].getSparsity() ));
@@ -84,8 +82,7 @@ public class CostEstimatorStaticRuntime extends CostEstimator
                if( !vs[1]._inmem ){
                        ltime += getHDFSReadTime( vs[1].getRows(), 
vs[1].getCols(), vs[1].getSparsity() );
                        //eviction costs
-                       if( CacheableData.CACHING_WRITE_CACHE_ON_READ &&
-                               
LazyWriteBuffer.getWriteBufferLimit()<MatrixBlock.estimateSizeOnDisk(vs[1].getRows(),
 vs[1].getCols(), (vs[1]._dc.getNonZeros()<0)? 
vs[1].getRows()*vs[1].getCols():vs[1]._dc.getNonZeros()) )
+                       if( 
LazyWriteBuffer.getWriteBufferLimit()<MatrixBlock.estimateSizeOnDisk(vs[1].getRows(),
 vs[1].getCols(), (vs[1]._dc.getNonZeros()<0)? 
vs[1].getRows()*vs[1].getCols():vs[1]._dc.getNonZeros()) )
                        {
                                ltime += Math.abs( getFSWriteTime( 
vs[1].getRows(), vs[1].getCols(), vs[1].getSparsity()) );
                        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
index 40af176c59..8c74de792f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
@@ -61,8 +61,6 @@ public class ByteBuffer
                        if( !_shallow ) //SPARSE/DENSE -> SPARSE
                        {
                                //deep serialize (for compression)
-                               if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                                       _bdata = PageCache.getPage((int)_size);
                                if( _bdata==null )
                                        _bdata = new byte[(int)_size];
                                DataOutput dout = new CacheDataOutput(_bdata);
@@ -133,8 +131,6 @@ public class ByteBuffer
        {
                //clear strong references to buffer/matrix
                if( !_shallow ) {
-                       if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                               PageCache.putPage(_bdata);
                        _bdata = null;
                }
                else {
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
index 7fc42a992b..b70e583765 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
@@ -84,8 +84,6 @@ public abstract class CacheableData<T extends CacheBlock<?>> 
extends Data
        public static final long CACHING_THRESHOLD = (long)Math.max(4*1024, 
//obj not s.t. caching
                1e-5 * InfrastructureAnalyzer.getLocalMaxMemory());       //if 
below threshold [in bytes]
        public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO;
-       public static final boolean CACHING_BUFFER_PAGECACHE = false;
-       public static final boolean CACHING_WRITE_CACHE_ON_READ = false;
        public static final String  CACHING_COUNTER_GROUP_NAME = "SystemDS 
Caching Counters";
        public static final String  CACHING_EVICTION_FILEEXTENSION = ".dat";
        public static final boolean CACHING_ASYNC_FILECLEANUP = true;
@@ -571,7 +569,7 @@ public abstract class CacheableData<T extends 
CacheBlock<?>> extends Data
                                        _data = 
readBlobFromFederated(_fedMapping);
 
                                        //mark for initial local write despite 
read operation
-                                       _requiresLocalWrite = 
CACHING_WRITE_CACHE_ON_READ;
+                                       _requiresLocalWrite = false;
                                }
                                else if( getRDDHandle()==null || 
getRDDHandle().allowsShortCircuitRead() ) {
                                        if( DMLScript.STATISTICS )
@@ -585,7 +583,7 @@ public abstract class CacheableData<T extends 
CacheBlock<?>> extends Data
                                        _data = readBlobFromHDFS( _hdfsFileName 
);
                                        
                                        //mark for initial local write despite 
read operation
-                                       _requiresLocalWrite = 
CACHING_WRITE_CACHE_ON_READ;
+                                       _requiresLocalWrite = false;
                                }
                                else {
                                        //read matrix from rdd (incl execute 
pending rdd operations)
@@ -593,8 +591,7 @@ public abstract class CacheableData<T extends 
CacheBlock<?>> extends Data
                                        _data = readBlobFromRDD( 
getRDDHandle(), writeStatus );
                                        
                                        //mark for initial local write (prevent 
repeated execution of rdd operations)
-                                       _requiresLocalWrite = 
writeStatus.booleanValue() ? 
-                                               CACHING_WRITE_CACHE_ON_READ : 
true;
+                                       _requiresLocalWrite = 
!writeStatus.booleanValue();
                                }
                                
                                setDirty(false);
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/LazyWriteBuffer.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/LazyWriteBuffer.java
index 48fb8c81e8..3c91ff463a 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/LazyWriteBuffer.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/LazyWriteBuffer.java
@@ -183,8 +183,6 @@ public class LazyWriteBuffer {
                _fClean = new CacheMaintenanceService();
                _limit = OptimizerUtils.getBufferPoolLimit();
                _size = 0;
-               if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                       PageCache.init();
        }
 
        public static void cleanup() {
@@ -192,8 +190,6 @@ public class LazyWriteBuffer {
                        _mQueue.clear();
                if( _fClean != null )
                        _fClean.close();
-               if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                       PageCache.clear();
        }
 
        public static long getWriteBufferLimit() {
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/UnifiedMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/UnifiedMemoryManager.java
index a82d4a5402..91f700e85f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/UnifiedMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/UnifiedMemoryManager.java
@@ -186,8 +186,6 @@ public class UnifiedMemoryManager
                _totCachedSize = 0;
                _pinnedPhysicalMemSize = 0;
                _pinnedVirtualMemSize = 0;
-               if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                       PageCache.init();
        }
 
        // Cleanup the unified memory manager
@@ -196,8 +194,6 @@ public class UnifiedMemoryManager
                        _mQueue.clear();
                if( _fClean != null )
                        _fClean.close();
-               if( CacheableData.CACHING_BUFFER_PAGECACHE )
-                       PageCache.clear();
                _totCachedSize = 0;
                _pinnedPhysicalMemSize = 0;
                _pinnedVirtualMemSize = 0;
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index b7a4f42bf6..490f350aaf 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -72,7 +72,6 @@ import org.apache.sysds.utils.NativeHelper;
 public class LibMatrixMult 
 {
        //internal configuration
-       private static final boolean LOW_LEVEL_OPTIMIZATION = true;
        private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 
2 MB
        private static final long PAR_MINFLOP_THRESHOLD1 = 2L*1024*1024; //MIN 
2 MFLOP
        private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 
MFLOP
@@ -1022,69 +1021,51 @@ public class LibMatrixMult
                final int n = m2.clen;
                final int cd = m1.clen;
                
-               if( LOW_LEVEL_OPTIMIZATION ) {
-                       if( m==1 && n==1 ) {            //DOT PRODUCT
-                               double[] avals = a.valuesAt(0);
-                               double[] bvals = b.valuesAt(0);
-                               if( ru > m ) //pm2r - parallelize over common 
dim
-                                       c.set(0, 0, dotProduct(avals, bvals, 
rl, rl, ru-rl));
-                               else
-                                       c.set(0, 0, dotProduct(avals, bvals, 
cd));
-                       }
-                       else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
-                               double[] avals = a.valuesAt(0);
-                               double[] bvals = b.valuesAt(0);
-                               for( int i=rl; i < ru; i++) {
-                                       double[] cvals = c.values(i);
-                                       int cix = c.pos(i);
-                                       if( avals[i] == 1 )
-                                               System.arraycopy(bvals, 0, 
cvals, cix, n);
-                                       else if( avals[i] != 0 )
-                                               vectMultiplyWrite(avals[i], 
bvals, cvals, 0, cix, n);
-                                       else
-                                               Arrays.fill(cvals, cix, cix+n, 
0);
-                               }
-                       }
-                       else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
-                               double[] avals = a.valuesAt(0);
-                               double[] cvals = c.valuesAt(0);
-                               vectMultiplyWrite(b.get(0,0), avals, cvals, rl, 
rl, ru-rl);
-                       }
-                       else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short 
rhs)
-                               matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, 
ru);
-                       }
-                       else if( n==1 ) {               //MATRIX-VECTOR (tall 
rhs)
-                               matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, 
ru);
-                       }
-                       else if( pm2 && m==1 ) {        //VECTOR-MATRIX
-                               matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
-                       }
-                       else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short 
lhs) 
-                               matrixMultDenseDenseMMShortLHS(a, b, c, m, n, 
cd, rl, ru);
-                       }
-                       else if( tm2 ) {                //MATRIX-MATRIX (skinny 
rhs)
-                               matrixMultDenseDenseMMSkinnyRHS(a, b, c, 
m2.rlen, cd, rl, ru);
-                       }
-                       else {                          //MATRIX-MATRIX
-                               matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, 
cl, cu);
-                       }
+               if( m==1 && n==1 ) {            //DOT PRODUCT
+                       double[] avals = a.valuesAt(0);
+                       double[] bvals = b.valuesAt(0);
+                       if( ru > m ) //pm2r - parallelize over common dim
+                               c.set(0, 0, dotProduct(avals, bvals, rl, rl, 
ru-rl));
+                       else
+                               c.set(0, 0, dotProduct(avals, bvals, cd));
                }
-               else {
-                       for( int i = rl; i < ru; i++) {
-                               double[] avals = a.values(i);
+               else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
+                       double[] avals = a.valuesAt(0);
+                       double[] bvals = b.valuesAt(0);
+                       for( int i=rl; i < ru; i++) {
                                double[] cvals = c.values(i);
-                               int aix = a.pos(i), cix = c.pos(i);
-                               for( int k = 0; k < cd; k++) {
-                                       double val = avals[aix + k];
-                                       if( val != 0 ) {
-                                               double[] bvals = b.values(k);
-                                               int bix = b.pos(k);
-                                               for( int j = 0; j < n; j++) 
-                                                       cvals[cix+j] += val * 
bvals[bix+j];
-                                       }
-                               }
+                               int cix = c.pos(i);
+                               if( avals[i] == 1 )
+                                       System.arraycopy(bvals, 0, cvals, cix, 
n);
+                               else if( avals[i] != 0 )
+                                       vectMultiplyWrite(avals[i], bvals, 
cvals, 0, cix, n);
+                               else
+                                       Arrays.fill(cvals, cix, cix+n, 0);
                        }
                }
+               else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
+                       double[] avals = a.valuesAt(0);
+                       double[] cvals = c.valuesAt(0);
+                       vectMultiplyWrite(b.get(0,0), avals, cvals, rl, rl, 
ru-rl);
+               }
+               else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+                       matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
+               }
+               else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+                       matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
+               }
+               else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+                       matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
+               }
+               else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+                       matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, 
ru);
+               }
+               else if( tm2 ) {                //MATRIX-MATRIX (skinny rhs)
+                       matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, 
rl, ru);
+               }
+               else {                          //MATRIX-MATRIX
+                       matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
+               }
        }
        
        private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, 
DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
@@ -1277,66 +1258,46 @@ public class LibMatrixMult
                int cd = m1.clen;
                
                // MATRIX-MATRIX (VV, MV not applicable here because V always 
dense)
-               if( LOW_LEVEL_OPTIMIZATION )
-               {
-                       SparseBlock b = m2.sparseBlock;
+               SparseBlock b = m2.sparseBlock;
                        
-                       if( pm2 && m==1 ) {        //VECTOR-MATRIX
-                               //parallelization over rows in rhs matrix
-                               double[] avals = a.valuesAt(0); //vector
-                               double[] cvals = c.valuesAt(0); //vector
-                               for( int k=rl; k<ru; k++ )
-                                       if( avals[k] != 0 && !b.isEmpty(k) ) {
-                                               vectMultiplyAdd(avals[k], 
b.values(k), cvals,
-                                                       b.indexes(k), b.pos(k), 
0, b.size(k));
-                                       }
-                       }
-                       else {                     //MATRIX-MATRIX
-                               //best effort blocking, without blocking over J 
because it is 
-                               //counter-productive, even with front of 
current indexes
-                               final int blocksizeK = 32;
-                               final int blocksizeI = 32;
-                               
-                               int rl1 = pm2 ? 0 : rl;
-                               int ru1 = pm2 ? m : ru;
-                               int rl2 = pm2 ? rl : 0;
-                               int ru2 = pm2 ? ru : cd;
-                               
-                               //blocked execution
-                               for( int bi = rl1; bi < ru1; bi+=blocksizeI )
-                                       for( int bk = rl2, bimin = 
Math.min(ru1, bi+blocksizeI); bk < ru2; bk+=blocksizeK ) {
-                                               int bkmin = Math.min(ru2, 
bk+blocksizeK);
-                                               //core sub block matrix 
multiplication
-                                               for(int i = bi; i < bimin; i++) 
{
-                                                       double[] avals = 
a.values(i), cvals = c.values(i);
-                                                       int aix = a.pos(i), cix 
= c.pos(i);
-                                                       for( int k = bk; k < 
bkmin; k++ ) {
-                                                               double aval = 
avals[aix+k];
-                                                               if( aval == 0 
|| b.isEmpty(k) )
-                                                                       
continue;
-                                                               
vectMultiplyAdd(aval, b.values(k), cvals, 
-                                                                       
b.indexes(k), b.pos(k), cix, b.size(k));
-                                                       }
+               if( pm2 && m==1 ) {        //VECTOR-MATRIX
+                       //parallelization over rows in rhs matrix
+                       double[] avals = a.valuesAt(0); //vector
+                       double[] cvals = c.valuesAt(0); //vector
+                       for( int k=rl; k<ru; k++ )
+                               if( avals[k] != 0 && !b.isEmpty(k) ) {
+                                       vectMultiplyAdd(avals[k], b.values(k), 
cvals,
+                                               b.indexes(k), b.pos(k), 0, 
b.size(k));
+                               }
+               }
+               else {                     //MATRIX-MATRIX
+                       //best effort blocking, without blocking over J because 
it is 
+                       //counter-productive, even with front of current indexes
+                       final int blocksizeK = 32;
+                       final int blocksizeI = 32;
+                       
+                       int rl1 = pm2 ? 0 : rl;
+                       int ru1 = pm2 ? m : ru;
+                       int rl2 = pm2 ? rl : 0;
+                       int ru2 = pm2 ? ru : cd;
+                       
+                       //blocked execution
+                       for( int bi = rl1; bi < ru1; bi+=blocksizeI )
+                               for( int bk = rl2, bimin = Math.min(ru1, 
bi+blocksizeI); bk < ru2; bk+=blocksizeK ) {
+                                       int bkmin = Math.min(ru2, 
bk+blocksizeK);
+                                       //core sub block matrix multiplication
+                                       for(int i = bi; i < bimin; i++) {
+                                               double[] avals = a.values(i), 
cvals = c.values(i);
+                                               int aix = a.pos(i), cix = 
c.pos(i);
+                                               for( int k = bk; k < bkmin; k++ 
) {
+                                                       double aval = 
avals[aix+k];
+                                                       if( aval == 0 || 
b.isEmpty(k) )
+                                                               continue;
+                                                       vectMultiplyAdd(aval, 
b.values(k), cvals, 
+                                                               b.indexes(k), 
b.pos(k), cix, b.size(k));
                                                }
                                        }
-                       }
-               }
-               else {
-                       SparseBlock b = m2.sparseBlock;
-                       for( int i=rl; i < ru; i++ ) {
-                               double[] avals = a.values(i), cvals = 
c.values(i);
-                               int aix = a.pos(i), cix = c.pos(i);
-                               for(int k = 0; k < cd; k++ ) {
-                                       double val = avals [aix];
-                                       if( val == 0 || b.isEmpty(k) ) continue;
-                                       int bpos = b.pos(k);
-                                       int blen = b.size(k);
-                                       int[] bix = b.indexes(k);
-                                       double[] bvals = b.values(k);
-                                       for(int j = bpos; j < bpos+blen; j++)
-                                               cvals[cix+bix[j]] += val * 
bvals[j];
                                }
-                       }
                }
        }
 
@@ -1349,48 +1310,27 @@ public class LibMatrixMult
                final int cd = m2.rlen;
                final long xsp = (long)m*cd/m1.nonZeros;
 
-               if( LOW_LEVEL_OPTIMIZATION ) {
-                       if( m==1 && n==1 ) {            //DOT PRODUCT
-                               if( !a.isEmpty(0) )
-                                       c.set(0, 0, dotProduct(a.values(0), 
b.values(0), a.indexes(0), a.pos(0), 0, a.size(0)));
-                       }
-                       else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short 
rhs)
-                               matrixMultSparseDenseMVShortRHS(a, b, c, cd, 
rl, ru);
-                       }
-                       else if( n==1 ) {               //MATRIX-VECTOR (tall 
rhs)
-                               matrixMultSparseDenseMVTallRHS(a, b, c, cd, 
xsp, rl, ru);
-                       }
-                       else if( pm2 && m==1 ) {        //VECTOR-MATRIX
-                               matrixMultSparseDenseVM(a, b, c, n, rl, ru);
-                       }
-                       else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short 
lhs) 
-                               matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, 
rl, ru);
-                       }
-                       else if( n<=64 ) {              //MATRIX-MATRIX (skinny 
rhs)
-                               matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, 
rl, ru);
-                       }
-                       else {                          //MATRIX-MATRIX
-                               matrixMultSparseDenseMM(a, b, c, n, cd, xsp, 
rl, ru);
-                       }
+               if( m==1 && n==1 ) {            //DOT PRODUCT
+                       if( !a.isEmpty(0) )
+                               c.set(0, 0, dotProduct(a.values(0), 
b.values(0), a.indexes(0), a.pos(0), 0, a.size(0)));
                }
-               else {
-                       for( int i=rl; i<ru; i++ ) {
-                               if( a.isEmpty(i) ) continue; 
-                               int apos = a.pos(i);
-                               int alen = a.size(i);
-                               int[] aix = a.indexes(i);
-                               double[] avals = a.values(i);
-                               double[] cvals = c.values(i);
-                               int cix = c.pos(i);
-                               for(int k = apos; k < apos+alen; k++) {
-                                       double val = avals[k];
-                                       int colIndex = aix[k];
-                                       double[] bvals = b.values(colIndex);
-                                       int bix = b.pos(colIndex);
-                                       for(int j = 0; j < n; j++)
-                                               cvals[cix+j] += val * 
bvals[bix+j];
-                               }
-                       }
+               else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+                       matrixMultSparseDenseMVShortRHS(a, b, c, cd, rl, ru);
+               }
+               else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+                       matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, 
ru);
+               }
+               else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+                       matrixMultSparseDenseVM(a, b, c, n, rl, ru);
+               }
+               else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+                       matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
+               }
+               else if( n<=64 ) {              //MATRIX-MATRIX (skinny rhs)
+                       matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
+               }
+               else {                          //MATRIX-MATRIX
+                       matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
                }
        }
        
@@ -1585,19 +1525,14 @@ public class LibMatrixMult
                int n = m2.clen;
                
                // MATRIX-MATRIX (VV, MV not applicable here because V always 
dense)
-               if(LOW_LEVEL_OPTIMIZATION) {
-                       if( pm2 && m==1 )               //VECTOR-MATRIX
-                               matrixMultSparseSparseVM(a, b, 
ret.getDenseBlock(), rl, ru);
-                       else if( sparse )               //SPARSE OUPUT
-                               
ret.setNonZeros(matrixMultSparseSparseSparseMM(a, b, ret.getSparseBlock(), n, 
rl, ru));
-                       else if( m2.nonZeros < 2048 )   //MATRIX-SMALL MATRIX
-                               matrixMultSparseSparseMMSmallRHS(a, b, 
ret.getDenseBlock(), rl, ru);
-                       else                            //MATRIX-MATRIX
-                               matrixMultSparseSparseMM(a, b, 
ret.getDenseBlock(), m, cd, m1.nonZeros, rl, ru);
-               }
-               else {
-                       matrixMultSparseSparseMMGeneric(a, b, 
ret.getDenseBlock(), rl, ru);
-               }
+               if( pm2 && m==1 )               //VECTOR-MATRIX
+                       matrixMultSparseSparseVM(a, b, ret.getDenseBlock(), rl, 
ru);
+               else if( sparse )               //SPARSE OUPUT
+                       ret.setNonZeros(matrixMultSparseSparseSparseMM(a, b, 
ret.getSparseBlock(), n, rl, ru));
+               else if( m2.nonZeros < 2048 )   //MATRIX-SMALL MATRIX
+                       matrixMultSparseSparseMMSmallRHS(a, b, 
ret.getDenseBlock(), rl, ru);
+               else                            //MATRIX-MATRIX
+                       matrixMultSparseSparseMM(a, b, ret.getDenseBlock(), m, 
cd, m1.nonZeros, rl, ru);
        }
        
        private static void matrixMultSparseSparseVM(SparseBlock a, SparseBlock 
b, DenseBlock c, int rl, int ru) {
@@ -1708,6 +1643,7 @@ public class LibMatrixMult
                }
        }
        
+       @SuppressWarnings("unused")
        private static void matrixMultSparseSparseMMGeneric(SparseBlock a, 
SparseBlock b, DenseBlock c, int rl, int ru) {
                for( int i=rl; i<Math.min(ru, a.numRows()); i++ ) {
                        if( a.isEmpty(i) ) continue;
@@ -2261,146 +2197,110 @@ public class LibMatrixMult
                
                if( leftTranspose ) // t(X)%*%X
                {
-                       if( LOW_LEVEL_OPTIMIZATION )
+                       if( n==1 ) //VECTOR (col)
                        {
-                               if( n==1 ) //VECTOR (col)
-                               {
-                                       double[] avals = a.valuesAt(0);
-                                       c.set(0, 0, dotProduct(avals, avals, 
m));
-                               }
-                               else //MATRIX
-                               {
-                                       //1) Unrolled inner loop (for better 
instruction-level parallelism)
-                                       //2) Blocked execution (for less cache 
trashing in parallel exec)
-                                       //3) Asymmetric block sizes (for less 
misses in inner loop, yet blocks in L1/L2)
-                                       
-                                       final int blocksizeI = 32; //64//256KB 
c block (typical L2 size per core), 32KB a block
-                                       final int blocksizeK = 24; //64//256KB 
b block (typical L2 size per core), used while read 512B of a / read/write 4KB 
of c 
-                                       final int blocksizeJ = 1024; //512//4KB 
(typical main-memory page size), for scan
+                               double[] avals = a.valuesAt(0);
+                               c.set(0, 0, dotProduct(avals, avals, m));
+                       }
+                       else //MATRIX
+                       {
+                               //1) Unrolled inner loop (for better 
instruction-level parallelism)
+                               //2) Blocked execution (for less cache trashing 
in parallel exec)
+                               //3) Asymmetric block sizes (for less misses in 
inner loop, yet blocks in L1/L2)
+                               
+                               final int blocksizeI = 32; //64//256KB c block 
(typical L2 size per core), 32KB a block
+                               final int blocksizeK = 24; //64//256KB b block 
(typical L2 size per core), used while read 512B of a / read/write 4KB of c 
+                               final int blocksizeJ = 1024; //512//4KB 
(typical main-memory page size), for scan
 
-                                       //temporary arrays (nnz a, b index)
-                                       double[] ta = new double[ blocksizeK ];
-                                       int[]  tbi  = new int[ blocksizeK ];
-                                       
-                                       final int mx = ru;
-                                       final int cdx = m;
-                                       final int nx = n;
-                                       
-                                       //blocked execution
-                                       for( int bi = rl; bi < mx; 
bi+=blocksizeI ) //from bi due to symmetry
-                                               for( int bk = 0, bimin = 
Math.min(mx, bi+blocksizeI); bk < cdx; bk+=blocksizeK )
-                                                       for( int bj = bi, bkmin 
= Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ )
+                               //temporary arrays (nnz a, b index)
+                               double[] ta = new double[ blocksizeK ];
+                               int[]  tbi  = new int[ blocksizeK ];
+                               
+                               final int mx = ru;
+                               final int cdx = m;
+                               final int nx = n;
+                               
+                               //blocked execution
+                               for( int bi = rl; bi < mx; bi+=blocksizeI ) 
//from bi due to symmetry
+                                       for( int bk = 0, bimin = Math.min(mx, 
bi+blocksizeI); bk < cdx; bk+=blocksizeK )
+                                               for( int bj = bi, bkmin = 
Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ )
+                                               {
+                                                       int bklen = bkmin-bk;
+                                                       int bjlen = 
Math.min(nx, bj+blocksizeJ)-bj;
+                                                       
+                                                       //core sub block matrix 
multiplication
+                                                       for( int i = bi; i < 
bimin; i++) 
                                                        {
-                                                               int bklen = 
bkmin-bk;
-                                                               int bjlen = 
Math.min(nx, bj+blocksizeJ)-bj;
+                                                               double[] cvals 
= c.values(i);
+                                                               int cixj = 
c.pos(i, bj);
                                                                
-                                                               //core sub 
block matrix multiplication
-                                                               for( int i = 
bi; i < bimin; i++) 
-                                                               {
-                                                                       
double[] cvals = c.values(i);
-                                                                       int 
cixj = c.pos(i, bj);
+                                                               if( 
a.isContiguous(bk, bkmin-1) ) {
+                                                                       
double[] avals = a.values(bk);
+                                                                       int 
aixi = a.pos(bk, i);
+                                                                       int 
bkpos = a.pos(bk, bj);
                                                                        
-                                                                       if( 
a.isContiguous(bk, bkmin-1) ) {
-                                                                               
double[] avals = a.values(bk);
-                                                                               
int aixi = a.pos(bk, i);
-                                                                               
int bkpos = a.pos(bk, bj);
-                                                                               
-                                                                               
//determine nnz of a (for sparsity-aware skipping of rows)
-                                                                               
int knnz = copyNonZeroElements(avals, aixi, bkpos, n, nx, ta, tbi, bklen);
-                                                                               
-                                                                               
//rest not aligned to blocks of 4 rows
-                                                                               
final int bn = knnz % 4;
-                                                                               
switch( bn ){
-                                                                               
        case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); 
break;
-                                                                               
        case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], 
cixj, bjlen); break;
-                                                                               
        case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], 
tbi[1],tbi[2], cixj, bjlen); break;
-                                                                               
}
-                                                                               
-                                                                               
//compute blocks of 4 rows (core inner loop)
-                                                                               
for( int k = bn; k<knnz; k+=4 ){
-                                                                               
        vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals,
-                                                                               
                tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-                                                                               
}
+                                                                       
//determine nnz of a (for sparsity-aware skipping of rows)
+                                                                       int 
knnz = copyNonZeroElements(avals, aixi, bkpos, n, nx, ta, tbi, bklen);
+                                                                       
+                                                                       //rest 
not aligned to blocks of 4 rows
+                                                                       final 
int bn = knnz % 4;
+                                                                       switch( 
bn ){
+                                                                               
case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); break;
+                                                                               
case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], cixj, 
bjlen); break;
+                                                                               
case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], 
tbi[1],tbi[2], cixj, bjlen); break;
                                                                        }
-                                                                       else {
-                                                                               
for( int k = bk; k<bkmin; k++ ) {
-                                                                               
        double[] avals = a.values(bk);
-                                                                               
        int aix = a.pos(bk, i);
-                                                                               
        if( avals[aix] != 0 )
-                                                                               
                vectMultiplyAdd( avals[aix], a.values(k),
-                                                                               
                        cvals, a.pos(k, bj), cixj, bjlen );
-                                                                               
}
+                                                                       
+                                                                       
//compute blocks of 4 rows (core inner loop)
+                                                                       for( 
int k = bn; k<knnz; k+=4 ){
+                                                                               
vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals,
+                                                                               
        tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+                                                                       }
+                                                               }
+                                                               else {
+                                                                       for( 
int k = bk; k<bkmin; k++ ) {
+                                                                               
double[] avals = a.values(bk);
+                                                                               
int aix = a.pos(bk, i);
+                                                                               
if( avals[aix] != 0 )
+                                                                               
        vectMultiplyAdd( avals[aix], a.values(k),
+                                                                               
                cvals, a.pos(k, bj), cixj, bjlen );
                                                                        }
                                                                }
                                                        }
-                               }
-                       }
-                       else {
-                               for( int k = 0; k < m; k++ ) {
-                                       double[] avals = a.values(k);
-                                       int aix = a.pos(k);
-                                       for( int i = rl; i < ru; i++ ) {
-                                               double[] cvals = c.values(i);
-                                               int cix = c.pos(i);
-                                               double val = avals[ aix+i ];
-                                               if( val != 0 ) {
-                                                       for(int j = i; j < n; 
j++) //from i due to symmetry
-                                                               cvals[ cix+j ] 
+= val * avals[ aix+j ];
                                                }
-                                       }
-                               }
                        }
                }
                else // X%*%t(X)
                {
-                       if(LOW_LEVEL_OPTIMIZATION)
+                       if( m==1 ) //VECTOR
                        {
-                               if( m==1 ) //VECTOR
-                               {
-                                       double[] avals = a.valuesAt(0);
-                                       c.set(0, 0, dotProduct(avals, avals, 
n));
-                               }
-                               else //MATRIX
-                               {
-                                       //algorithm: scan c, foreach ci,j: scan 
row of a and t(a) (IJK)
-                               
-                                       //1) Unrolled inner loop, for better ILP
-                                       //2) Blocked execution, for less cache 
trashing in parallel exec 
-                                       //   (we block such that lhs, rhs, and 
output roughly fit into L2, output in L1)
-                                       //3) Asymmetric block sizes and 
exploitation of result symmetry
-                                       int blocksizeK = 1024; //two memory 
pages for sufficiently long scans
-                                       int blocksizeIJ = L2_CACHESIZE / 8 / 
blocksizeK / 2 - 1; //15
-                               
-                                       //blocked execution over IKJ (lhs/rhs 
in L2, output in L1)
-                                       for( int bi = rl; bi<ru; 
bi+=blocksizeIJ ) 
-                                               for( int bk = 0, bimin = 
Math.min(ru, bi+blocksizeIJ); bk<n; bk+=blocksizeK )
-                                                       for( int bj = bi, bklen 
= Math.min(blocksizeK, n-bk); bj<m; bj+=blocksizeIJ ) {
-                                                               //core tsmm 
block operation (15x15 vectors of length 1K elements)
-                                                               int bjmin = 
Math.min(m, bj+blocksizeIJ);
-                                                               for( int i=bi; 
i<bimin; i++ ) {
-                                                                       final 
int bjmax = Math.max(i,bj); //from i due to symmetry
-                                                                       
double[] avals = a.values(i), cvals = c.values(i);
-                                                                       int aix 
= a.pos(i, bk), cix = c.pos(i);
-                                                                       for(int 
j=bjmax; j <bjmin; j++) 
-                                                                               
cvals[ cix+j ] += dotProduct(avals, a.values(j), aix, a.pos(j, bk), bklen);
-                                                               }
-                                                       }
-                               }
+                               double[] avals = a.valuesAt(0);
+                               c.set(0, 0, dotProduct(avals, avals, n));
                        }
-                       else
+                       else //MATRIX
                        {
-                               for( int i = rl; i < ru; i++ ) {
-                                       double[] avals1 = a.values(i), cvals = 
c.values(i);
-                                       int aix1 = a.pos(i), cix = c.pos(i);
-                                       for(int j = i; j < m; j++) { //from i 
due to symmetry
-                                               double[] avals2 = a.values(j);
-                                               int aix2 = a.pos(j);
-                                               double val = 0;
-                                               for(int k = 0; k < n; k++)
-                                                       val += avals1[aix1+k] * 
avals2[aix2+k];
-                                               cvals[cix+j] = val;
-                                       }
-                               }
+                               //algorithm: scan c, foreach ci,j: scan row of 
a and t(a) (IJK)
+                       
+                               //1) Unrolled inner loop, for better ILP
+                               //2) Blocked execution, for less cache trashing 
in parallel exec 
+                               //   (we block such that lhs, rhs, and output 
roughly fit into L2, output in L1)
+                               //3) Asymmetric block sizes and exploitation of 
result symmetry
+                               int blocksizeK = 1024; //two memory pages for 
sufficiently long scans
+                               int blocksizeIJ = L2_CACHESIZE / 8 / blocksizeK 
/ 2 - 1; //15
+                       
+                               //blocked execution over IKJ (lhs/rhs in L2, 
output in L1)
+                               for( int bi = rl; bi<ru; bi+=blocksizeIJ ) 
+                                       for( int bk = 0, bimin = Math.min(ru, 
bi+blocksizeIJ); bk<n; bk+=blocksizeK )
+                                               for( int bj = bi, bklen = 
Math.min(blocksizeK, n-bk); bj<m; bj+=blocksizeIJ ) {
+                                                       //core tsmm block 
operation (15x15 vectors of length 1K elements)
+                                                       int bjmin = Math.min(m, 
bj+blocksizeIJ);
+                                                       for( int i=bi; i<bimin; 
i++ ) {
+                                                               final int bjmax 
= Math.max(i,bj); //from i due to symmetry
+                                                               double[] avals 
= a.values(i), cvals = c.values(i);
+                                                               int aix = 
a.pos(i, bk), cix = c.pos(i);
+                                                               for(int 
j=bjmax; j <bjmin; j++) 
+                                                                       cvals[ 
cix+j ] += dotProduct(avals, a.values(j), aix, a.pos(j, bk), bklen);
+                                                       }
+                                               }
                        }
                }
        }
@@ -2429,52 +2329,29 @@ public class LibMatrixMult
                {
                        //only general case (because vectors always dense)
                        //algorithm: scan rows, foreach row self join (KIJ)
-                       if( LOW_LEVEL_OPTIMIZATION ) {
-                               final int n = m1.clen;
-                               final int arlen = a.numRows();
-                               for( int r=0; r<arlen; r++ ) {
-                                       if( a.isEmpty(r) ) continue;
-                                       final int alen = a.size(r);
-                                       final double[] avals = a.values(r);
-                                       final int apos = a.pos(r);
-                                       if( alen == n ) { //dense row
-                                               for (int i = rl; i < ru; i++){
-                                                       double[] cvals = 
c.values(i);
-                                                       int cix = c.pos(i);
-                                                       double val = avals[i + 
apos];
-                                                       for(int j = i; j < 
m1.clen; j++)
-                                                               cvals[cix + j] 
+=val * avals[j + apos];
-                                               }
-                                       }
-                                       else { //non-full sparse row
-                                               int[] aix = a.indexes(r);
-                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
-                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                               int len = apos + alen;
-                                               for(int i = rlix; i < len && 
aix[i] < ru; i++)
-                                                       
vectMultiplyAdd(avals[i], avals, c.values(aix[i]), aix, i, c.pos(aix[i]), len - 
i);
+                       final int n = m1.clen;
+                       final int arlen = a.numRows();
+                       for( int r=0; r<arlen; r++ ) {
+                               if( a.isEmpty(r) ) continue;
+                               final int alen = a.size(r);
+                               final double[] avals = a.values(r);
+                               final int apos = a.pos(r);
+                               if( alen == n ) { //dense row
+                                       for (int i = rl; i < ru; i++){
+                                               double[] cvals = c.values(i);
+                                               int cix = c.pos(i);
+                                               double val = avals[i + apos];
+                                               for(int j = i; j < m1.clen; j++)
+                                                       cvals[cix + j] +=val * 
avals[j + apos];
                                        }
                                }
-                       }
-                       else
-                       {
-                               for( int r=0; r<a.numRows(); r++ ) {
-                                       if( a.isEmpty(r) ) continue;
-                                       int apos = a.pos(r);
-                                       int alen = a.size(r);
+                               else { //non-full sparse row
                                        int[] aix = a.indexes(r);
-                                       double[] avals = a.values(r);
                                        int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
                                        rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                       for(int i = rlix; i < apos+alen && 
aix[i]<ru; i++) {
-                                               double val = avals[i];
-                                               if( val != 0 ) {
-                                                       double[] cvals = 
c.values(aix[i]);
-                                                       int cix = c.pos(aix[i]);
-                                                       for(int j = i; j < 
apos+alen; j++)
-                                                               
cvals[cix+aix[j]] += val * avals[j];
-                                               }
-                                       }
+                                       int len = apos + alen;
+                                       for(int i = rlix; i < len && aix[i] < 
ru; i++)
+                                               vectMultiplyAdd(avals[i], 
avals, c.values(aix[i]), aix, i, c.pos(aix[i]), len - i);
                                }
                        }
                }
@@ -2496,44 +2373,20 @@ public class LibMatrixMult
                                m = m1.clen;
                                
                                //algorithm: scan rows, foreach row self join 
(KIJ)
-                               if( LOW_LEVEL_OPTIMIZATION )
-                               {
-                                       int arlen = a.numRows();
-                                       for( int r=0; r<arlen; r++ ) {
-                                               if( a.isEmpty(r) ) continue;
-                                               int apos = a.pos(r);
-                                               int alen = a.size(r);
-                                               int[] aix = a.indexes(r);
-                                               double[] avals = a.values(r);
-                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
-                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                               for(int i = rlix; i < apos+alen 
&& aix[i]<ru; i++) {
-                                                       double val = avals[i];
-                                                       if( val != 0 )
-                                                               
vectMultiplyAdd(val, avals, c.values(aix[i]),
-                                                                       aix, i, 
c.pos(aix[i]), alen-i);
-                                               }
-                                       }
-                               }
-                               else
-                               {
-                                       for( int r=0; r<a.numRows(); r++ ) {
-                                               if( a.isEmpty(r) ) continue;
-                                               int apos = a.pos(r);
-                                               int alen = a.size(r);
-                                               int[] aix = a.indexes(r);
-                                               double[] avals = a.values(r);
-                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r,rl);
-                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                               for(int i = rlix; i < apos+alen 
&& aix[i]<ru; i++) {
-                                                       double val = avals[i];
-                                                       if( val != 0 ) {
-                                                               double[] cvals 
= c.values(aix[i]);
-                                                               int cix = 
c.pos(aix[i]);
-                                                               for( int j = i; 
j < alen; j++ )
-                                                                       
cvals[cix+aix[j]] += val * avals[j];
-                                                       }
-                                               }
+                               int arlen = a.numRows();
+                               for( int r=0; r<arlen; r++ ) {
+                                       if( a.isEmpty(r) ) continue;
+                                       int apos = a.pos(r);
+                                       int alen = a.size(r);
+                                       int[] aix = a.indexes(r);
+                                       double[] avals = a.values(r);
+                                       int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
+                                       rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
+                                       for(int i = rlix; i < apos+alen && 
aix[i]<ru; i++) {
+                                               double val = avals[i];
+                                               if( val != 0 )
+                                                       vectMultiplyAdd(val, 
avals, c.values(aix[i]),
+                                                               aix, i, 
c.pos(aix[i]), alen-i);
                                        }
                                }
                        }
@@ -4465,7 +4318,7 @@ public class LibMatrixMult
 
        private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, 
MatrixBlock m2 ) {
                //transpose if dense-dense, skinny rhs matrix (not vector), and 
memory guarded by output 
-               return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
+               return (!m1.sparse && !m2.sparse 
                        && isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, 
m2.clen, true));
        }
        
@@ -4478,15 +4331,15 @@ public class LibMatrixMult
        private static boolean checkParMatrixMultRightInputRows( MatrixBlock 
m1, MatrixBlock m2, int k ) {
                //parallelize over rows in rhs matrix if number of rows in 
lhs/output is very small
                double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-               return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && 
!(m1.isUltraSparse()||m2.isUltraSparse()))
-                       || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.rlen > 
m1.rlen 
+               return (m1.rlen==1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
+                       || (m1.rlen<=16 && m2.rlen > m1.rlen 
                           && ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) 
) //dense-dense / sparse-dense / dense-sparse
                           && (long)k * 8 * m1.rlen * m2.clen < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
        }
 
        private static boolean checkParMatrixMultRightInputCols( MatrixBlock 
m1, MatrixBlock m2, int k, boolean pm2r ) {
                //parallelize over cols in rhs matrix if dense, number of cols 
in rhs is large, and lhs fits in l2
-               return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
+               return (!m1.sparse && !m2.sparse 
                                && m2.clen > k * 1024 && m1.rlen < k * 32 && 
!pm2r
                                && 8*m1.rlen*m1.clen < 256*1024 ); //lhs fits 
in L2 cache
        }
@@ -4498,7 +4351,7 @@ public class LibMatrixMult
        public static boolean satisfiesMultiThreadingConstraints(MatrixBlock 
m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
                boolean sharedTP = 
(InfrastructureAnalyzer.getLocalParallelism() == k);
                double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-               return k > 1 && LOW_LEVEL_OPTIMIZATION
+               return k > 1
                        && (!checkMem || 8L * m1.clen * k < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
                        && (!checkFLOPs || FPfactor * m1.rlen * m1.clen >
                        (sharedTP ? PAR_MINFLOP_THRESHOLD2 : 
PAR_MINFLOP_THRESHOLD1));
@@ -4507,7 +4360,7 @@ public class LibMatrixMult
        public static boolean satisfiesMultiThreadingConstraints(MatrixBlock 
m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) 
{
                boolean sharedTP = 
(InfrastructureAnalyzer.getLocalParallelism() == k);
                double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-               return k > 1 && LOW_LEVEL_OPTIMIZATION
+               return k > 1
                        && (!checkMem || 8L * m2.clen * k < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
                        //note: cast to double to avoid long overflows on 
ultra-sparse matrices
                        //due to FLOP computation based on number of cells not 
non-zeros
@@ -4518,7 +4371,7 @@ public class LibMatrixMult
        private static boolean 
satisfiesMultiThreadingConstraintsTSMM(MatrixBlock m1, boolean leftTranspose, 
double FPfactor, int k) {
                boolean sharedTP = 
(InfrastructureAnalyzer.getLocalParallelism() == k);
                double threshold = sharedTP ? PAR_MINFLOP_THRESHOLD2 : 
PAR_MINFLOP_THRESHOLD1;
-               return k > 1 && LOW_LEVEL_OPTIMIZATION && 
(leftTranspose?m1.clen:m1.rlen)!=1
+               return k > 1 && (leftTranspose?m1.clen:m1.rlen)!=1
                        && ((leftTranspose && FPfactor * m1.rlen * m1.clen * 
m1.clen > threshold)
                        ||(!leftTranspose && FPfactor * m1.clen * m1.rlen * 
m1.rlen > threshold));
        }


Reply via email to