This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 13dd8cb  [MINOR] Improved selection of multi-threaded matrix 
multiplications
13dd8cb is described below

commit 13dd8cb68260cec6692b2006271f651150519e42
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Mar 28 03:00:40 2021 +0200

    [MINOR] Improved selection of multi-threaded matrix multiplications
    
    This patch makes the validity checks for multi-threaded matrix
    multiplications (which enforce a maximum memory overhead, scaled by
    the number of threads) more flexible: the limit is now the maximum of
    the fixed threshold and 1% of the local max memory. On scale-up nodes
    (large memory, many cores), multi-threading thus also applies to
    matrices with millions of features.
    
    When training lm models on KDD98, this change improved end-to-end
    performance by 37x on a server with 56 physical / 112 virtual cores. On
    KDD98, lmCG takes ~27,000 batch iterations to converge (1e-7 target
    norm) and now completes in 310s.
---
 .../java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 9db9754..a402134 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3893,16 +3893,18 @@ public class LibMatrixMult
        
        public static boolean satisfiesMultiThreadingConstraints(MatrixBlock 
m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
                boolean sharedTP = 
(InfrastructureAnalyzer.getLocalParallelism() == k);
+               double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
                return k > 1 && LOW_LEVEL_OPTIMIZATION
-                       && (!checkMem || 8L * m1.clen * k < 
MEM_OVERHEAD_THRESHOLD)
+                       && (!checkMem || 8L * m1.clen * k < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
                        && (!checkFLOPs || FPfactor * m1.rlen * m1.clen >
                        (sharedTP ? PAR_MINFLOP_THRESHOLD2 : 
PAR_MINFLOP_THRESHOLD1));
        }
        
        public static boolean satisfiesMultiThreadingConstraints(MatrixBlock 
m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) 
{
                boolean sharedTP = 
(InfrastructureAnalyzer.getLocalParallelism() == k);
+               double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
                return k > 1 && LOW_LEVEL_OPTIMIZATION
-                       && (!checkMem || 8L * m2.clen * k < 
MEM_OVERHEAD_THRESHOLD)
+                       && (!checkMem || 8L * m2.clen * k < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
                        //note: cast to double to avoid long overflows on 
ultra-sparse matrices
                        //due to FLOP computation based on number of cells not 
non-zeros
                        && (!checkFLOPs || (double)FPfactor * m1.rlen * m1.clen 
* m2.clen >

Reply via email to