This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 265329c [SYSTEMDS-2266] Fix native BLAS dgemm/dsyrk for large dense
blocks >16GB
265329c is described below
commit 265329cf240b728ace5cc5f93426d24de397c20f
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Jun 6 17:45:47 2021 +0200
[SYSTEMDS-2266] Fix native BLAS dgemm/dsyrk for large dense blocks >16GB
This patch adds the missing support for native BLAS matrix
multiplications (DGEMM, and DSYRK aka TSMM) for large dense blocks that
are internally composed of multiple arrays, each with up to MAX_INT
(2^31-1) cells. Due to issues with concurrent multi-threaded native
BLAS operations (e.g., w/ Intel MKL) we sequentially execute these
independent block operations.
---
.../sysds/runtime/matrix/data/LibMatrixNative.java | 50 +++++++++++++++++++---
1 file changed, 43 insertions(+), 7 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index 9a55111..5c92253 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -30,6 +30,8 @@ import org.apache.sysds.api.DMLScript;
import org.apache.sysds.conf.ConfigurationManager;
import org.apache.sysds.conf.DMLConfig;
import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockFactory;
import org.apache.sysds.utils.NativeHelper;
import org.apache.sysds.utils.Statistics;
@@ -81,7 +83,8 @@ public class LibMatrixNative
boolean isValidForNative = !isMatMultMemoryBound(m1.rlen,
m1.clen, m2.clen)
&& !m1.isInSparseFormat() && !m2.isInSparseFormat()
- && m1.getDenseBlock().isContiguous() &&
m2.getDenseBlock().isContiguous() //contiguous but not allocated
+ && (m1.getDenseBlock().isContiguous() ||
!isSinglePrecision())
+ && m2.getDenseBlock().isContiguous() //contiguous but
not allocated
&& 8L * ret.getLength() < Integer.MAX_VALUE;
if( NativeHelper.isNativeLibraryLoaded() && isValidForNative )
@@ -89,7 +92,7 @@ public class LibMatrixNative
ret.sparse = false;
ret.allocateDenseBlock();
long start = DMLScript.STATISTICS ? System.nanoTime() :
0;
- long nnz;
+ long nnz = 0;
if( isSinglePrecision() ) {
FloatBuffer fin1 =
toFloatBuffer(m1.getDenseBlockValues(), inBuff, true);
FloatBuffer fin2 =
toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true);
@@ -99,8 +102,23 @@ public class LibMatrixNative
fromFloatBuffer(outBuff.get(),
ret.getDenseBlockValues());
}
else {
- nnz =
NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
- ret.getDenseBlockValues(),
m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
+ DenseBlock a = m1.getDenseBlock();
+ if( a.isContiguous() ) {
+ nnz =
NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+ ret.getDenseBlockValues(),
m1.rlen, m1.clen, m2.clen, k);
+ }
+ else {
+ //sequential processing of individual
blocks to
+ //avoid segmentation faults with
concurrent multi-threaded BLAS calls
+ for(int bix = 0; bix < a.numBlocks();
bix++) {
+ double[] tmp = new
double[a.blockSize(bix)*m2.clen];
+ nnz +=
NativeHelper.dmmdd(a.valuesAt(bix), m2.getDenseBlockValues(),
+ tmp, a.blockSize(bix),
m1.clen, m2.clen, k);
+ int rl = bix * a.blockSize();
+ ret.getDenseBlock().set(rl,
rl+a.blockSize(bix), 0, m2.clen,
+
DenseBlockFactory.createDenseBlock(tmp, new int[]{a.blockSize(bix),m2.clen}));
+ }
+ }
}
if(nnz > -1) {
@@ -134,14 +152,32 @@ public class LibMatrixNative
if( NativeHelper.isNativeLibraryLoaded() && (ret.clen > 1 ||
ret.getLength()==1)
&& !LibMatrixMult.isOuterProductTSMM(m1.rlen, m1.clen,
leftTrans)
- && (!m1.sparse && m1.getDenseBlock().isContiguous() ) )
+ && !m1.sparse && (m1.getDenseBlock().isContiguous() |
leftTrans) )
{
ret.sparse = false;
ret.allocateDenseBlock();
long start = DMLScript.STATISTICS ? System.nanoTime() :
0;
- long nnz = NativeHelper.tsmm(m1.getDenseBlockValues(),
- ret.getDenseBlockValues(), m1.rlen, m1.clen,
leftTrans, k);
+ DenseBlock a = m1.getDenseBlock();
+ double[] cvals = ret.getDenseBlockValues();
+ long nnz = 0;
+ if( a.isContiguous() ) {
+ nnz = NativeHelper.tsmm(a.valuesAt(0),
+ cvals, m1.rlen, m1.clen, leftTrans, k);
+ }
+ else { //large blocks (but only leftTrans)
+ //sequential processing of individual blocks to
+ //avoid segmentation faults with concurrent
multi-threaded BLAS calls
+ IntStream.range(0, a.numBlocks()).forEach(bix
-> {
+ double[] tmp = new
double[m1.clen*m1.clen];
+ NativeHelper.tsmm(a.valuesAt(bix),
+ tmp, a.blockSize(bix), m1.clen,
leftTrans, k);
+ LibMatrixMult.vectAdd(tmp, cvals, 0, 0,
m1.clen*m1.clen);
+ });
+ nnz = ret.recomputeNonZeros();
+ }
+ //TODO flip upper triangular matrix down for consistent
+ //representations with the default java implementation?
if(nnz > -1) {
if(DMLScript.STATISTICS) {