This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 3ce3270 [SYSTEMDS-2760] Performance sparse-sparse transpose operations
3ce3270 is described below
commit 3ce32709366203eccb8da9063cedafa567f8d3bf
Author: Matthias Boehm <[email protected]>
AuthorDate: Thu Dec 17 23:04:18 2020 +0100
[SYSTEMDS-2760] Performance sparse-sparse transpose operations
This patch makes two minor performance improvements to multi-threaded
sparse-sparse transpose operations. The tasks get column groups of the
input and append outputs to the sparse output rows. For tall&skinny
sparse matrices there were two shortcomings.
First, we aligned the minimum column group size to multiples of 8, which
is only required for dense-dense transpose operations. Applying this
alignment only for dense outputs improved performance for such
tall-and-skinny sparse matrices.
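For intuition, with the 50-column matrix from the measurement below and an
assumed thread count of k=16 (the thread count is illustrative, not part of
the patch), the task sizing changes roughly as follows:

  // illustrative sketch of task sizing, not the patched method itself
  int clen = 50, k = 16;                              // columns, threads (assumed)
  int blklen = (int) Math.ceil((double) clen / k);    // 4 columns per task
  int oldBlklen = blklen + ((blklen % 8 != 0) ? 8 - blklen % 8 : 0); // aligned to 8
  int oldTasks = (int) Math.ceil((double) clen / oldBlklen); // 7 tasks -> 7 of 16 threads busy
  int newTasks = (int) Math.ceil((double) clen / blklen);    // 13 tasks -> better utilization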
Second, if the number of cores is larger than the number of columns,
cache blocking and the related position maintenance only add unnecessary
overhead. Accordingly, we added a special case for these scenarios.
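In that situation each column-wise task covers exactly one output row, so a
plain sequential scan over the input suffices. A minimal sketch of this
special case (simplified from the diff below, using the same variables a, c,
cnt, cl, ru as there):

  // simplified single-target-row case: this task writes only output row cl
  if( cnt[cl] > 0 )
    c.allocate(cl, cnt[cl]);              // pre-allocate exact row capacity
  for( int i = 0; i < ru; i++ ) {         // one pass over all input rows
    if( a.isEmpty(i) ) continue;
    int apos = a.pos(i), alen = a.size(i);
    int[] aix = a.indexes(i);
    double[] avals = a.values(i);
    for( int j = apos; j < apos + alen && aix[j] <= cl; j++ ) // indexes sorted
      if( aix[j] == cl )                  // value in column cl -> output row cl
        c.append(cl, i, avals[j]);
  }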
On a scenario of a 2.5M x 50 matrix with sparsity = 0.1, the total
execution time of 10 transpose operations improved from 2.7s to 2.5s
(with improvement 1) and to 1.9s (with improvements 1 and 2),
respectively.
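For reference, a hypothetical harness for such a measurement could look as
follows (the randOperations/transpose calls are assumed from the existing
library API, not introduced by this commit; absolute timings vary by machine):

  // hypothetical micro-benchmark sketch, API names assumed
  import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
  import org.apache.sysds.runtime.matrix.data.MatrixBlock;

  public class TransposePerfSketch {
    public static void main(String[] args) {
      int k = Runtime.getRuntime().availableProcessors();
      // 2.5M x 50 sparse input with sparsity 0.1
      MatrixBlock in = MatrixBlock.randOperations(2_500_000, 50, 0.1, 0, 1, "uniform", 7);
      long t0 = System.nanoTime();
      for( int i = 0; i < 10; i++ ) {
        MatrixBlock out = new MatrixBlock(in.getNumColumns(), in.getNumRows(), true);
        LibMatrixReorg.transpose(in, out, k); // multi-threaded transpose
      }
      System.out.println("10x t(X): " + (System.nanoTime() - t0) / 1e9 + " s");
    }
  }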
---
.../sysds/runtime/matrix/data/LibMatrixReorg.java | 101 ++++++++++++---------
1 file changed, 60 insertions(+), 41 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index edc38a7..ec1731f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -205,7 +205,7 @@ public class LibMatrixReorg
 			boolean row = (in.sparse || in.rlen >= in.clen) && !out.sparse;
 			int len = row ? in.rlen : in.clen;
 			int blklen = (int)(Math.ceil((double)len/k));
-			blklen += (blklen%8 != 0)?8-blklen%8:0;
+			blklen += (!out.sparse && (blklen%8)!=0) ? 8-blklen%8 : 0;
 			for( int i=0; i<k & i*blklen<len; i++ )
 				tasks.add(new TransposeTask(in, out, row, i*blklen, Math.min((i+1)*blklen, len), cnt));
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
@@ -861,50 +861,69 @@ public class LibMatrixReorg
 		SparseBlock a = in.getSparseBlock();
 		SparseBlock c = out.getSparseBlock();
-		//allocate output sparse rows
-		if( cnt != null ) {
-			for( int i=cl; i<cu; i++ )
-				if( cnt[i] > 0 )
-					c.allocate(i, cnt[i]);
+		if( cu-cl == 1 ) { // SINGLE TARGET ROW
+			//number of columns <= num cores, use sequential scan over input
+			//and avoid cache blocking and temporary position maintenance
+			if( cnt[cl] > 0 )
+				c.allocate(cl, cnt[cl]);
+
+			for( int i=0; i<ru; i++ ) {
+				if( a.isEmpty(i) ) continue;
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				for( int j=apos; j<apos+alen && aix[j]<=cl; j++ )
+					if( aix[j] == cl )
+						c.append(cl, i, avals[j]);
+			}
 		}
-
-		//blocking according to typical L2 cache sizes w/ awareness of sparsity
-		final long xsp = (long)in.rlen*in.clen/in.nonZeros;
-		final int blocksizeI = Math.max(128, (int) (8*xsp));
-		final int blocksizeJ = Math.max(128, (int) (8*xsp));
-
-		//temporary array for block boundaries (for preventing binary search)
-		int[] ix = new int[Math.min(blocksizeI, ru-rl)];
-
-		//blocked execution
-		for( int bi=rl; bi<ru; bi+=blocksizeI )
-		{
-			Arrays.fill(ix, 0);
-			//find column starting positions
-			int bimin = Math.min(bi+blocksizeI, ru);
-			if( cl > 0 ) {
-				for( int i=bi; i<bimin; i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int j = a.posFIndexGTE(i, cl);
-					ix[i-bi] = (j>=0) ? j : a.size(i);
-				}
+		else { //GENERAL CASE
+			//allocate output sparse rows
+			if( cnt != null ) {
+				for( int i=cl; i<cu; i++ )
+					if( cnt[i] > 0 )
+						c.allocate(i, cnt[i]);
 			}
-			for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
-				int bjmin = Math.min(bj+blocksizeJ, cu);
-				//core block transpose operation
-				for( int i=bi; i<bimin; i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					int j = ix[i-bi] + apos; //last block boundary
-					for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
-						c.allocate(aix[j], ennz2, n2);
-						c.append(aix[j], i, avals[j]);
+			//blocking according to typical L2 cache sizes w/ awareness of sparsity
+			final long xsp = (long)in.rlen*in.clen/in.nonZeros;
+			final int blocksizeI = Math.max(128, (int) (8*xsp));
+			final int blocksizeJ = Math.max(128, (int) (8*xsp));
+
+			//temporary array for block boundaries (for preventing binary search)
+			int[] ix = new int[Math.min(blocksizeI, ru-rl)];
+
+			//blocked execution
+			for( int bi=rl; bi<ru; bi+=blocksizeI )
+			{
+				Arrays.fill(ix, 0);
+				//find column starting positions
+				int bimin = Math.min(bi+blocksizeI, ru);
+				if( cl > 0 ) {
+					for( int i=bi; i<bimin; i++ ) {
+						if( a.isEmpty(i) ) continue;
+						int j = a.posFIndexGTE(i, cl);
+						ix[i-bi] = (j>=0) ? j : a.size(i);
+					}
+				}
+
+				for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
+					int bjmin = Math.min(bj+blocksizeJ, cu);
+					//core block transpose operation
+					for( int i=bi; i<bimin; i++ ) {
+						if( a.isEmpty(i) ) continue;
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						int j = ix[i-bi] + apos; //last block boundary
+						for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
+							c.allocate(aix[j], ennz2, n2);
+							c.append(aix[j], i, avals[j]);
+						}
+						ix[i-bi] = j - apos; //keep block boundary
 					}
-					ix[i-bi] = j - apos; //keep block boundary
 				}
 			}
 		}