Repository: incubator-systemml Updated Branches: refs/heads/master 6b1572e4b -> 20e33bbe8
[SYSTEMML-1311] Performance dataset/libsvm converters (sparse vectors) This patch modifies the existing dataset-binary block and libsvm-binary block converters to properly handle sparse input vectors, i.e., scan non-zeros instead of repeated lookups to avoid unnecessary binary search operations (O(nnz) vs O(N log nnz), with nnz<=N). On an 81M x 784 MNIST libsvm-binary scenario, this improved performance from 17min24s to 16min14s. Furthermore, this patch also cleans up the creation of dense/sparse output vectors for binary block-dataset converters. Finally, the change also includes various cleanups: * Remove unnecessary gc hint in tear down of all tests * Reduce unnecessarily large number of columns in dataset-matrix conversion test * Fix gpu-related import and formatting issues Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/20e33bbe Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/20e33bbe Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/20e33bbe Branch: refs/heads/master Commit: 20e33bbe80aa10a1b11ea179e82feb15408f6cf8 Parents: 6b1572e Author: Matthias Boehm <mboe...@gmail.com> Authored: Tue Mar 7 14:26:36 2017 -0800 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Tue Mar 7 14:29:46 2017 -0800 ---------------------------------------------------------------------- .../instructions/gpu/context/GPUContext.java | 1 - .../spark/utils/RDDConverterUtils.java | 97 ++++++++++++-------- .../org/apache/sysml/utils/GPUStatistics.java | 4 +- .../test/integration/AutomatedTestBase.java | 2 - .../DataFrameMatrixConversionTest.java | 2 +- 5 files changed, 63 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index a7c0ab8..b792882 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -21,7 +21,6 @@ package org.apache.sysml.runtime.instructions.gpu.context; import java.util.ArrayList; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.sysml.api.DMLScript; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java index 902924a..f370890 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java @@ -36,7 +36,7 @@ import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.ml.feature.LabeledPoint; -import org.apache.spark.ml.linalg.DenseVector; +import org.apache.spark.ml.linalg.SparseVector; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.ml.linalg.VectorUDT; import org.apache.spark.ml.linalg.Vectors; @@ -64,7 +64,6 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixCell; 
import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.matrix.data.OutputInfo; -import org.apache.sysml.runtime.matrix.data.SparseBlock; import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue; import org.apache.sysml.runtime.matrix.mapred.ReblockBuffer; import org.apache.sysml.runtime.util.DataConverter; @@ -416,6 +415,16 @@ public class RDDConverterUtils return lnnz; } + private static Vector createVector(MatrixBlock row) { + if( row.isEmptyBlock(false) ) //EMPTY SPARSE ROW + return Vectors.sparse(row.getNumColumns(), new int[0], new double[0]); + else if( row.isInSparseFormat() ) //SPARSE ROW + return Vectors.sparse(row.getNumColumns(), + row.getSparseBlock().indexes(0), row.getSparseBlock().values(0)); + else // DENSE ROW + return Vectors.dense(row.getDenseBlock()); + } + ///////////////////////////////// // BINARYBLOCK-SPECIFIC FUNCTIONS @@ -433,24 +442,9 @@ public class RDDConverterUtils throws Exception { ArrayList<LabeledPoint> ret = new ArrayList<LabeledPoint>(); - for( int i=0; i<arg0.getNumRows(); i++ ) - { + for( int i=0; i<arg0.getNumRows(); i++ ) { MatrixBlock tmp = arg0.sliceOperations(i, i, 0, arg0.getNumColumns()-2, new MatrixBlock()); - double[] data = DataConverter.convertToDoubleVector(tmp); - if( tmp.isEmptyBlock(false) ) //EMPTY SPARSE ROW - { - ret.add(new LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), Vectors.sparse(0, new int[0], new double[0]))); - } - else if( tmp.isInSparseFormat() ) //SPARSE ROW - { - SparseBlock sblock = tmp.getSparseBlock(); - ret.add(new LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), - Vectors.sparse(sblock.size(0), sblock.indexes(0), sblock.values(0)))); - } - else // DENSE ROW - { - ret.add(new LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), Vectors.dense(data))); - } + ret.add(new LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), createVector(tmp))); } return ret.iterator(); @@ -808,6 +802,8 @@ public class RDDConverterUtils { 
Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long> tmp = arg0.next(); org.apache.spark.mllib.regression.LabeledPoint row = tmp._1(); + boolean lsparse = _sparseX || (!_labels && + row.features() instanceof org.apache.spark.mllib.linalg.SparseVector); long rowix = tmp._2() + 1; long rix = UtilFunctions.computeBlockIndex(rowix, _brlen); @@ -818,7 +814,7 @@ public class RDDConverterUtils if( ix[0] !=null ) flushBlocksToList(ix, mb, ret); long len = UtilFunctions.computeBlockSize(_rlen, rix, _brlen); - createBlocks(rowix, (int)len, ix, mb); + createBlocks(rowix, (int)len, ix, mb, lsparse); } //process row data @@ -828,12 +824,26 @@ public class RDDConverterUtils _aNnz.add((val != 0) ? 1 : 0); } else { //features - for( int cix=1, pix=0; cix<=ncblks; cix++ ) { - int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen); - for( int j=0; j<lclen; j++ ) - mb[cix-1].appendValue(pos, j, row.features().apply(pix++)); + int lnnz = row.features().numNonzeros(); + if( row.features() instanceof org.apache.spark.mllib.linalg.SparseVector ) + { + org.apache.spark.mllib.linalg.SparseVector srow = + (org.apache.spark.mllib.linalg.SparseVector) row.features(); + for( int k=0; k<lnnz; k++ ) { + int gix = srow.indices()[k]+1; + int cix = (int)UtilFunctions.computeBlockIndex(gix, _bclen); + int j = UtilFunctions.computeCellInBlock(gix, _bclen); + mb[cix-1].appendValue(pos, j, srow.values()[k]); + } + } + else { //dense + for( int cix=1, pix=0; cix<=ncblks; cix++ ) { + int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen); + for( int j=0; j<lclen; j++ ) + mb[cix-1].appendValue(pos, j, row.features().apply(pix++)); + } } - _aNnz.add(row.features().numNonzeros()); + _aNnz.add(lnnz); } } @@ -844,7 +854,7 @@ public class RDDConverterUtils } // Creates new state of empty column blocks for current global row index. 
- private void createBlocks(long rowix, int lrlen, MatrixIndexes[] ix, MatrixBlock[] mb) + private void createBlocks(long rowix, int lrlen, MatrixIndexes[] ix, MatrixBlock[] mb, boolean lsparse) { //compute row block index and number of column blocks long rix = UtilFunctions.computeBlockIndex(rowix, _brlen); @@ -852,9 +862,9 @@ public class RDDConverterUtils //create all column blocks (assume dense since csv is dense text format) for( int cix=1; cix<=ncblks; cix++ ) { - int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen); + int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen); ix[cix-1] = new MatrixIndexes(rix, cix); - mb[cix-1] = new MatrixBlock(lrlen, lclen, _sparseX); + mb[cix-1] = new MatrixBlock(lrlen, lclen, lsparse); mb[cix-1].allocateDenseOrSparseBlock(); } } @@ -1058,8 +1068,18 @@ public class RDDConverterUtils //append data to matrix blocks if( _isVector ) { Vector vect = (Vector) obj; - for( int j=0; j<lclen; j++ ) - mb[cix-1].appendValue(pos, j, vect.apply(pix++)); + if( vect instanceof SparseVector ) { + SparseVector svect = (SparseVector) vect; + int[] svectIx = svect.indices(); + while( pix<svectIx.length && svectIx[pix]<cix*_bclen ) { + int j = UtilFunctions.computeCellInBlock(svectIx[pix]+1, _bclen); + mb[cix-1].appendValue(pos, j, svect.values()[pix++]); + } + } + else { //dense + for( int j=0; j<lclen; j++ ) + mb[cix-1].appendValue(pos, j, vect.apply(pix++)); + } } else { //row Row row = (Row) obj; @@ -1176,13 +1196,18 @@ public class RDDConverterUtils //copy block data into target row if( _toVector ) { - double[] tmp = new double[_clen]; - for(Tuple2<Long, MatrixBlock> kv : arg0._2()) { - int cl = (kv._1().intValue()-1)*_bclen; - MatrixBlock mb = kv._2(); - DataConverter.copyToDoubleVector(mb, tmp, cl); + if( _clen <= _bclen ) { //single block + row[1] = createVector(arg0._2().iterator().next()._2()); + } + else { //multiple column blocks + double[] tmp = new double[_clen]; + for(Tuple2<Long, MatrixBlock> kv : 
arg0._2()) { + int cl = (kv._1().intValue()-1)*_bclen; + MatrixBlock mb = kv._2(); + DataConverter.copyToDoubleVector(mb, tmp, cl); + } + row[1] = Vectors.dense(tmp); } - row[1] = new DenseVector(tmp); } else { for(Tuple2<Long, MatrixBlock> kv : arg0._2()) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index 122a7ea..8347902 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -155,9 +155,7 @@ public class GPUStatistics { StringBuffer sb = new StringBuffer(); HashMap<String, Long> miscTimerMap = _cpInstMiscTime.get(instructionName); if (miscTimerMap != null) { - List<Map.Entry<String, Long>> sortedList - = new ArrayList< - Map.Entry<String, Long>>(miscTimerMap.entrySet()); + List<Map.Entry<String, Long>> sortedList = new ArrayList<Map.Entry<String, Long>>(miscTimerMap.entrySet()); // Sort the times to display by the most expensive first Collections.sort(sortedList, new Comparator<Map.Entry<String, Long>>() { @Override http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java index e6123ef..8b9f10c 100644 --- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java +++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java @@ -1487,8 +1487,6 @@ public abstract class AutomatedTestBase } TestUtils.clearAssertionInformation(); - - System.gc(); } /** 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java index 199100e..bf5d33d 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java @@ -50,7 +50,7 @@ public class DataFrameMatrixConversionTest extends AutomatedTestBase private final static int rows3 = 7; private final static int cols1 = 745; private final static int cols2 = 1264; - private final static int cols3 = 1003820; + private final static int cols3 = 10038; private final static double sparsity1 = 0.9; private final static double sparsity2 = 0.1; private final static double eps=0.0000000001;