This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit e7abe8765b06f7a9a59c349d6a2f15f67f15685f Author: baunsgaard <[email protected]> AuthorDate: Mon May 10 17:47:25 2021 +0200 [SYSTEMDS-2997] CLA MatrixBlock Dictionary - Add Basic MatrixBlock Dictionary - CocodeMatrixCost function for estimating the matrix multiplication cost. --- .../runtime/compress/CompressionSettings.java | 23 ++- .../compress/CompressionSettingsBuilder.java | 19 +- .../runtime/compress/cocode/AColumnCoCoder.java | 10 +- .../sysds/runtime/compress/cocode/CoCodeCost.java | 8 +- .../{CoCodeCost.java => CoCodeCostMatrixMult.java} | 97 ++++----- .../runtime/compress/cocode/PlanningCoCoder.java | 8 +- .../runtime/compress/colgroup/ColGroupSDC.java | 28 ++- .../compress/colgroup/ColGroupSDCSingle.java | 28 ++- .../runtime/compress/colgroup/ColGroupValue.java | 67 ++----- .../compress/colgroup/dictionary/ADictionary.java | 4 +- .../compress/colgroup/dictionary/Dictionary.java | 3 - .../compress/colgroup/dictionary/QDictionary.java | 50 +++-- .../colgroup/dictionary/SparseDictionary.java | 218 +++++++++++++++++++++ .../compress/estim/CompressedSizeEstimator.java | 19 +- .../estim/CompressedSizeEstimatorSample.java | 8 +- .../compress/estim/CompressedSizeInfoColGroup.java | 8 + 16 files changed, 440 insertions(+), 158 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index f1ff14e..7be74ef 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -21,6 +21,8 @@ package org.apache.sysds.runtime.compress; import java.util.EnumSet; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType; import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType; @@ -29,6 +31,7 @@ import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType; * CompressionSettingsBuilder for default non static parameters. */ public class CompressionSettings { + private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); /** * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not @@ -79,8 +82,14 @@ public class CompressionSettings { /** The selected method for column partitioning used in CoCoding compressed columns */ public final PartitionerType columnPartitioner; - /** The maximum number of columns CoCoded if the Static CoCoding strategy is selected */ - public final int maxStaticColGroupCoCode; + /** The maximum number of columns CoCoded allowed */ + public final int maxColGroupCoCode; + + /** + * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects + * aggressively likely coCoding is used. + */ + public final double coCodePercentage; /** * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression @@ -91,7 +100,7 @@ public class CompressionSettings { protected CompressionSettings(double samplingRatio, boolean allowSharedDictionary, String transposeInput, boolean skipList, int seed, boolean investigateEstimate, boolean lossy, EnumSet<CompressionType> validCompressions, boolean sortValuesByLength, PartitionerType columnPartitioner, - int maxStaticColGroupCoCode) { + int maxColGroupCoCode, double coCodePercentage) { this.samplingRatio = samplingRatio; this.allowSharedDictionary = allowSharedDictionary; this.transposeInput = transposeInput; @@ -102,7 +111,9 @@ public class CompressionSettings { this.lossy = lossy; this.sortValuesByLength = sortValuesByLength; this.columnPartitioner = columnPartitioner; - this.maxStaticColGroupCoCode = maxStaticColGroupCoCode; + this.maxColGroupCoCode = maxColGroupCoCode; + this.coCodePercentage = coCodePercentage; + LOG.debug(this); } @Override @@ -113,6 +124,10 @@ public class CompressionSettings { sb.append("\n DDC1 share dict: " + allowSharedDictionary); sb.append("\n Partitioner: " + columnPartitioner); sb.append("\n Lossy: " + lossy); + sb.append("\n sortValuesByLength: " + sortValuesByLength); + sb.append("\n column Partitioner: " + columnPartitioner); + sb.append("\n max Static ColGroup CoCode " + maxColGroupCoCode); + sb.append("\n max cocodePercentage " + coCodePercentage); // If needed for debugging add more fields to the printing. return sb.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index a53bcfa..72aeeeb 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -41,7 +41,8 @@ public class CompressionSettingsBuilder { private EnumSet<CompressionType> validCompressions; private boolean sortValuesByLength = true; private PartitionerType columnPartitioner; - private int maxStaticColGroupCoCode = 10; + private int maxStaticColGroupCoCode = 10000; + private double coCodePercentage = 0.01; public CompressionSettingsBuilder() { @@ -236,6 +237,20 @@ public class CompressionSettingsBuilder { } /** + * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that + * higher values results in more coCoding while lower values result in less. + * + * Note that with high coCoding the compression ratio would possibly be lower. + * + * @param coCodePercentage The percentage to set. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { + this.coCodePercentage = coCodePercentage; + return this; + } + + /** * Create the CompressionSettings object to use in the compression. * * @return The CompressionSettings @@ -243,6 +258,6 @@ public class CompressionSettingsBuilder { public CompressionSettings create() { return new CompressionSettings(samplingRatio, allowSharedDictionary, transposeInput, skipList, seed, investigateEstimate, lossy, validCompressions, sortValuesByLength, columnPartitioner, - maxStaticColGroupCoCode); + maxStaticColGroupCoCode, coCodePercentage); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java index 4dc1f8e..1ede652 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java @@ -30,14 +30,14 @@ public abstract class AColumnCoCoder { protected static final Log LOG = LogFactory.getLog(AColumnCoCoder.class.getName()); - protected CompressedSizeEstimator _est; - protected CompressionSettings _cs; - protected int _numRows; + final protected CompressedSizeEstimator _est; + final protected CompressionSettings _cs; + // final protected int _numRows; protected AColumnCoCoder(CompressedSizeEstimator sizeEstimator, CompressionSettings cs, int numRows) { _est = sizeEstimator; _cs = cs; - _numRows = numRows; + // _numRows = numRows; } /** @@ -64,7 +64,7 @@ public abstract class AColumnCoCoder { CompressedSizeInfoColGroup rhs) { int[] joined = Util.join(lhs.getColumns(), rhs.getColumns()); int numVals = lhs.getNumVals() + rhs.getNumVals(); - return new CompressedSizeInfoColGroup(joined, numVals, _numRows); + return new CompressedSizeInfoColGroup(joined, numVals, _est.getNumRows()); } protected CompressedSizeInfoColGroup analyze(CompressedSizeInfoColGroup g) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java index 1cce5b1..d2aec2c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java @@ -48,15 +48,11 @@ public class CoCodeCost extends AColumnCoCoder { */ private final int largestDistinct; - private final int toSmallForAnalysis; - - private final double percentMaxCardinality = 0.08; + private final static int toSmallForAnalysis = 64; protected CoCodeCost(CompressedSizeEstimator sizeEstimator, CompressionSettings cs, int numRows) { super(sizeEstimator, cs, numRows); - largestDistinct = Math.max(256, (int) (_numRows * percentMaxCardinality)); - toSmallForAnalysis = largestDistinct / 4; - LOG.error("CocodeCost largest Distinct: "+ largestDistinct + " toSmallForAnalysis: " + toSmallForAnalysis); + largestDistinct = Math.min(4096, Math.max(256, (int) (sizeEstimator.getNumRows() * cs.coCodePercentage))); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java similarity index 56% copy from src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java copy to src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java index 1cce5b1..09a9990 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java @@ -20,13 +20,12 @@ package org.apache.sysds.runtime.compress.cocode; import java.util.ArrayList; -import java.util.Comparator; +import java.util.Arrays; import java.util.List; import java.util.PriorityQueue; import java.util.Queue; import org.apache.sysds.runtime.compress.CompressionSettings; -import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType; import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; @@ -39,7 +38,7 @@ import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; * This method allows us to compress many more columns than the BinPacking * */ -public class CoCodeCost extends AColumnCoCoder { +public class CoCodeCostMatrixMult extends AColumnCoCoder { /** * This value specifies the maximum distinct count allowed int a coCoded group. Note that this value is the number @@ -50,13 +49,11 @@ public class CoCodeCost extends AColumnCoCoder { private final int toSmallForAnalysis; - private final double percentMaxCardinality = 0.08; - - protected CoCodeCost(CompressedSizeEstimator sizeEstimator, CompressionSettings cs, int numRows) { - super(sizeEstimator, cs, numRows); - largestDistinct = Math.max(256, (int) (_numRows * percentMaxCardinality)); - toSmallForAnalysis = largestDistinct / 4; - LOG.error("CocodeCost largest Distinct: "+ largestDistinct + " toSmallForAnalysis: " + toSmallForAnalysis); + protected CoCodeCostMatrixMult(CompressedSizeEstimator e, CompressionSettings cs, int numRows) { + super(e, cs, numRows); + largestDistinct = Math.max(256, (int) (_est.getNumRows() * _est.getNumColumns() * cs.coCodePercentage * 0.2)); + toSmallForAnalysis = Math.min(Math.max(256, largestDistinct / 4), 1028); + LOG.debug("CocodeCost largest Distinct: " + largestDistinct + " toSmallForAnalysis: " + toSmallForAnalysis); } @Override @@ -67,54 +64,66 @@ public class CoCodeCost extends AColumnCoCoder { private List<CompressedSizeInfoColGroup> join(List<CompressedSizeInfoColGroup> currentGroups) { - Comparator<CompressedSizeInfoColGroup> comp = Comparator.comparing(CompressedSizeInfoColGroup::getNumVals); - Queue<CompressedSizeInfoColGroup> que = new PriorityQueue<>(currentGroups.size(), comp); - List<CompressedSizeInfoColGroup> ret = new ArrayList<>(); + Queue<CostOfJoin> que = new PriorityQueue<>(currentGroups.size()); - for(CompressedSizeInfoColGroup g : currentGroups) { - if(g.getBestCompressionType() == CompressionType.CONST) - ret.add(g); - else - que.add(g); - } + List<CompressedSizeInfoColGroup> ret = new ArrayList<>(); + for(CompressedSizeInfoColGroup g : currentGroups) + que.add(new CostOfJoin(g)); - boolean finished = false; - while(!finished) { + while(true) { if(que.peek() != null) { - CompressedSizeInfoColGroup l = que.poll(); + final CostOfJoin l = que.poll(); if(que.peek() != null) { - CompressedSizeInfoColGroup r = que.poll(); - int worstCaseJoinedSize = l.getNumVals() * r.getNumVals(); - if(worstCaseJoinedSize < toSmallForAnalysis) - que.add(joinWithoutAnalysis(l, r)); - else if(worstCaseJoinedSize < largestDistinct){ - - CompressedSizeInfoColGroup g = joinWithAnalysis(l, r); - if(g.getNumVals() < largestDistinct) - que.add(joinWithAnalysis(l, r)); - else{ - finished = true; - que.add(l); - que.add(r); - } - } + final CostOfJoin r = que.poll(); + final double costIndividual = (l.cost + r.cost); + final CostOfJoin g = new CostOfJoin(joinWithAnalysis(l.elm, r.elm)); + if(g.cost < costIndividual) + que.add(g); else { - finished = true; - que.add(l); + ret.add(l.elm); que.add(r); } } else { - que.add(l); - finished = true; + ret.add(l.elm); + break; } } else - finished = true; + break; } - for(CompressedSizeInfoColGroup g : que) - ret.add(g); + for(CostOfJoin g : que) + ret.add(g.elm); return ret; } + + private class CostOfJoin implements Comparable<CostOfJoin> { + protected final CompressedSizeInfoColGroup elm; + protected final double cost; + + protected CostOfJoin(CompressedSizeInfoColGroup elm) { + this.elm = elm; + final int nRows = _est.getNumRows(); + final double commonFraction = elm.getMostCommonFraction(); + final double rowsToProcess = commonFraction > 0.2 ? nRows * (1 - Math.min(commonFraction, 0.95)) : nRows; + this.cost = rowsToProcess + elm.getNumVals() * elm.getColumns().length; + + } + + @Override + public int compareTo(CostOfJoin o) { + return cost == o.cost ? 0 : cost > o.cost ? 1 : -1; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("\n"); + sb.append(cost); + sb.append(" - "); + sb.append(Arrays.toString(elm.getColumns())); + return sb.toString(); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java index 46bf988..94572c9 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java @@ -41,7 +41,7 @@ public class PlanningCoCoder { * The Valid coCoding techniques */ public enum PartitionerType { - BIN_PACKING, STATIC, COST; + BIN_PACKING, STATIC, COST, COST_MATRIX_MULT; } /** @@ -66,10 +66,10 @@ public class PlanningCoCoder { constantGroups = new ArrayList<>(); List<CompressedSizeInfoColGroup> newGroups = new ArrayList<>(); mem = new Memorizer(); - for(CompressedSizeInfoColGroup g : colInfos.getInfo()){ + for(CompressedSizeInfoColGroup g : colInfos.getInfo()) { if(g.getBestCompressionType() == CompressionType.CONST) constantGroups.add(g); - else{ + else { mem.put(g); newGroups.add(g); } @@ -98,6 +98,8 @@ public class PlanningCoCoder { return new CoCodeStatic(est, cs, numRows); case COST: return new CoCodeCost(est, cs, numRows); + case COST_MATRIX_MULT: + return new CoCodeCostMatrixMult(est, cs, numRows); default: throw new RuntimeException("Unsupported column group partitioner: " + type.toString()); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java index 1acbfdc..268f294 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java @@ -643,7 +643,7 @@ public class ColGroupSDC extends ColGroupValue { else itThis.next(); } - + while(itThat.hasNext()) { final int fr = that.getIndex(itThat.getDataIndexAndIncrement()); that._dict.addToEntry(ret, fr, offsetToDefaultThis, nCol); @@ -717,7 +717,31 @@ public class ColGroupSDC extends ColGroupValue { @Override public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle that, Dictionary ret, boolean preModified) { - throw new NotImplementedException(); + final AIterator itThat = that._indexes.getIterator(); + final AIterator itThis = _indexes.getIterator(); + final int nCol = that._colIndexes.length; + final int defThis = this.getNumValues() * nCol - nCol; + + if(preModified) { + while(itThat.hasNext() && itThis.hasNext()) { + if(itThat.value() == itThis.value()) { + itThat.next(); + final int to = getIndex(itThis.getDataIndexAndIncrement()); + that._dict.addToEntry(ret, 0, to, nCol); + } + else if(itThat.value() < itThis.value()) { + itThat.next(); + that._dict.addToEntry(ret, 0, defThis, nCol); + } + else + itThis.next(); + } + } + else { + throw new NotImplementedException(); + } + + return ret; } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java index 85aab80..14afdf4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java @@ -565,7 +565,29 @@ public class ColGroupSDCSingle extends ColGroupValue { @Override public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, Dictionary ret, boolean preModified) { - throw new NotImplementedException(); + final AIterator itThat = that._indexes.getIterator(); + final AIterator itThis = _indexes.getIterator(); + final int nCol = that._colIndexes.length; + // final int defThat = that.getNumValues() * nCol - nCol; + + if(preModified) { + while(itThat.hasNext() && itThis.hasNext()) { + if(itThat.value() == itThis.value()) { + final int fr = that.getIndex(itThat.getDataIndexAndIncrement()); + that._dict.addToEntry(ret, fr, 1, nCol); + } + else if(itThat.value() < itThis.value()) + itThat.next(); + else{ + itThis.next(); + // that._dict.addToEntry(ret, defThat, 0, nCol); + } + } + } + else { + throw new NotImplementedException(); + } + return ret; } @Override @@ -600,10 +622,6 @@ public class ColGroupSDCSingle extends ColGroupValue { itThis.next(); } - // while(itThat.hasNext()) { - // final int fr = that.getIndex(itThat.getDataIndexAndIncrement()); - // that._dict.addToEntry(ret, fr, 1, nCol); - // } return ret; } else { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java index 223c3a0..dc52ca1 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java @@ -828,13 +828,23 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea if(sameIndexStructure(lhs)) { int[] agI = getCounts(); - for(int a = 0, off = 0; a < nvL; a++, off += nvL + 1) - leftMultDictEntry(agI[a], off, nvL, lCol, rCol, lhs, numCols, lhValues, rhValues, result); + for(int i = 0; i < agI.length; i++) { + for(int l = 0; l < lCol; l++) { + final int leftOff = lhs._colIndexes[l] * numCols; + final double lhV = lhValues[i * lCol + l] * agI[i]; + if(lhV != 0) + for(int r = 0; r < rCol; r++) { + final double rhV = rhValues[i * rCol + r]; + final double va = lhV * rhV; + result[leftOff + this._colIndexes[r]] += va; + } + } + } } else if(lhs instanceof ColGroupConst || this instanceof ColGroupConst) { - double[] r = this instanceof ColGroupConst ? rhValues : this._dict.colSum(getCounts(), rCol); - double[] l = lhs instanceof ColGroupConst ? lhValues : lhs._dict.colSum(lhs.getCounts(), lCol); - vectorVectorMultiply(l, lhs._colIndexes, r, this._colIndexes, result, numCols); + // double[] r = this instanceof ColGroupConst ? rhValues : this._dict.colSum(getCounts(), rCol); + // double[] l = lhs instanceof ColGroupConst ? lhValues : lhs._dict.colSum(lhs.getCounts(), lCol); + // vectorVectorMultiply(l, lhs._colIndexes, r, this._colIndexes, result, numCols); } else { int[] countsRight = getCounts(); @@ -848,9 +858,9 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea if(skipRight > threshold && percentageRight > percentageLeft && !(this instanceof ColGroupDDC)) { double[] mct = this._dict.getMostCommonTuple(this.getCounts(), rCol); - double[] lhsSum = lhs._dict.colSum(lhs.getCounts(), lCol); - if(mct != null) - vectorVectorMultiply(lhsSum, lhs._colIndexes, mct, this._colIndexes, result, numCols); + // double[] lhsSum = lhs._dict.colSum(lhs.getCounts(), lCol); + // if(mct != null) + // vectorVectorMultiply(lhsSum, lhs._colIndexes, mct, this._colIndexes, result, numCols); ColGroupValue thisM = (mct != null) ? (ColGroupValue) this .copyAndSet(this._dict.subtractTuple(mct)) : this; @@ -860,9 +870,9 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea } else if(skipLeft > threshold && !(lhs instanceof ColGroupDDC)) { double[] mct = lhs._dict.getMostCommonTuple(lhs.getCounts(), lCol); - double[] thisColSum = this._dict.colSum(getCounts(), rCol); - if(mct != null) - vectorVectorMultiply(mct, lhs._colIndexes, thisColSum, this._colIndexes, result, numCols); + // double[] thisColSum = this._dict.colSum(getCounts(), rCol); + // if(mct != null) + // vectorVectorMultiply(mct, lhs._colIndexes, thisColSum, this._colIndexes, result, numCols); ColGroupValue lhsM = (mct != null) ? (ColGroupValue) lhs.copyAndSet(lhs._dict.subtractTuple(mct)) : lhs; Dictionary preAgg = this.preAggregateThatIndexStructure(lhsM, true); @@ -882,41 +892,6 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea } } - // private void leftMultMapPreAggregate(final int nvL, final int lCol, final int rCol, final ColGroupValue lhs, - // final int numCols, double[] lhValues, double[] rhValues, double[] c, MapPreAggregate agM) { - // final int[] map = agM.getMap(); - // final int aggSize = agM.getSize(); - // for(int k = 0; k < aggSize; k += 2) - // leftMultDictEntry(map[k + 1], map[k], nvL, lCol, rCol, lhs, numCols, lhValues, rhValues, c); - // leftMultDictEntry(agM.getMapFreeValue(), 0, nvL, lCol, rCol, lhs, numCols, lhValues, rhValues, c); - // } - - // private void leftMultArrayPreAggregate(final int nvL, final int nvR, final int lCol, final int rCol, - // final ColGroupValue lhs, final int numCols, double[] lhValues, double[] rhValues, double[] c, int[] arr) { - // for(int a = 0; a < nvL * nvR; a++) - // leftMultDictEntry(arr[a], a, nvL, lCol, rCol, lhs, numCols, lhValues, rhValues, c); - // } - - private void leftMultDictEntry(final int m, final int a, final int nvL, final int lCol, final int rCol, - final ColGroupValue lhs, final int numCols, final double[] lhValues, final double[] rhValues, - final double[] c) { - - if(m > 0) { - final int lhsRowOffset = (a % nvL) * lCol; - final int rhsRowOffset = (a / nvL) * rCol; - - for(int j = 0; j < lCol; j++) { - final int resultOff = lhs._colIndexes[j] * numCols; - final double lh = lhValues[lhsRowOffset + j] * m; - if(lh != 0) - for(int i = 0; i < rCol; i++) { - double rh = rhValues[rhsRowOffset + i]; - c[resultOff + _colIndexes[i]] += lh * rh; - } - } - } - } - @Override public void tsmm(double[] result, int numColumns) { int[] counts = getCounts(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java index 1aeda85..352e96b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java @@ -22,6 +22,8 @@ package org.apache.sysds.runtime.compress.colgroup.dictionary; import java.io.DataOutput; import java.io.IOException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.utils.ABitmap; import org.apache.sysds.runtime.compress.utils.Bitmap; import org.apache.sysds.runtime.compress.utils.BitmapLossy; @@ -35,7 +37,7 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator; */ public abstract class ADictionary { - // private static final Log LOG = LogFactory.getLog(ADictionary.class.getName()); + protected static final Log LOG = LogFactory.getLog(ADictionary.class.getName()); /** * Get all the values contained in the dictionary as a linearized double array. diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java index b4f843f..5a32823 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java @@ -24,8 +24,6 @@ import java.io.DataOutput; import java.io.IOException; import java.util.Arrays; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.DMLCompressionException; import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.functionobjects.ValueFunction; @@ -39,7 +37,6 @@ import org.apache.sysds.utils.MemoryEstimates; */ public class Dictionary extends ADictionary { - protected static final Log LOG = LogFactory.getLog(Dictionary.class.getName()); private final double[] _values; public Dictionary(double[] values) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java index a8559db..7b01fb5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java @@ -24,8 +24,6 @@ import java.io.IOException; import java.util.Arrays; import org.apache.commons.lang.NotImplementedException; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.utils.BitmapLossy; import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.functionobjects.Divide; @@ -42,7 +40,6 @@ import org.apache.sysds.utils.MemoryEstimates; */ public class QDictionary extends ADictionary { - protected static final Log LOG = LogFactory.getLog(QDictionary.class.getName()); protected double _scale; protected byte[] _values; @@ -121,14 +118,13 @@ public class QDictionary extends ADictionary { return ret; } - @Override - public double[] aggregateTuples(Builtin fn, final int nCol){ + public double[] aggregateTuples(Builtin fn, final int nCol) { if(nCol == 1) return getValues(); final int nRows = _values.length / nCol; double[] res = new double[nRows]; - for(int i = 0; i < nRows; i++){ + for(int i = 0; i < nRows; i++) { final int off = i * nCol; res[i] = _values[off]; for(int j = off + 1; j < off + nCol; j++) @@ -258,7 +254,6 @@ public class QDictionary extends ADictionary { return new QDictionary(ret, _scale); } - @Override public void write(DataOutput out) throws IOException { super.write(out); @@ -314,20 +309,20 @@ public class QDictionary extends ADictionary { } } - @Override - public double[] colSum(int[] counts, int nCol){ + public double[] colSum(int[] counts, int nCol) { throw new NotImplementedException("Not Implemented"); // final double[] res = new double[counts.length]; // int idx = 0; // for(int k = 0; k< _values.length / counts.length; k++){ - // final int cntk = counts[k]; - // for(int j = 0; j< counts.length; j++){ - // res[j] += _values[idx++] * cntk; - // } + // final int cntk = counts[k]; + // for(int j = 0; j< counts.length; j++){ + // res[j] += _values[idx++] * cntk; + // } // } // return res; } + @Override public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { throw new NotImplementedException("Not Implemented"); @@ -412,7 +407,7 @@ public class QDictionary extends ADictionary { } } - public String getString( int colIndexes) { + public String getString(int colIndexes) { StringBuilder sb = new StringBuilder(); for(int i = 0; i < size(); i++) { sb.append(_values[i]); @@ -453,32 +448,31 @@ public class QDictionary extends ADictionary { } @Override - public boolean containsValue(double pattern){ + public boolean containsValue(double pattern) { if(Double.isNaN(pattern) || Double.isInfinite(pattern)) return false; throw new NotImplementedException("Not contains value on Q Dictionary"); } @Override - public long getNumberNonZeros(int[] counts, int nCol){ - long nnz = 0; + public long getNumberNonZeros(int[] counts, int nCol) { + long nnz = 0; final int nRow = _values.length / nCol; - for(int i = 0; i < nRow; i++){ + for(int i = 0; i < nRow; i++) { long rowCount = 0; - final int off = i * nCol; - for(int j = off; j < off + nCol; j++){ + final int off = i * nCol; + for(int j = off; j < off + nCol; j++) { if(_values[j] != 0) - rowCount ++; + rowCount++; } nnz += rowCount * counts[i]; } return nnz; } - @Override - public void addToEntry(Dictionary d, int fr, int to, int nCol){ - throw new NotImplementedException("Not implemented yet"); + public void addToEntry(Dictionary d, int fr, int to, int nCol) { + throw new NotImplementedException("Not implemented yet"); } @Override @@ -487,9 +481,9 @@ public class QDictionary extends ADictionary { } @Override - public long getNumberNonZerosContained(){ + public long getNumberNonZerosContained() { long count = 0; - for(double v : _values){ + for(double v : _values) { if(v != 0.0) count++; } @@ -497,12 +491,12 @@ public class QDictionary extends ADictionary { } @Override - public double[] getMostCommonTuple(int[] counts, int nCol){ + public double[] getMostCommonTuple(int[] counts, int nCol) { return null; } @Override - public ADictionary subtractTuple(double[] tuple){ + public ADictionary subtractTuple(double[] tuple) { throw new NotImplementedException(); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java new file mode 100644 index 0000000..3a400f3 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.colgroup.dictionary; + +import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.ValueFunction; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; + +/** + * A sparse dictionary implementation, use if the tuples are sparse. + */ +public class SparseDictionary extends ADictionary { + + @Override + public double[] getValues() { + LOG.warn("Inefficient materialization of sparse Dictionary."); + + return null; + } + + @Override + public double getValue(int i) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public int hasZeroTuple(int nCol) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public long getInMemorySize() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double aggregate(double init, Builtin fn) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double[] aggregateTuples(Builtin fn, int nCol) { + // TODO Auto-generated method stub + return null; + } + + @Override + public int size() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public ADictionary apply(ScalarOperator op) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary applyBinaryRowOpLeft(ValueFunction fn, double[] v, boolean sparseSafe, int[] colIndexes) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary applyBinaryRowOpRight(ValueFunction fn, double[] v, boolean sparseSafe, int[] colIndexes) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary clone() { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary cloneAndExtend(int len) { + // TODO Auto-generated method stub + return null; + } + + @Override + public boolean isLossy() { + // TODO Auto-generated method stub + return false; + } + + @Override + public int getNumberOfValues(int ncol) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double[] sumAllRowsToDouble(boolean square, int nrColumns) { + // TODO Auto-generated method stub + return null; + } + + @Override + public double sumRow(int k, boolean square, int nrColumns) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double[] colSum(int[] counts, int nCol) { + // TODO Auto-generated method stub + return null; + } + + @Override + public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { + // TODO Auto-generated method stub + + } + + @Override + public double sum(int[] counts, int ncol) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double sumsq(int[] counts, int ncol) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public String getString(int colIndexes) { + // TODO Auto-generated method stub + return null; + } + + @Override + public void addMaxAndMin(double[] ret, int[] colIndexes) { + // TODO Auto-generated method stub + + } + + @Override + public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int previousNumberOfColumns) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary reExpandColumns(int max) { + // TODO Auto-generated method stub + return null; + } + + @Override + public boolean containsValue(double pattern) { + // TODO Auto-generated method stub + return false; + } + + @Override + public long getNumberNonZeros(int[] counts, int nCol) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public long getNumberNonZerosContained() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public void addToEntry(Dictionary d, int fr, int to, int nCol) { + // TODO Auto-generated method stub + + } + + @Override + public double[] getMostCommonTuple(int[] counts, int nCol) { + // TODO Auto-generated method stub + return null; + } + + @Override + public ADictionary subtractTuple(double[] tuple) { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java index 5332060..4902cb8 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java @@ -42,19 +42,19 @@ public abstract class CompressedSizeEstimator { protected static final Log LOG = LogFactory.getLog(CompressedSizeEstimator.class.getName()); /** The Matrix Block to extract the compression estimates from */ - protected MatrixBlock _data; + final protected MatrixBlock _data; /** The number of rows in the matrix block, extracted to a field because the matrix could be transposed */ - protected int _numRows; + final protected int _numRows; /** The number of columns in the matrix block, extracted to a field because the matrix could be transposed */ - protected int _numCols; + final protected int _numCols; /** The compression settings to use, for estimating the size, and compress the ColGroups. */ - protected final CompressionSettings _compSettings; + final protected CompressionSettings _compSettings; /** * Boolean specifying if the _data is in transposed format. This is used to select the correct readers for the * extraction of bitmaps for the columns. */ - protected boolean _transposed = false; + protected boolean _transposed; /** * Main Constructor for Compression Estimator. @@ -72,6 +72,15 @@ public abstract class CompressedSizeEstimator { _compSettings = compSettings; } + + public int getNumRows(){ + return _numRows; + } + + public int getNumColumns(){ + return _numCols; + } + /** * Multi threaded version of extracting Compression Size info * diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java index 0c83202..af94473 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java @@ -41,6 +41,8 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator { private static final int FORCE_TRANSPOSE_ON_SAMPLE_THRESHOLD = 8000; private final int[] _sampleRows; + + private final MatrixBlock _sample; private HashMap<Integer, Double> _solveCache = null; /** @@ -56,7 +58,7 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator { super(data, compSettings, transposed); _sampleRows = sampleRows; _solveCache = new HashMap<>(); - _data = sampleData(data, compSettings, sampleRows, transposed); + _sample = sampleData(data, compSettings, sampleRows, transposed); } protected MatrixBlock sampleData(MatrixBlock data, CompressionSettings compSettings, int[] sampleRows, @@ -75,8 +77,6 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator { sampledMatrixBlock = LibMatrixReorg.transposeInPlace(sampledMatrixBlock, 16); } else { - - // Override the _data Matrix block with the sampled matrix block. MatrixBlock select = (transposed) ? new MatrixBlock(data.getNumColumns(), 1, true) : new MatrixBlock(data.getNumRows(), 1, true); for(int i = 0; i < sampleRows.length; i++) @@ -106,7 +106,7 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator { // final int numCols = colIndexes.length; // extract statistics from sample - final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, _data, _transposed); + final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, _sample, _transposed); final EstimationFactors fact = EstimationFactors.computeSizeEstimationFactors(ubm, false, _numRows, colIndexes); final int numZerosInSample = ubm.getZeroCounts(); final boolean lossy = ubm.getType() == BitmapType.Lossy; diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java index 7b9bb8d..d25f280 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java @@ -144,6 +144,14 @@ public class CompressedSizeInfoColGroup { return _cardinalityRatio; } + public double getMostCommonFraction(){ + return (double) _facts.largestOff / _facts.numRows; + } + + public double getTupleSparsity(){ + return _facts.tupleSparsity; + } + private static Map<CompressionType, Long> calculateCompressionSizes(EstimationFactors fact, Set<CompressionType> validCompressionTypes) { Map<CompressionType, Long> res = new HashMap<>();
