This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new c21fa9997d [SYSTEMDS-3642] CLA NaN in Dictionaries replace
c21fa9997d is described below
commit c21fa9997deadc7534b40b3b303a445b3c68c630
Author: Sebastian Baunsgaard <[email protected]>
AuthorDate: Mon Oct 30 12:34:43 2023 +0100
[SYSTEMDS-3642] CLA NaN in Dictionaries replace
This commit fixes a bug of replace in ColumnGroups that did not
correctly replace NaN values with replacement values.
Example:
X_test = replace(target=X_test, pattern=NaN, replacement=0);
---
.../runtime/compress/colgroup/ColGroupDDC.java | 40 +++++++++++++++++++---
.../runtime/compress/colgroup/ColGroupDDCFOR.java | 5 +--
.../runtime/compress/colgroup/ColGroupSDC.java | 3 +-
.../runtime/compress/colgroup/ColGroupSDCFOR.java | 3 +-
.../compress/colgroup/ColGroupSDCSingle.java | 3 +-
.../compress/colgroup/ColGroupUncompressed.java | 14 +++++---
.../compress/colgroup/mapping/AMapToData.java | 13 +++++++
7 files changed, 66 insertions(+), 15 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index 8f5fccaf7d..6340affede 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -35,6 +35,8 @@ import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictiona
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToChar;
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
@@ -78,8 +80,8 @@ public class ColGroupDDC extends APreAgg implements
IMapToDataGroup {
int[] c = getCounts();
if(c.length !=
dict.getNumberOfValues(colIndexes.size()))
throw new DMLCompressionException("Invalid DDC
Construction");
+ data.verify();
}
-
}
public static AColGroup create(IColIndex colIndexes, IDictionary dict,
AMapToData data, int[] cachedCounts) {
@@ -157,8 +159,37 @@ public class ColGroupDDC extends APreAgg implements
IMapToDataGroup {
private final void
decompressToDenseBlockDenseDictSingleColOutContiguous(DenseBlock db, int rl,
int ru, int offR,
int offC, double[] values) {
final double[] c = db.values(0);
- for(int i = rl, offT = rl + offR + _colIndexes.get(0) + offC; i
< ru; i++, offT++)
- c[offT] += values[_data.getIndex(i)];
+ decompressToDenseBlockDenseDictSingleColOutContiguous(c, rl,
ru, offR + _colIndexes.get(0), values, _data);
+ }
+
+ private final static void
decompressToDenseBlockDenseDictSingleColOutContiguous(double[] c, int rl, int
ru, int offR,
+ double[] values, AMapToData data) {
+
+ if(data instanceof MapToByte)
+
decompressToDenseBlockDenseDictSingleColOutContiguousByteM(c, rl, ru, offR,
values, (MapToByte) data);
+ else if(data instanceof MapToChar)
+
decompressToDenseBlockDenseDictSingleColOutContiguousCharM(c, rl, ru, offR,
values, (MapToChar) data);
+ else
+
decompressToDenseBlockDenseDictSingleColOutContiguousGenM(c, rl, ru, offR,
values, data);
+
+ }
+
+ private final static void
decompressToDenseBlockDenseDictSingleColOutContiguousByteM(double[] c, int rl,
int ru,
+ int offR, double[] values, MapToByte data) {
+ for(int i = rl, offT = rl + offR; i < ru; i++, offT++)
+ c[offT] += values[data.getIndex(i)];
+ }
+
+ private final static void
decompressToDenseBlockDenseDictSingleColOutContiguousCharM(double[] c, int rl,
int ru,
+ int offR, double[] values, MapToChar data) {
+ for(int i = rl, offT = rl + offR; i < ru; i++, offT++)
+ c[offT] += values[data.getIndex(i)];
+ }
+
+ private final static void
decompressToDenseBlockDenseDictSingleColOutContiguousGenM(double[] c, int rl,
int ru,
+ int offR, double[] values, AMapToData data) {
+ for(int i = rl, offT = rl + offR; i < ru; i++, offT++)
+ c[offT] += values[data.getIndex(i)];
}
private final void
decompressToDenseBlockDenseDictAllColumnsContiguous(DenseBlock db, int rl, int
ru, int offR,
@@ -287,8 +318,7 @@ public class ColGroupDDC extends APreAgg implements
IMapToDataGroup {
lmSparseMatrixNoPreAggSingleCol(matrix.getSparseBlock(), nColM, retV, nColRet,
dictVals, rl, ru);
}
else
-
lmDenseMatrixNoPreAggSingleCol(matrix.getDenseBlockValues(), nColM, retV,
nColRet, dictVals, rl, ru, cl,
- cu);
+
lmDenseMatrixNoPreAggSingleCol(matrix.getDenseBlockValues(), nColM, retV,
nColRet, dictVals, rl, ru, cl, cu);
}
private void lmSparseMatrixNoPreAggSingleCol(SparseBlock sb, int nColM,
double[] retV, int nColRet, double[] vals,
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
index 85f48ae5b6..d09ba4e624 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
@@ -26,8 +26,8 @@ import java.util.Arrays;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
@@ -39,6 +39,7 @@ import
org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
import org.apache.sysds.runtime.compress.estim.EstimationFactors;
import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.Minus;
@@ -251,7 +252,7 @@ public class ColGroupDDCFOR extends AMorphingMMColGroup
implements IFrameOfRefer
if(patternInReference) {
double[] nRef = new double[_reference.length];
for(int i = 0; i < _reference.length; i++)
- if(pattern == _reference[i])
+ if(Util.eq(pattern ,_reference[i]))
nRef[i] = replace;
else
nRef[i] = _reference[i];
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index 476e86c973..a905e401e4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -42,6 +42,7 @@ import
org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -458,7 +459,7 @@ public class ColGroupSDC extends ASDC implements
IMapToDataGroup {
IDictionary replaced = _dict.replace(pattern, replace,
_colIndexes.size());
double[] newDefaultTuple = new double[_defaultTuple.length];
for(int i = 0; i < _defaultTuple.length; i++)
- newDefaultTuple[i] = _defaultTuple[i] == pattern ?
replace : _defaultTuple[i];
+ newDefaultTuple[i] = Util.eq(_defaultTuple[i],pattern)
? replace : _defaultTuple[i];
return create(_colIndexes, _numRows, replaced, newDefaultTuple,
_indexes, _data, getCachedCounts());
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
index 65a5a42aa4..dfb9a60511 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
@@ -41,6 +41,7 @@ import
org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.Minus;
@@ -260,7 +261,7 @@ public class ColGroupSDCFOR extends ASDC implements
IMapToDataGroup, IFrameOfRef
if(patternInReference) {
double[] nRef = new double[_reference.length];
for(int i = 0; i < _reference.length; i++)
- if(pattern == _reference[i])
+ if(Util.eq(pattern, _reference[i]))
nRef[i] = replace;
else
nRef[i] = _reference[i];
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index 5d3be0d3f1..a13150c12c 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -41,6 +41,7 @@ import
org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
import org.apache.sysds.runtime.compress.estim.encoding.EncodingFactory;
import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
+import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
@@ -423,7 +424,7 @@ public class ColGroupSDCSingle extends ASDC {
IDictionary replaced = _dict.replace(pattern, replace,
_colIndexes.size());
double[] newDefaultTuple = new double[_defaultTuple.length];
for(int i = 0; i < _defaultTuple.length; i++)
- newDefaultTuple[i] = _defaultTuple[i] == pattern ?
replace : _defaultTuple[i];
+ newDefaultTuple[i] = Util.eq(_defaultTuple[i], pattern)
? replace : _defaultTuple[i];
return create(_colIndexes, _numRows, replaced, newDefaultTuple,
_indexes, getCachedCounts());
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index 17e954b219..c4713d6e59 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -568,9 +568,11 @@ public class ColGroupUncompressed extends AColGroup {
final MatrixBlock dictM =
paCG._dict.getMBDict(nCols).getMatrixBlock();
if(dictM == null)
return;
- LOG.warn("\nInefficient transpose of uncompressed to fit to"
- + " t(AColGroup) %*% UncompressedColGroup mult by
colGroup uncompressed column"
- + "\nCurrently solved by t(t(Uncompressed) %*%
AColGroup)");
+ if(paCG.getNumCols() != 1) {
+ LOG.warn("\nInefficient transpose of uncompressed to
fit to"
+ + " t(AColGroup) %*% UncompressedColGroup mult
by colGroup uncompressed column"
+ + "\nCurrently solved by t(t(Uncompressed) %*%
AColGroup)");
+ }
final int k = InfrastructureAnalyzer.getLocalParallelism();
final MatrixBlock ucCGT = LibMatrixReorg.transpose(getData(),
k);
final MatrixBlock preAgg = new MatrixBlock(1,
paCG.getNumValues(), false);
@@ -606,10 +608,12 @@ public class ColGroupUncompressed extends AColGroup {
}
private void leftMultByAColGroupUncompressed(ColGroupUncompressed lhs,
MatrixBlock result) {
- LOG.warn("Inefficient Left Matrix Multiplication with transpose
of left hand side : t(l) %*% r");
final MatrixBlock tmpRet = new MatrixBlock(lhs.getNumCols(),
_colIndexes.size(), 0);
final int k = InfrastructureAnalyzer.getLocalParallelism();
-
+
+ if(lhs._data.getNumColumns() != 1){
+ LOG.warn("Inefficient Left Matrix Multiplication with
transpose of left hand side : t(l) %*% r");
+ }
// multiply to temp
MatrixBlock lhData = lhs._data;
MatrixBlock transposed = LibMatrixReorg.transpose(lhData, k);
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
index d22234945d..b12461bf7c 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
@@ -27,6 +27,8 @@ import java.util.BitSet;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.colgroup.IMapToDataGroup;
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
@@ -860,6 +862,17 @@ public abstract class AMapToData implements Serializable {
*/
public abstract boolean equals(AMapToData e);
+ /** Debugging verification that this mapping is correctly made. */
+ public void verify() {
+ if(CompressedMatrixBlock.debug) {
+ for(int i = 0; i < size(); i++) {
+ if(getIndex(i) >= nUnique) {
+ throw new
DMLCompressionException("invalid construction of Mapping data containing values
above unique");
+ }
+ }
+ }
+ }
+
@Override
public String toString() {
final int sz = size();