This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 2aae692 [MINOR] Bug fixes in TransformEncode
2aae692 is described below
commit 2aae6922b38b6b50ee30adaf5cf7cb34231456eb
Author: Lukas Erlbacher <[email protected]>
AuthorDate: Fri Jun 11 08:32:31 2021 +0200
[MINOR] Bug fixes in TransformEncode
Closes #1309
---
.../transform/encode/ColumnEncoderComposite.java | 8 ++++++--
.../transform/encode/MultiColumnEncoder.java | 24 ++++++++++++++++------
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
index f3640f3..1973741 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java
@@ -213,9 +213,13 @@ public class ColumnEncoderComposite extends ColumnEncoder {
addEncoder(otherEnc);
}
}
- else {
+ else
addEncoder(other);
- }
+
+ updateAllDCEncoders();
+ }
+
+ public void updateAllDCEncoders(){
// update dummycode encoder domain sizes based on distinctness
information from other encoders
ColumnEncoderDummycode dc =
getEncoder(ColumnEncoderDummycode.class);
if(dc != null)
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
index 4a8570a..aff078b 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java
@@ -83,8 +83,13 @@ public class MultiColumnEncoder implements Encoder {
MatrixBlock out;
try {
build(in, k);
- _meta = getMetaData(new FrameBlock(in.getNumColumns(),
Types.ValueType.STRING));
- initMetaData(_meta);
+ if(_legacyMVImpute != null){
+ // These operations are redundant for every
encoder excluding the legacyMVImpute, the workaround to fix
+ // it for this encoder would be very dirty.
This will only have a performance impact if there is a lot of
+ // recoding in combination with the
legacyMVImpute. But since it is legacy this should be fine
+ _meta = getMetaData(new
FrameBlock(in.getNumColumns(), Types.ValueType.STRING));
+ initMetaData(_meta);
+ }
// apply meta data
out = apply(in, k);
}
@@ -104,8 +109,10 @@ public class MultiColumnEncoder implements Encoder {
buildMT(in, k);
}
else {
- for(ColumnEncoder columnEncoder : _columnEncoders)
+ for(ColumnEncoderComposite columnEncoder :
_columnEncoders){
columnEncoder.build(in);
+ columnEncoder.updateAllDCEncoders();
+ }
}
legacyBuild(in);
}
@@ -117,6 +124,8 @@ public class MultiColumnEncoder implements Encoder {
try {
if(blockSize != in.getNumRows()) {
// Partial builds and merges
+ // Most of the time not worth it for RC with
the current implementation, GC overhead is to large.
+ // Depending on unique values and rows more
testing need to be done
List<List<Future<Object>>> partials = new
ArrayList<>();
for(ColumnEncoderComposite encoder :
_columnEncoders) {
List<Callable<Object>>
partialBuildTasks = encoder.getPartialBuildTasks(in, blockSize);
@@ -124,7 +133,7 @@ public class MultiColumnEncoder implements Encoder {
partials.add(null);
continue;
}
-
partials.add(pool.invokeAll(partialBuildTasks));
+
partials.add(partialBuildTasks.stream().map(pool::submit).collect(Collectors.toList()));
}
for(int e = 0; e < _columnEncoders.size(); e++)
{
List<Future<Object>> partial =
partials.get(e);
@@ -179,7 +188,7 @@ public class MultiColumnEncoder implements Encoder {
if(in.getNumColumns() != numEncoders)
throw new DMLRuntimeException("Not every column in has
a CompositeEncoder. Please make sure every column "
+ "has a encoder or slice the input
accordingly");
- // Denseblock allocation since access is only on the DenseBlock
+ // Block allocation for MT access
out.allocateBlock();
if(out.isInSparseFormat()) {
SparseBlock block = out.getSparseBlock();
@@ -204,7 +213,7 @@ public class MultiColumnEncoder implements Encoder {
offset +=
columnEncoder.getEncoder(ColumnEncoderDummycode.class)._domainSize - 1;
}
}
- // Recomputing NNZ since we access the Dense block directly
+ // Recomputing NNZ since we access the block directly
// TODO set NNZ explicit count them in the encoders
out.recomputeNonZeros();
if(_legacyOmit != null)
@@ -634,6 +643,8 @@ public class MultiColumnEncoder implements Encoder {
@Override
public Integer call() throws Exception {
_encoder.build(_input);
+ if(_encoder instanceof ColumnEncoderComposite)
+ ((ColumnEncoderComposite)
_encoder).updateAllDCEncoders();
return 1;
}
}
@@ -652,6 +663,7 @@ public class MultiColumnEncoder implements Encoder {
@Override
public Integer call() throws Exception {
_encoder.mergeBuildPartial(_partials, 0,
_partials.size());
+ _encoder.updateAllDCEncoders();
return 1;
}
}