This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 056eb02 [SYSTEMDS-3165] Fix lost column names on binary frame
writes/reads
056eb02 is described below
commit 056eb028d44c245045b1df0f6bf0bcf4dd7d357a
Author: Matthias Boehm <[email protected]>
AuthorDate: Thu Oct 14 20:41:48 2021 +0200
[SYSTEMDS-3165] Fix lost column names on binary frame writes/reads
Although the column names of frame blocks were correctly serialized
during write, the readers only reconstructed the basic column metadata
(num distinct, mvValue) but not the column names. This patch fixes the
binary frame readers (single-threaded and multi-threaded) accordingly.
Furthermore, this patch increases the robustness of the reworked
binning encoder so that it also handles cases without any bin boundaries.
---
.../org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java | 6 ++++--
.../sysds/runtime/transform/encode/ColumnEncoderBin.java | 7 ++++++-
.../sysds/test/functions/frame/FrameMetaReadWriteTest.java | 12 ++++++++----
3 files changed, 18 insertions(+), 7 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java
b/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java
index 3b2d57c..aa125a7 100644
--- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java
@@ -91,7 +91,7 @@ public class FrameReaderBinaryBlock extends FrameReader
try
{
- while( reader.next(key, value) ) {
+ while( reader.next(key, value) ) {
int row_offset = (int)(key.get()-1);
int rows = value.getNumRows();
int cols = value.getNumColumns();
@@ -107,8 +107,10 @@ public class FrameReaderBinaryBlock extends FrameReader
//copy block into target frame, incl meta on
first
dest.copy( row_offset, row_offset+rows-1, 0,
cols-1, value);
- if( row_offset==0 )
+ if( row_offset==0 ) {
+
dest.setColumnNames(value.getColumnNames());
dest.setColumnMetadata(value.getColumnMetadata());
+ }
}
}
finally {
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index 8439b4e..7def0d3 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -179,8 +179,13 @@ public class ColumnEncoderBin extends ColumnEncoder {
}
private double applyValue(double inVal) {
- if( inVal < _binMins[0] | inVal > _binMaxs[_binMaxs.length-1] )
+ if( _binMins.length == 0 || _binMaxs.length == 0 ) {
+ LOG.warn("ColumnEncoderBin: applyValue without bucket
boundaries, assign 1");
+ return 1; //robustness in case of missing bins
+ }
+ if( inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1]
) {
return Double.NaN; //value outside min/max range
+ }
int ix = Arrays.binarySearch(_binMaxs, inVal);
int binID = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
return binID;
diff --git
a/src/test/java/org/apache/sysds/test/functions/frame/FrameMetaReadWriteTest.java
b/src/test/java/org/apache/sysds/test/functions/frame/FrameMetaReadWriteTest.java
index f4d500d..8971cbc 100644
---
a/src/test/java/org/apache/sysds/test/functions/frame/FrameMetaReadWriteTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/frame/FrameMetaReadWriteTest.java
@@ -42,6 +42,7 @@ public class FrameMetaReadWriteTest extends AutomatedTestBase
private final static int rows = 1382;
private final static int cols = 7;
+ private final static String[] colNames = new String[]
{"A","B","C","D","E","F","G"};
@Override
public void setUp() {
@@ -108,6 +109,7 @@ public class FrameMetaReadWriteTest extends
AutomatedTestBase
double[][] A = getRandomMatrix(rows, cols, -10, 10,
0.7, 3412);
FrameBlock fA = DataConverter.convertToFrameBlock(
DataConverter.convertToMatrixBlock(A),
ValueType.STRING);
+ fA.setColumnNames(colNames);
for( int j=0; j<cols; j++ ) {
fA.getColumnMetadata(j).setMvValue(String.valueOf(j+1));
fA.getColumnMetadata(j).setNumDistinct(j+1);
@@ -120,14 +122,16 @@ public class FrameMetaReadWriteTest extends
AutomatedTestBase
//read output and compare meta data
FrameBlock fB = FrameReaderFactory
- .createFrameReader(fmt)
- .readFrameFromHDFS(output("B"), rows,
cols);
+ .createFrameReader(fmt)
+ .readFrameFromHDFS(output("B"), rows, cols);
for( int j=0; j<cols; j++ ) {
Assert.assertEquals("MV meta data wrong!",
-
fA.getColumnMetadata(j).getMvValue(), fB.getColumnMetadata(j).getMvValue());
+ fA.getColumnMetadata(j).getMvValue(),
fB.getColumnMetadata(j).getMvValue());
Assert.assertEquals("Distinct meta data wrong!",
-
fA.getColumnMetadata(j).getNumDistinct(),
fB.getColumnMetadata(j).getNumDistinct());
+
fA.getColumnMetadata(j).getNumDistinct(),
fB.getColumnMetadata(j).getNumDistinct());
}
+ if( fmt == FileFormat.BINARY )
+ Assert.assertArrayEquals("Column names wrong!",
fA.getColumnNames(), fB.getColumnNames());
}
catch(Exception ex) {
ex.printStackTrace();