This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 0987be1 [SYSTEMDS-3203] Improved frame removeEmpty operations
(row/col)
0987be1 is described below
commit 0987be1c907d48668697df1929fce22bb31480d3
Author: OlgaOvcharenko <[email protected]>
AuthorDate: Sat Dec 18 23:15:18 2021 +0100
[SYSTEMDS-3203] Improved frame removeEmpty operations (row/col)
Closes #1455.
---
.../sysds/runtime/matrix/data/FrameBlock.java | 77 +++++++++++----
.../apache/sysds/runtime/util/DataConverter.java | 39 ++++++++
src/main/python/tests/frame/test_slice.py | 30 +++---
.../test/component/frame/FrameRemoveEmptyTest.java | 106 +++++++++++----------
src/test/scripts/functions/frame/removeEmpty1.dml | 10 +-
.../frame/{removeEmpty1.dml => removeEmpty2.dml} | 10 +-
6 files changed, 178 insertions(+), 94 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 751d24e..bcdf431 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -30,7 +30,14 @@ import java.lang.ref.SoftReference;
import java.lang.reflect.InvocationTargetException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
@@ -48,7 +55,6 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Writable;
import org.apache.sysds.api.DMLException;
-import org.apache.sysds.common.Types;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.codegen.CodegenUtils;
@@ -66,6 +72,7 @@ import org.apache.sysds.runtime.meta.MatrixCharacteristics;
import org.apache.sysds.runtime.transform.encode.ColumnEncoderRecode;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.DMVUtils;
+import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.runtime.util.EMAUtils;
import org.apache.sysds.runtime.util.IndexRange;
import org.apache.sysds.runtime.util.UtilFunctions;
@@ -2549,25 +2556,39 @@ public class FrameBlock implements CacheBlock,
Externalizable {
}
private FrameBlock removeEmptyRows(MatrixBlock select, boolean
emptyReturn) {
+ if(select != null && (select.getNumRows() != getNumRows() &&
select.getNumColumns() != getNumRows()))
+ throw new DMLRuntimeException("Frame rmempty: Incorrect
select vector dimensions.");
+
FrameBlock ret = new FrameBlock(_schema, _colnames);
- for(int i = 0; i < _numRows; i++) {
- boolean isEmpty = true;
+ if (select == null) {
Object[] row = new Object[getNumColumns()];
-
- for(int j = 0; j < getNumColumns(); j++) {
- Array colData = _coldata[j].clone();
- row[j] = colData.get(i);
- ValueType type = _schema[j];
- isEmpty = isEmpty && (ArrayUtils.contains(new
double[]{0.0, Double.NaN}, UtilFunctions.objectToDoubleSafe(type,
colData.get(i))));
+ for(int i = 0; i < _numRows; i++) {
+ boolean isEmpty = true;
+ for(int j = 0; j < getNumColumns(); j++) {
+ row[j] = _coldata[j].get(i);
+ isEmpty &= ArrayUtils.contains(new
double[]{0.0, Double.NaN},
+
UtilFunctions.objectToDoubleSafe(_schema[j], row[j]));
+ }
+ if(!isEmpty)
+ ret.appendRow(row);
}
+ }
+ else {
+ if(select.getNonZeros() == getNumRows())
+ return this;
- if((!isEmpty && select == null) || (select != null &&
select.getValue(i, 0) == 1)) {
+ int[] indices =
DataConverter.convertVectorToIndexList(select);
+
+ Object[] row = new Object[getNumColumns()];
+ for(int i : indices) {
+ for(int j = 0; j < getNumColumns(); j++)
+ row[j] = _coldata[j].get(i);
ret.appendRow(row);
}
}
- if(ret.getNumRows() == 0 && emptyReturn) {
+ if (ret.getNumRows() == 0 && emptyReturn) {
String[][] arr = new String[1][getNumColumns()];
Arrays.fill(arr, new String[]{null});
ValueType[] schema = new ValueType[getNumColumns()];
@@ -2579,23 +2600,37 @@ public class FrameBlock implements CacheBlock,
Externalizable {
}
private FrameBlock removeEmptyColumns(MatrixBlock select, boolean
emptyReturn) {
+ if(select != null && (select.getNumRows() != getNumColumns() &&
select.getNumColumns() != getNumColumns())) {
+ throw new DMLRuntimeException("Frame rmempty: Incorrect
select vector dimensions.");
+ }
+
FrameBlock ret = new FrameBlock();
List<ColumnMetadata> columnMetadata = new ArrayList<>();
- for(int i = 0; i < getNumColumns(); i++) {
- Array colData = _coldata[i];
-
- boolean isEmpty = false;
- if(select == null) {
+ if (select == null) {
+ for(int i = 0; i < getNumColumns(); i++) {
+ Array colData = _coldata[i];
ValueType type = _schema[i];
- isEmpty = IntStream.range(0,
colData._size).mapToObj((IntFunction<Object>) colData::get)
+ boolean isEmpty = IntStream.range(0,
colData._size)
+ .mapToObj((IntFunction<Object>)
colData::get)
.allMatch(e -> ArrayUtils.contains(new
double[]{0.0, Double.NaN}, UtilFunctions.objectToDoubleSafe(type, e)));
+
+ if(!isEmpty) {
+ ret.appendColumn(_schema[i],
_coldata[i]);
+ columnMetadata.add(new
ColumnMetadata(_colmeta[i]));
+ }
}
+ } else {
+ if(select.getNonZeros() == getNumColumns())
+ return new FrameBlock(this);
- if((select != null && select.getValue(0, i) == 1) ||
(!isEmpty && select == null)) {
- Types.ValueType vt = _schema[i];
- ret.appendColumn(vt, _coldata[i].clone());
+ int[] indices =
DataConverter.convertVectorToIndexList(select);
+ int k = 0;
+ for(int i : indices) {
+ ret.appendColumn(_schema[i], _coldata[i]);
columnMetadata.add(new
ColumnMetadata(_colmeta[i]));
+ if(_colnames != null)
+ ret._colnames[k++] = _colnames[i];
}
}
diff --git a/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
b/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
index 1339a68..701423b 100644
--- a/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
+++ b/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
@@ -307,6 +307,45 @@ public class DataConverter {
return ret;
}
+ /**
+  * Converts a selection vector into the list of positions of its selected cells.
+  * Positions are linearized as {@code i * ncol + j}, so for a row vector they are
+  * column indexes and for a column vector they are row indexes.
+  *
+  * @param mb row or column vector used as a selection mask
+  * @return positions of the selected cells; an empty array (never null) if the
+  *         vector is empty
+  */
+ public static int[] convertVectorToIndexList(MatrixBlock mb)
+ {
+     int rows = mb.getNumRows();
+     int cols = mb.getNumColumns();
+
+     // callers iterate the result directly (for-each), so an empty
+     // selection must yield an empty list instead of null
+     if( mb.isEmpty() )
+         return new int[0];
+
+     if( mb.isInSparseFormat() ) {
+         if(rows == 1) {
+             // row vector: the stored column indexes of row 0 are the positions;
+             // trim the backing array if it is larger than the row
+             SparseBlock sb = mb.getSparseBlock();
+             int[] tmp = sb.indexes(0);
+             return (tmp.length == sb.size(0)) ? tmp :
+                 Arrays.copyOfRange(tmp, 0, sb.size(0));
+         }
+         else {
+             // column vector: walk the stored cells and skip explicit zeros
+             int index = 0;
+             int[] indices = new int[(int) mb.getNonZeros()];
+             Iterator<IJV> iter = mb.getSparseBlockIterator();
+             while(iter.hasNext()) {
+                 IJV cell = iter.next();
+                 if(cell.getV() != 0.0)
+                     indices[index++] = cell.getI() * cols + cell.getJ();
+             }
+             return indices;
+         }
+     }
+     else {
+         // dense: aix is the running linearized position, cix the output cursor
+         int[] indices = new int[(int) mb.getNonZeros()];
+         for(int i = 0, aix=0, cix=0; i < rows; i++)
+             for(int j = 0; j < cols; j++, aix++)
+                 if(mb.getValueDenseUnsafe(i, j) != 0.0)
+                     indices[cix++] = aix;
+         return indices;
+     }
+ }
+
+
public static int[] convertToIntVector( MatrixBlock mb) {
int rows = mb.getNumRows();
int cols = mb.getNumColumns();
diff --git a/src/main/python/tests/frame/test_slice.py
b/src/main/python/tests/frame/test_slice.py
index 0bb5a31..46fb70a 100644
--- a/src/main/python/tests/frame/test_slice.py
+++ b/src/main/python/tests/frame/test_slice.py
@@ -69,18 +69,26 @@ class TestFederatedAggFn(unittest.TestCase):
with self.assertRaises(ValueError):
self.sds.from_pandas(df)[[-1]]
- # https://issues.apache.org/jira/browse/SYSTEMDS-3203
- # def test_slice_first_third_col(self):
- # sm = self.sds.from_pandas(df)[:, [0, 2]]
- # sr = sm.compute()
- # e = df.loc[:, [0, 2]]
- # self.assertTrue((e.values == sr.values).all())
+ def test_slice_first_third_col(self):
+     """Checks that slicing columns [0, 2] yields the expected col1/col3 values."""
+     expected = pd.DataFrame({
+         "col1": ["col1_hello_3", "col1_world_3", "col1_hello_3"],
+         "col3": [0.6, 0.7, 0.8],
+     })
+     sliced = self.sds.from_pandas(df)[:, [0, 2]].compute()
+     matches = expected.values == sliced.values
+     self.assertTrue(matches.all())
- # def test_slice_single_col(self):
- # sm = self.sds.from_pandas(df)[:, [1]]
- # sr = sm.compute()
- # e = df.loc[:, [1]]
- # self.assertTrue((e.values == sr.values).all())
+ def test_slice_single_col(self):
+     """Checks that slicing column [1] yields the expected col2 values."""
+     expected = pd.DataFrame({"col2": [6, 7, 8]})
+     sliced = self.sds.from_pandas(df)[:, [1]].compute()
+     matches = expected.values == sliced.values
+     self.assertTrue(matches.all())
def test_slice_row_col_both(self):
with self.assertRaises(NotImplementedError):
diff --git
a/src/test/java/org/apache/sysds/test/component/frame/FrameRemoveEmptyTest.java
b/src/test/java/org/apache/sysds/test/component/frame/FrameRemoveEmptyTest.java
index d3bbdc4..5f011da 100644
---
a/src/test/java/org/apache/sysds/test/component/frame/FrameRemoveEmptyTest.java
+++
b/src/test/java/org/apache/sysds/test/component/frame/FrameRemoveEmptyTest.java
@@ -19,6 +19,8 @@
package org.apache.sysds.test.component.frame;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -27,11 +29,11 @@ import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.apache.sysds.test.functions.unary.matrix.RemoveEmptyTest;
-import org.junit.Ignore;
import org.junit.Test;
public class FrameRemoveEmptyTest extends AutomatedTestBase {
private final static String TEST_NAME1 = "removeEmpty1";
+ private final static String TEST_NAME2 = "removeEmpty2";
private final static String TEST_DIR = "functions/frame/";
private static final String TEST_CLASS_DIR = TEST_DIR +
RemoveEmptyTest.class.getSimpleName() + "/";
@@ -43,32 +45,38 @@ public class FrameRemoveEmptyTest extends AutomatedTestBase
{
@Override
public void setUp() {
addTestConfiguration(TEST_NAME1, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] {"V"}));
+ // register each script under its own name so TEST_NAME2 is not configured as TEST_NAME1
+ addTestConfiguration(TEST_NAME2, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] {"V"}));
}
@Test
- public void testRemoveEmptyRowsDenseCP() {
- runTestRemoveEmpty(TEST_NAME1, "rows", Types.ExecType.CP,
false);
+ public void testRemoveEmptyRowsCP() {
+ runTestRemoveEmpty(TEST_NAME1, "rows", Types.ExecType.CP,
false, false);
}
@Test
- public void testRemoveEmptyRowsSparseCP() {
- runTestRemoveEmpty(TEST_NAME1, "cols", Types.ExecType.CP, true);
+ public void testRemoveEmptyColsCP() {
+ runTestRemoveEmpty(TEST_NAME1, "cols", Types.ExecType.CP,
false, false);
}
@Test
- @Ignore
- public void testRemoveEmptyRowsDenseSP() {
- runTestRemoveEmpty(TEST_NAME1, "rows", Types.ExecType.SPARK,
false);
+ public void testRemoveEmptyRowsSelectFullCP() {
+ runTestRemoveEmpty(TEST_NAME2, "rows", Types.ExecType.CP, true,
true);
}
@Test
- @Ignore
- public void testRemoveEmptyRowsSparseSP() {
- runTestRemoveEmpty(TEST_NAME1, "rows", Types.ExecType.SPARK,
true);
+ public void testRemoveEmptyColsSelectFullCP() {
runTestRemoveEmpty(TEST_NAME2, "cols", Types.ExecType.CP, true, true); }
+
+ // Row-wise removeEmpty through the removeEmpty2 script in CP mode,
+ // called with bSelectIndex=true and fullSelect=false.
+ @Test
+ public void testRemoveEmptyRowsSelectCP() {
+ runTestRemoveEmpty(TEST_NAME2, "rows", Types.ExecType.CP, true,
false);
+ }
+
+ @Test
+ public void testRemoveEmptyColsSelectCP() {
+ runTestRemoveEmpty(TEST_NAME2, "cols", Types.ExecType.CP, true,
false);
}
- private void runTestRemoveEmpty(String testname, String margin,
Types.ExecType et, boolean bSelectIndex) {
- // rtplatform for MR
+ private void runTestRemoveEmpty(String testname, String margin,
Types.ExecType et, boolean bSelectIndex, boolean fullSelect) {
Types.ExecMode platformOld = rtplatform;
switch(et) {
case SPARK:
@@ -90,24 +98,22 @@ public class FrameRemoveEmptyTest extends AutomatedTestBase
{
config.addVariable("cols", _cols);
loadTestConfiguration(config);
- /* This is for running the junit test the new way,
i.e., construct the arguments directly */
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + testname + ".dml";
- programArgs = new String[] {"-explain", "-args",
input("V"), margin, output("V")};
+ programArgs = new String[] {"-explain", "-args",
input("V"), input("I"), margin, output("V")};
- MatrixBlock in = createInputMatrix(margin, _rows,
_cols, _sparsityDense, bSelectIndex);
+ Pair<MatrixBlock, MatrixBlock> data =
createInputMatrix(margin, bSelectIndex, fullSelect);
+ MatrixBlock in = data.getKey();
+ MatrixBlock select = data.getValue();
runTest(true, false, null, -1);
+
double[][] outArray =
TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromOutputDir("V"));
MatrixBlock out = new MatrixBlock(outArray.length,
outArray[0].length, false);
out.init(outArray, outArray.length, outArray[0].length);
- MatrixBlock in2 = new MatrixBlock(_rows, _cols + 2,
0.0);
- in2.copy(0, _rows - 1, 0, _cols - 1, in, true);
- in2.copy(0, (_rows / 2) - 1, _cols, _cols + 1, new
MatrixBlock(_rows / 2, 2, 1.0), true);
- MatrixBlock expected = in2.removeEmptyOperations(new
MatrixBlock(), margin.equals("rows"), false, null);
- expected = expected.slice(0, expected.getNumRows() - 1,
0, expected.getNumColumns() - 3);
-
+ MatrixBlock expected = fullSelect ? in :
+ in.removeEmptyOperations(new MatrixBlock(),
margin.equals("rows"), false, select);
TestUtils.compareMatrices(expected, out, 0);
}
finally {
@@ -117,79 +123,83 @@ public class FrameRemoveEmptyTest extends
AutomatedTestBase {
}
}
- private MatrixBlock createInputMatrix(String margin, int rows, int
cols, double sparsity, boolean bSelectIndex) {
+ private Pair<MatrixBlock, MatrixBlock> createInputMatrix(String margin,
boolean bSelectIndex, boolean fullSelect) {
int rowsp = -1, colsp = -1;
if(margin.equals("rows")) {
- rowsp = rows / 2;
- colsp = cols;
+ rowsp = _rows / 2;
+ colsp = _cols;
}
else {
- rowsp = rows;
- colsp = cols / 2;
+ rowsp = _rows;
+ colsp = _cols / 2;
}
// long seed = System.nanoTime();
- double[][] V = getRandomMatrix(rows, cols, 0, 1, sparsity, 7);
+ double[][] V = getRandomMatrix(_rows, _cols, 0, 1,
+ FrameRemoveEmptyTest._sparsityDense, 7);
double[][] Vp = new double[rowsp][colsp];
- double[][] Ix = null;
+ double[][] Ix;
int innz = 0, vnnz = 0;
// clear out every other row/column
if(margin.equals("rows")) {
- Ix = new double[rows][1];
- for(int i = 0; i < rows; i++) {
+ Ix = new double[_rows][1];
+ for(int i = 0; i < _rows; i++) {
boolean clear = i % 2 != 0;
- if(clear) {
- for(int j = 0; j < cols; j++)
+ if(clear && !fullSelect) {
+ for(int j = 0; j < _cols; j++)
V[i][j] = 0;
Ix[i][0] = 0;
}
else {
boolean bNonEmpty = false;
- for(int j = 0; j < cols; j++) {
+ for(int j = 0; j < _cols; j++) {
Vp[i / 2][j] = V[i][j];
- bNonEmpty |= (V[i][j] != 0.0) ?
true : false;
+ bNonEmpty |= V[i][j] != 0.0;
vnnz += (V[i][j] == 0.0) ? 0 :
1;
}
- Ix[i][0] = (bNonEmpty) ? 1 : 0;
+ Ix[i][0] = (bNonEmpty || fullSelect) ?
1 : 0;
innz += Ix[i][0];
}
}
}
else {
- Ix = new double[1][cols];
- for(int j = 0; j < cols; j++) {
+ Ix = new double[1][_cols];
+ for(int j = 0; j < _cols; j++) {
boolean clear = j % 2 != 0;
- if(clear) {
- for(int i = 0; i < rows; i++)
+ if(clear && !fullSelect) {
+ for(int i = 0; i < _rows; i++)
V[i][j] = 0;
Ix[0][j] = 0;
}
else {
boolean bNonEmpty = false;
- for(int i = 0; i < rows; i++) {
+ for(int i = 0; i < _rows; i++) {
Vp[i][j / 2] = V[i][j];
- bNonEmpty |= (V[i][j] != 0.0) ?
true : false;
+ bNonEmpty |= V[i][j] != 0.0;
vnnz += (V[i][j] == 0.0) ? 0 :
1;
}
- Ix[0][j] = (bNonEmpty) ? 1 : 0;
+ Ix[0][j] = (bNonEmpty || fullSelect) ?
1 : 0;
innz += Ix[0][j];
}
}
}
- MatrixCharacteristics imc = new
MatrixCharacteristics(margin.equals("rows") ? rows : 1,
- margin.equals("rows") ? 1 : cols, 1000, innz);
- MatrixCharacteristics vmc = new MatrixCharacteristics(rows,
cols, 1000, vnnz);
+ MatrixCharacteristics imc = new
MatrixCharacteristics(margin.equals("rows") ? FrameRemoveEmptyTest._rows : 1,
+ margin.equals("rows") ? 1 : _cols, 1000, innz);
+ MatrixCharacteristics vmc = new MatrixCharacteristics(_rows,
_cols, 1000, vnnz);
- MatrixBlock in = new MatrixBlock(rows, cols, false);
+ MatrixBlock in = new MatrixBlock(_rows, _cols, false);
in.init(V, _rows, _cols);
+ MatrixBlock select = new MatrixBlock(Ix.length, Ix[0].length,
false);
+ select.init(Ix, Ix.length, Ix[0].length);
+
writeInputMatrixWithMTD("V", V, false, vmc); // always text
writeExpectedMatrix("V", Vp);
if(bSelectIndex)
writeInputMatrixWithMTD("I", Ix, false, imc);
- return in;
+ return new ImmutablePair<>(in, select);
}
}
diff --git a/src/test/scripts/functions/frame/removeEmpty1.dml
b/src/test/scripts/functions/frame/removeEmpty1.dml
index 696880e..1838a55 100644
--- a/src/test/scripts/functions/frame/removeEmpty1.dml
+++ b/src/test/scripts/functions/frame/removeEmpty1.dml
@@ -21,10 +21,6 @@
A = read($1, naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"])
-B = frame(data=["TRUE", "abc"], rows=nrow(A) / 2, cols=2, schema=["BOOLEAN",
"STRING"])
-C = frame(data=["FALSE", "0.0"], rows=nrow(A) / 2, cols=2, schema=["BOOLEAN",
"STRING"])
-D = rbind(B, C)
-V = cbind(as.frame(A), D)
-Vp = removeEmpty(target=V, margin=$2)
-X = as.matrix(Vp[, 1:(ncol(Vp)-2)])
-write(X, $3);
+V = as.frame(A)
+Vp = removeEmpty(target=V, margin=$3)
+write(Vp, $4);
diff --git a/src/test/scripts/functions/frame/removeEmpty1.dml
b/src/test/scripts/functions/frame/removeEmpty2.dml
similarity index 76%
copy from src/test/scripts/functions/frame/removeEmpty1.dml
copy to src/test/scripts/functions/frame/removeEmpty2.dml
index 696880e..561e812 100644
--- a/src/test/scripts/functions/frame/removeEmpty1.dml
+++ b/src/test/scripts/functions/frame/removeEmpty2.dml
@@ -21,10 +21,6 @@
A = read($1, naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"])
-B = frame(data=["TRUE", "abc"], rows=nrow(A) / 2, cols=2, schema=["BOOLEAN",
"STRING"])
-C = frame(data=["FALSE", "0.0"], rows=nrow(A) / 2, cols=2, schema=["BOOLEAN",
"STRING"])
-D = rbind(B, C)
-V = cbind(as.frame(A), D)
-Vp = removeEmpty(target=V, margin=$2)
-X = as.matrix(Vp[, 1:(ncol(Vp)-2)])
-write(X, $3);
+V = as.frame(A)
+Vp = removeEmpty(target=V, margin=$3, select=read($2))
+write(Vp, $4);