This is an automated email from the ASF dual-hosted git repository.
bchowdhury pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 5e132c0827 [SYSTEMDS-3652] Adding support for removing duplicate rows
to the unique() builtin function (#1949)
5e132c0827 is described below
commit 5e132c08279b19640dc24c4cdd4932e3ad0b5ddc
Author: Badrul Chowdhury <[email protected]>
AuthorDate: Tue Dec 5 10:23:27 2023 -0800
[SYSTEMDS-3652] Adding support for removing duplicate rows to the unique()
builtin function (#1949)
This patch adds support for removing duplicate rows to the unique() builtin
function. Currently, only CP is supported- support for SPARK will be added in a
subsequent patch. Furthermore, we we will explore possible optimizations of the
naive implementation using bitmaps in a subsequent patch.
---
.../sysds/runtime/matrix/data/LibMatrixSketch.java | 62 ++++++++++++++++-
.../sysds/test/functions/unique/UniqueRow.java | 80 ++++++++++++++++++++++
src/test/scripts/functions/unique/uniqueRow.dml | 24 +++++++
3 files changed, 164 insertions(+), 2 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
index 43a4c1bc5a..29089078a8 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
@@ -22,14 +22,15 @@ package org.apache.sysds.runtime.matrix.data;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.common.Types;
+import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
public class LibMatrixSketch {
public static MatrixBlock getUniqueValues(MatrixBlock blkIn,
Types.Direction dir) {
- //similar to R's unique, this operation takes a matrix and
computes the
- //unique values (or rows in case of multiple column inputs)
+ // similar to R's unique, this operation takes a matrix and
computes the unique values
+ // (or rows in case of multiple column inputs)
int rlen = blkIn.getNumRows();
int clen = blkIn.getNumColumns();
@@ -57,6 +58,63 @@ public class LibMatrixSketch {
break;
case Row:
+ ArrayList<double[]> retainedRows = new
ArrayList<>();
+
+ for (int i=0; i<rlen; ++i) {
+
+ // BitSet will not work because we need
2 pieces of info:
+ // 1. the index and
+ // 2. the value
+ // A BitSet gives us only whether there
is a value at a particular index, but not what that
+ // specific value is.
+
+ double[] currentRow = new double[clen];
+ for (int j=0; j<clen; ++j) {
+ double rawValue =
blkIn.getValue(i, j);
+ currentRow[j] = rawValue;
+ }
+
+ // no need to check for duplicates for
the first row
+ if (i == 0) {
+ retainedRows.add(currentRow);
+ continue;
+ }
+
+ // ensure we are not adding duplicate
rows to retainedRows array
+ int uniqueRowCount = 0;
+ for (int m=0; m<retainedRows.size();
++m) {
+
+ double[] prevRow =
retainedRows.get(m);
+
+ int n = 0;
+ while (n < clen) {
+ if (prevRow[n] !=
currentRow[n]) {
+ break;
+ }
+ n++;
+ }
+
+ // column check terminates
early only if there is a column-level mismatch, ie rows are different
+ if (n != clen) {
+ uniqueRowCount++;
+ }
+ }
+
+ // add current row to retainedRows iff
it is unique from all prev retained rows
+ if (uniqueRowCount ==
retainedRows.size()) {
+ retainedRows.add(currentRow);
+ }
+ }
+
+ blkOut = new MatrixBlock(retainedRows.size(),
blkIn.getNumColumns(), false);
+ for (int i=0; i<retainedRows.size(); ++i) {
+ for (int j=0; j<blkIn.getNumColumns();
++j) {
+ blkOut.quickSetValue(i, j,
retainedRows.get(i)[j]);
+ }
+ }
+
+ break;
+
case Col:
throw new NotImplementedException("Unique
Row/Col has not been implemented yet");
diff --git
a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java
b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java
new file mode 100644
index 0000000000..ee8c664efa
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.unique;
+
+import org.apache.sysds.common.Types;
+import org.junit.Test;
+
+public class UniqueRow extends UniqueBase {
+ private final static String TEST_NAME = "uniqueRow";
+ private final static String TEST_DIR = "functions/unique/";
+ private static final String TEST_CLASS_DIR = TEST_DIR +
UniqueRow.class.getSimpleName() + "/";
+
+
+ @Override
+ protected String getTestName() {
+ return TEST_NAME;
+ }
+
+ @Override
+ protected String getTestDir() {
+ return TEST_DIR;
+ }
+
+ @Override
+ protected String getTestClassDir() {
+ return TEST_CLASS_DIR;
+ }
+
+ @Test
+ public void testBaseCaseCP() {
+ double[][] inputMatrix = {{0}};
+ double[][] expectedMatrix = {{0}};
+ uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+ }
+
+ @Test
+ public void testSkinnyCP() {
+ double[][] inputMatrix =
{{1},{1},{6},{9},{4},{2},{0},{9},{0},{0},{4},{4}};
+ double[][] expectedMatrix = {{1},{6},{9},{4},{2},{0}};
+ uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+ }
+
+ @Test
+ public void testSquareCP() {
+ double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {1, 2, 3}};
+ double[][] expectedMatrix = {{1, 2, 3},{4, 5, 6}};
+ uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+ }
+
+ @Test
+ public void testWideCP() {
+ double[][] inputMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11,
12}, {1, 2, 3, 4, 5, 6}};
+ double[][] expectedMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10,
11, 12}};
+ uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+ }
+
+ @Test
+ public void testNoDuplicatesCP() {
+ double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+ double[][] expectedMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+ uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+ }
+}
diff --git a/src/test/scripts/functions/unique/uniqueRow.dml
b/src/test/scripts/functions/unique/uniqueRow.dml
new file mode 100644
index 0000000000..53baca49e0
--- /dev/null
+++ b/src/test/scripts/functions/unique/uniqueRow.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+input = read($1);
+res = unique(input, dir="r");
+write(res, $2, format="text");