This is an automated email from the ASF dual-hosted git repository.

bchowdhury pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 5e132c0827 [SYSTEMDS-3652] Adding support for removing duplicate rows 
to the unique() builtin function (#1949)
5e132c0827 is described below

commit 5e132c08279b19640dc24c4cdd4932e3ad0b5ddc
Author: Badrul Chowdhury <[email protected]>
AuthorDate: Tue Dec 5 10:23:27 2023 -0800

    [SYSTEMDS-3652] Adding support for removing duplicate rows to the unique() 
builtin function (#1949)
    
    This patch adds support for removing duplicate rows to the unique() builtin 
function. Currently, only CP is supported- support for SPARK will be added in a 
subsequent patch. Furthermore, we we will explore possible optimizations of the 
naive implementation using bitmaps in a subsequent patch.
---
 .../sysds/runtime/matrix/data/LibMatrixSketch.java | 62 ++++++++++++++++-
 .../sysds/test/functions/unique/UniqueRow.java     | 80 ++++++++++++++++++++++
 src/test/scripts/functions/unique/uniqueRow.dml    | 24 +++++++
 3 files changed, 164 insertions(+), 2 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
index 43a4c1bc5a..29089078a8 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
@@ -22,14 +22,15 @@ package org.apache.sysds.runtime.matrix.data;
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.sysds.common.Types;
 
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
 
 public class LibMatrixSketch {
 
        public static MatrixBlock getUniqueValues(MatrixBlock blkIn, 
Types.Direction dir) {
-               //similar to R's unique, this operation takes a matrix and 
computes the
-               //unique values (or rows in case of multiple column inputs)
+               // similar to R's unique, this operation takes a matrix and 
computes the unique values
+               // (or rows in case of multiple column inputs)
                
                int rlen = blkIn.getNumRows();
                int clen = blkIn.getNumColumns();
@@ -57,6 +58,63 @@ public class LibMatrixSketch {
                                break;
 
                        case Row:
+                               ArrayList<double[]> retainedRows = new 
ArrayList<>();
+
+                               for (int i=0; i<rlen; ++i) {
+
+                                       // BitSet will not work because we need 
2 pieces of info:
+                                       // 1. the index and
+                                       // 2. the value
+                                       // A BitSet gives us only whether there 
is a value at a particular index, but not what that
+                                       // specific value is.
+
+                                       double[] currentRow = new double[clen];
+                                       for (int j=0; j<clen; ++j) {
+                                               double rawValue = 
blkIn.getValue(i, j);
+                                               currentRow[j] = rawValue;
+                                       }
+
+                                       // no need to check for duplicates for 
the first row
+                                       if (i == 0) {
+                                               retainedRows.add(currentRow);
+                                               continue;
+                                       }
+
+                                       // ensure we are not adding duplicate 
rows to retainedRows array
+                                       int uniqueRowCount = 0;
+                                       for (int m=0; m<retainedRows.size(); 
++m) {
+
+                                               double[] prevRow = 
retainedRows.get(m);
+
+                                               int n = 0;
+                                               while (n < clen) {
+                                                       if (prevRow[n] != 
currentRow[n]) {
+                                                               break;
+                                                       }
+                                                       n++;
+                                               }
+
+                                               // column check terminates 
early only if there is a column-level mismatch, ie rows are different
+                                               if (n != clen) {
+                                                       uniqueRowCount++;
+                                               }
+                                       }
+
+                                       // add current row to retainedRows iff 
it is unique from all prev retained rows
+                                       if (uniqueRowCount == 
retainedRows.size()) {
+                                               retainedRows.add(currentRow);
+                                       }
+                               }
+
+                               blkOut = new MatrixBlock(retainedRows.size(), 
blkIn.getNumColumns(), false);
+                               for (int i=0; i<retainedRows.size(); ++i) {
+                                       for (int j=0; j<blkIn.getNumColumns(); 
++j) {
+                                               blkOut.quickSetValue(i, j, 
retainedRows.get(i)[j]);
+                                       }
+                               }
+
+                               break;
+
                        case Col:
                                throw new NotImplementedException("Unique 
Row/Col has not been implemented yet");
 
diff --git 
a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java 
b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java
new file mode 100644
index 0000000000..ee8c664efa
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRow.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.unique;
+
+import org.apache.sysds.common.Types;
+import org.junit.Test;
+
+public class UniqueRow extends UniqueBase {
+       private final static String TEST_NAME = "uniqueRow";
+       private final static String TEST_DIR = "functions/unique/";
+       private static final String TEST_CLASS_DIR = TEST_DIR + 
UniqueRow.class.getSimpleName() + "/";
+
+
+       @Override
+       protected String getTestName() {
+               return TEST_NAME;
+       }
+
+       @Override
+       protected String getTestDir() {
+               return TEST_DIR;
+       }
+
+       @Override
+       protected String getTestClassDir() {
+               return TEST_CLASS_DIR;
+       }
+
+       @Test
+       public void testBaseCaseCP() {
+               double[][] inputMatrix = {{0}};
+               double[][] expectedMatrix = {{0}};
+               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+       }
+
+       @Test
+       public void testSkinnyCP() {
+               double[][] inputMatrix = 
{{1},{1},{6},{9},{4},{2},{0},{9},{0},{0},{4},{4}};
+               double[][] expectedMatrix = {{1},{6},{9},{4},{2},{0}};
+               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+       }
+
+       @Test
+       public void testSquareCP() {
+               double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {1, 2, 3}};
+               double[][] expectedMatrix = {{1, 2, 3},{4, 5, 6}};
+               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+       }
+
+       @Test
+       public void testWideCP() {
+               double[][] inputMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 
12}, {1, 2, 3, 4, 5, 6}};
+               double[][] expectedMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 
11, 12}};
+               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+       }
+
+       @Test
+       public void testNoDuplicatesCP() {
+               double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+               double[][] expectedMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
+       }
+}
diff --git a/src/test/scripts/functions/unique/uniqueRow.dml 
b/src/test/scripts/functions/unique/uniqueRow.dml
new file mode 100644
index 0000000000..53baca49e0
--- /dev/null
+++ b/src/test/scripts/functions/unique/uniqueRow.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+input = read($1);
+res = unique(input, dir="r");
+write(res, $2, format="text");

Reply via email to