This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 096ca06a68 [SYSTEMDS-3498] Fix unique() for inputs with many distinct 
items
096ca06a68 is described below

commit 096ca06a686904a2422765dd0f45444c0fa4f446
Author: Matthias Boehm <[email protected]>
AuthorDate: Fri Feb 10 19:42:31 2023 +0100

    [SYSTEMDS-3498] Fix unique() for inputs with many distinct items
    
    This patch fixes the correctness of the existing unique() operation
    for obtaining the set of distinct items. So far the distinct items
    were placed by some heuristics of tall/wide matrices into the output,
    but if the output size exceeded the spark block size, crashed. R's
    unique semantics are to deduplicate values or rows (for multi-column
    inputs). For now, this patch ensures correctness but only supports
    input vectors which can be extended in the future.
---
 .../sysds/runtime/matrix/data/LibMatrixSketch.java | 94 ++++++----------------
 .../org/apache/sysds/test/AutomatedTestBase.java   |  4 +-
 .../sysds/test/functions/unique/UniqueRowCol.java  | 74 +----------------
 3 files changed, 26 insertions(+), 146 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
index 3793564dbe..a848ef9480 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
@@ -21,40 +21,39 @@ package org.apache.sysds.runtime.matrix.data;
 
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.common.Types;
-import org.apache.sysds.hops.OptimizerUtils;
 
-import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.List;
 
 public class LibMatrixSketch {
 
-       private enum MatrixShape {
-               SKINNY,  // rows > cols
-               WIDE,    // rows < cols
-       }
-
        public static MatrixBlock getUniqueValues(MatrixBlock blkIn, 
Types.Direction dir) {
-
-               int R = blkIn.getNumRows();
-               int C = blkIn.getNumColumns();
-               List<HashSet<Double>> hashSets = new ArrayList<>();
-
-               MatrixShape matrixShape = (R >= C)? MatrixShape.SKINNY : 
MatrixShape.WIDE;
-               MatrixBlock blkOut;
-               switch (dir)
-               {
+               //similar to R's unique, this operation takes a matrix and 
computes the
+               //unique values (or rows in case of multiple column inputs)
+               
+               int rlen = blkIn.getNumRows();
+               int clen = blkIn.getNumColumns();
+
+               MatrixBlock blkOut = null;
+               switch (dir) {
                        case RowCol:
+                               if( clen != 1 )
+                                       throw new 
NotImplementedException("Unique only support single-column vectors yet");
+                               // TODO optimize for dense/sparse/compressed 
(once multi-column support added)
+                               
+                               // obtain set of unique items (dense input 
vector)
                                HashSet<Double> hashSet = new HashSet<>();
-                               // TODO optimize for sparse and compressed 
inputs
-                               for (int i=0; i<R; ++i) {
-                                       for (int j=0; j<C; ++j) {
-                                               hashSet.add(blkIn.getValue(i, 
j));
-                                       }
+                               for( int i=0; i<rlen; i++ ) {
+                                       hashSet.add(blkIn.quickGetValue(i, 0));
+                               }
+                               
+                               // allocate output block and place values
+                               int rlen2 = hashSet.size();
+                               blkOut = new MatrixBlock(rlen2, 1, 
false).allocateBlock();
+                               Iterator<Double> iter = hashSet.iterator();
+                               for( int i=0; i<rlen2; i++ ) {
+                                       blkOut.quickSetValue(i, 0, iter.next());
                                }
-                               hashSets.add(hashSet);
-                               blkOut = serializeRowCol(hashSets, dir, 
matrixShape);
                                break;
 
                        case Row:
@@ -67,51 +66,4 @@ public class LibMatrixSketch {
 
                return blkOut;
        }
-
-       private static MatrixBlock serializeRowCol(List<HashSet<Double>> 
hashSets, Types.Direction dir, MatrixShape matrixShape) {
-
-               if (dir != Types.Direction.RowCol) {
-                       throw new IllegalArgumentException("Unrecognized 
direction: " + dir);
-               }
-
-               MatrixBlock blkOut;
-
-               if (hashSets.isEmpty()) {
-                       throw new IllegalArgumentException("Corrupt sketch: 
metadata cannot be empty");
-               }
-
-               int R, C;
-               HashSet<Double> hashSet = hashSets.get(0);
-               Iterator<Double> iter = hashSet.iterator();
-
-               if (hashSet.size() <= OptimizerUtils.DEFAULT_BLOCKSIZE) {
-                       if (matrixShape == MatrixShape.SKINNY) {
-                               // Rx1 column vector
-                               R = hashSet.size();
-                               C = 1;
-                       } else {  // WIDE
-                               // 1xC row vector
-                               R = 1;
-                               C = hashSet.size();
-                       }
-               } else {
-                       if (matrixShape == MatrixShape.SKINNY) {
-                               R = OptimizerUtils.DEFAULT_BLOCKSIZE;
-                               C = (hashSet.size() / 
OptimizerUtils.DEFAULT_BLOCKSIZE) + 1;
-                       } else {  // WIDE
-                               R = (hashSet.size() / 
OptimizerUtils.DEFAULT_BLOCKSIZE) + 1;
-                               C = OptimizerUtils.DEFAULT_BLOCKSIZE;
-                       }
-               }
-
-               blkOut = new MatrixBlock(R, C, false);
-               for (int i=0; i<R; ++i) {
-                       // C is guaranteed to be > 0
-                       for (int j=0; j<C; ++j) {
-                               blkOut.setValue(i, j, iter.next());
-                       }
-               }
-
-               return blkOut;
-       }
 }
diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java 
b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
index b7f272cdbd..4a20b25290 100644
--- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
@@ -1947,8 +1947,8 @@ public abstract class AutomatedTestBase {
                                
TestUtils.compareDMLScalarWithJavaScalar(javaFile, dmlFile, epsilon);
                        }
                        else {
-                               TestUtils
-                                       
.compareDMLMatrixWithJavaMatrixRowsOutOfOrder(comparisonFiles[i], 
outputDirectories[i], epsilon);
+                               
TestUtils.compareDMLMatrixWithJavaMatrixRowsOutOfOrder(
+                                       comparisonFiles[i], 
outputDirectories[i], epsilon);
                        }
                }
        }
diff --git 
a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java 
b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
index a8b2fc1ba7..7f6ad0ff56 100644
--- a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
+++ b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
@@ -66,80 +66,8 @@ public class UniqueRowCol extends UniqueBase {
 
        @Test
        public void testWideSmallCP() {
-               double[][] inputMatrix = {{1,1,6,9,4,2,0,9,0,0,4,4}};
+               double[][] inputMatrix = 
{{1},{1},{6},{9},{4},{2},{0},{9},{0},{0},{4},{4}};
                double[][] expectedMatrix = {{1,6,9,4,2,0}};
                uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
        }
-
-       @Test
-       public void testSquareLargeCP() {
-               double[][] inputMatrix = new double[1000][1000];
-               // Input is a 1000 x 1000 matrix:
-               // [1, 1, ..., 1, 2, 2, .., 2]
-               // [1, 1, ..., 1, 2, 2, .., 2]
-               // ..
-               // [1, 1, ..., 1, 2, 2, .., 2]
-               // [2, 2, ..., 2, 1, 1, .., 1]
-               // [2, 2, ..., 2, 1, 1, .., 1]
-               // ..
-               // [2, 2, ..., 2, 1, 1, .., 1]
-               for (int i=0; i<500; ++i) {
-                       for (int j=0; j<500; ++j) {
-                               inputMatrix[i][j] = 1;
-                               inputMatrix[i+500][j+500] = 1;
-                       }
-               }
-               for (int i=500; i<1000; ++i) {
-                       for (int j=0; j<500; ++j) {
-                               inputMatrix[i][j] = 2;
-                               inputMatrix[i-500][j+500] = 2;
-                       }
-               }
-               // Expect the output to be a skinny matrix due to the following 
condition in code:
-               // (R >= C)? LibMatrixSketch.MatrixShape.SKINNY : 
LibMatrixSketch.MatrixShape.WIDE;
-               double[][] expectedMatrix = {{1},{2}};
-               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
-       }
-
-       @Test
-       public void testSkinnyLargeCP() {
-               double[][] inputMatrix = new double[2000][2];
-               // Input is a 2000 x 2 matrix:
-               // [1, 2]
-               // [1, 2]
-               // ..
-               // [1, 2]
-               // [2, 1]
-               // [2, 1]
-               // ..
-               // [2, 1]
-               for (int i=0; i<1000; ++i) {
-                       inputMatrix[i][0] = 1;
-                       inputMatrix[i][1] = 2;
-               }
-               for (int i=1000; i<2000; ++i) {
-                       inputMatrix[i][0] = 2;
-                       inputMatrix[i][1] = 1;
-               }
-               double[][] expectedMatrix = {{1}, {2}};
-               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
-       }
-
-       @Test
-       public void testWideLargeCP() {
-               double[][] inputMatrix = new double[2][2000];
-               // Input is a 2 x 2000 matrix:
-               // [1, 1, ..., 1, 2, 2, .., 2]
-               // [2, 2, ..., 2, 1, 1, .., 1]
-               for (int j=0; j<1000; ++j) {
-                       inputMatrix[0][j] = 1;
-                       inputMatrix[1][j+1000] = 1;
-               }
-               for (int j=1000; j<2000; ++j) {
-                       inputMatrix[0][j] = 2;
-                       inputMatrix[1][j-1000] = 2;
-               }
-               double[][] expectedMatrix = {{1,2}};
-               uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
-       }
 }

Reply via email to