This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 5b9db8767e [SYSTEMDS-3708] raJoin and raGroupby builtin functions 5b9db8767e is described below commit 5b9db8767ea3f0728afe0630138771a8f77c0369 Author: gghsu <ppp432...@gmail.com> AuthorDate: Sun Jun 30 10:16:51 2024 +0200 [SYSTEMDS-3708] raJoin and raGroupby builtin functions LDE project SoSe'24, part 2 Closes #2037. --- scripts/builtin/raGroupby.dml | 92 +++++++++ scripts/builtin/{raSelection.dml => raJoin.dml} | 39 ++-- scripts/builtin/raSelection.dml | 2 +- .../java/org/apache/sysds/common/Builtins.java | 2 + .../builtin/part2/BuiltinRaGroupbyTest.java | 164 ++++++++++++++++ .../functions/builtin/part2/BuiltinRaJoinTest.java | 212 +++++++++++++++++++++ src/test/scripts/functions/builtin/raGroupby.dml | 28 +++ src/test/scripts/functions/builtin/raJoin.dml | 29 +++ 8 files changed, 550 insertions(+), 18 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml new file mode 100644 index 0000000000..35f5996616 --- /dev/null +++ b/scripts/builtin/raGroupby.dml @@ -0,0 +1,92 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# This raGroupby-function takes a matrix dataset as input from where it performs +# relational operations : groupby +# +# INPUT: +# ------------------------------------------------------------------------------ +# X Matrix of input data [shape: N x M] +# col Integer indicating the column index to execute groupby command +# method Groupby implementation method (nested-loop) +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# Y Matrix of grouped data [shape: G x (R*(M-1)+1)], one row per distinct key (sorted) with the key in column 1, followed by the group's rows without the key column, zero-padded to the largest group size R +# ------------------------------------------------------------------------------ + +m_raGroupby = function (Matrix[Double] X, Integer col, String method="nested-loop") + return (Matrix[Double] Y) +{ + # Extract and sort unique values from the specified column (1-based index) + uniqueValues = unique(X[, col]) + order_uniqueValues = order(target = uniqueValues, by = 1); + + # Calculate the number of groups + numGroups = nrow(uniqueValues) + + # Determine the maximum number of rows in any group + maxRowsInGroup = 0 + for(i in 1:numGroups){ + groupValue = uniqueValues[i,1] + groupRows = ( X[,col] == groupValue ) + + groupSize = sum(groupRows) + if( groupSize > maxRowsInGroup ){ + maxRowsInGroup = groupSize + } + } + + # Define a zero matrix to put the group data into + Y=matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) + + # Put the ordered uniqueValues into first column of Y as group_id + Y[,1] = order_uniqueValues + + # Loop for each group + for(i in 1:numGroups){ + index = 0 + + # Iterate each row in matrix X to deal with group data + for ( j in 1:nrow(X) ) { + if ( as.scalar( X[j,col] == order_uniqueValues[i,1] )) { + # Define the formula of the start and end column position + startCol = index*(ncol(X)-1) +2 + endCol = startCol + (ncol(X)-2) + + if (col == 1) { + # Case when the selected column is the 
first column + Y[i,startCol:endCol]=X[j,2:ncol(X)] + } else if (col == ncol(X)) { + # Case when the selected column is the last column + Y[i,startCol:endCol]=X[j,1:(ncol(X)-1)] + } else { + # General case + newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)]) + Y[i,startCol:endCol]=newRow + } + index = index +1 + } + } + } +} + diff --git a/scripts/builtin/raSelection.dml b/scripts/builtin/raJoin.dml similarity index 56% copy from scripts/builtin/raSelection.dml copy to scripts/builtin/raJoin.dml index 70d05b5cc8..333a1f3e8d 100644 --- a/scripts/builtin/raSelection.dml +++ b/scripts/builtin/raJoin.dml @@ -19,34 +19,39 @@ # #------------------------------------------------------------- -# This raSelection-function takes a matrix data set as input from where it performs -# relational operations : selection +# This raJoin-function takes two matrix datasets as input from where it performs +# relational operations : join # # INPUT: # ------------------------------------------------------------------------------ -# X Matrix of input data [shape: N x M] -# col Integer indicating the column index to execute selection command -# op String specifying the comparison operator (e.g., ">", "<", "=="). 
-# val Constant value to compare the column values "with col op val' +# A Matrix of left input data [shape: N x M] +# colA Integer indicating the column index of matrix A to execute inner join command +# B Matrix of right input data [shape: N x M] +# colB Integer indicating the column index of matrix B to execute inner join command +# method Join implementation method (nested-loop) # ------------------------------------------------------------------------------ # # OUTPUT: # ------------------------------------------------------------------------------ -# Y Matrix of selected data [shape N' x M] with N' <= N +# Y Matrix of joined data [shape N' x (ncol(A)+ncol(B))], one row per matching pair of rows from A and B # ------------------------------------------------------------------------------ -m_raSelection = function (Matrix[Double] X, Integer col, String op, Double val) +m_raJoin = function (Matrix[Double] A, Integer colA, Matrix[Double] B, + Integer colB, String method="nested-loop") return (Matrix[Double] Y) { - # Dertimine the operators - I = ifelse(op == "==", X[,col] == val, - ifelse(op == "!=", X[,col] != val, - ifelse(op == "<", X[,col] < val, - ifelse(op == ">", X[,col] > val, - ifelse(op == "<=", X[,col] <= val, - X[,col] >= val))))) + # matrix of result data + Y = matrix(0, rows=0, cols=ncol(A) + ncol(B) ) - # Perform actual selection - Y = removeEmpty(target=X, margin="rows", select=I); + for (i in 1:nrow(A)) { + for (j in 1:nrow(B)) { + if (as.scalar(A[i, colA] == B[j, colB])) { + # Combine the matching rows from A and B into one joined row + match = cbind(A[i,], B[j,]) + # merge the match row into result Y + Y = rbind(Y, match) + } + } + } } diff --git a/scripts/builtin/raSelection.dml b/scripts/builtin/raSelection.dml index 70d05b5cc8..b94e8ad1c1 100644 --- a/scripts/builtin/raSelection.dml +++ b/scripts/builtin/raSelection.dml @@ -27,7 +27,7 @@ # X Matrix of input data [shape: N x M] # col Integer indicating the column index to execute selection command # op String specifying the comparison operator (e.g., ">", 
"<", "=="). -# val Constant value to compare the column values "with col op val' +# val Constant value to compare the column values "with col op val" # ------------------------------------------------------------------------------ # # OUTPUT: diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index a358b2e2bc..8baee4ec2f 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -275,6 +275,8 @@ public enum Builtins { RANDOM_FOREST("randomForest", true), RANDOM_FOREST_PREDICT("randomForestPredict", true), RANGE("range", false), + RAGROUPBY("raGroupby", true), + RAJOIN("raJoin", true), RASELECTION("raSelection", true), RBIND("rbind", false), RCM("rowClassMeet", "rcm", false, false, ReturnType.MULTI_RETURN), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaGroupbyTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaGroupbyTest.java new file mode 100644 index 0000000000..6db3c46d3b --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaGroupbyTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Arrays; +import java.util.HashMap; + +public class BuiltinRaGroupbyTest extends AutomatedTestBase +{ + private final static String TEST_NAME = "raGroupby"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinRaGroupbyTest.class.getSimpleName() + "/"; + private final static double eps = 1e-8; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"result"})); + } + + @Test + public void testRaGroupbyTest() { + //generate actual dataset and variables + double[][] X = { + {1, 2, 3}, + {4, 7, 8}, + {1, 3, 6}, + {4, 7, 8}, + {4, 8, 9}}; + int select_col = 1; + + // Expected output matrix + double[][] Y = { + {1, 2, 3, 3, 6, 0, 0}, + {4, 7, 8, 7, 8, 8, 9} + }; + + runRaGroupbyTest(X, select_col, Y); + } + + @Test + public void testRaGroupbyTestwithDifferentColumn() { + //generate actual dataset and variables + double[][] X = { + {1, 2, 3}, + {4, 7, 8}, + {1, 3, 6}, + {4, 7, 8}, + {4, 8, 9}}; + int select_col = 2; + + // Expected output matrix + double[][] Y = { + {2, 1, 3, 0, 0}, + {3, 1, 6, 0, 0}, + {7, 4, 8, 4, 8}, + {8, 4, 9, 0, 0} + }; + + runRaGroupbyTest(X, select_col, Y); + } + + @Test + public void testRaGroupbyTestwithNoGroup() { + // Test case with different values in select_col + double[][] X = { + {1, 1, 1}, + {2, 2, 2}, + {3, 1, 3}, + {4, 2, 4}, + {5, 1, 5}}; + int select_col = 3; + + // Expected output matrix + double[][] Y = { + {1, 1, 1}, + 
{2, 2, 2}, + {3, 3, 1}, + {4, 4, 2}, + {5, 5, 1} + }; + + runRaGroupbyTest(X, select_col, Y); + } + + @Test + public void testRaGroupbyTestwithOneGroup() { + //generate actual dataset and variables + double[][] X = { + {1, 2, 3, 8, 2}, + {4, 7, 8, 8, 3}, + {1, 3, 6, 8, 4}, + {4, 7, 8, 8, 5}, + {4, 8, 9, 8, 6}}; + int select_col = 4; + + // Expected output matrix + double[][] Y = { + {8, 1, 2, 3, 2, 4, 7, 8, 3, 1, 3, 6, 4, 4, 7, 8, 5, 4, 8, 9, 6}, + }; + + runRaGroupbyTest(X, select_col, Y); + } + + private void runRaGroupbyTest(double [][] X, int col, double [][] Y) + { + ExecMode platformOld = setExecMode(ExecMode.SINGLE_NODE); + + try + { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{"-stats", "-args", + input("X"), String.valueOf(col), output("result") }; + System.out.println(Arrays.deepToString(X)); + System.out.println(col); + //fullRScriptName = HOME + TEST_NAME + ".R"; + //rCmd = "Rscript" + " " + fullRScriptName + " " + // + inputDir() + " " + col + " " + expectedDir(); + + writeInputMatrixWithMTD("X", X, true); + System.out.println(Arrays.deepToString(X)); + //writeExpectedMatrix("result", Y); + + // run dmlScript and RScript + runTest(true, false, null, -1); + //runRScript(true); + + //compare matrices + HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("result"); + HashMap<CellIndex, Double> expectedOutput = TestUtils.convert2DDoubleArrayToHashMap(Y); + //HashMap<CellIndex, Double> rfile = readRMatrixFromExpectedDir("result"); + TestUtils.compareMatrices(dmlfile, expectedOutput, eps, "Stat-DML", "Expected"); + } + finally { + rtplatform = platformOld; + } + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaJoinTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaJoinTest.java new file mode 100644 index 0000000000..6c5ea9d8ac --- /dev/null +++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinRaJoinTest.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Arrays; +import java.util.HashMap; + +public class BuiltinRaJoinTest extends AutomatedTestBase +{ + private final static String TEST_NAME = "raJoin"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinRaJoinTest.class.getSimpleName() + "/"; + private final static double eps = 1e-8; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"result"})); + } + + @Test + public void testRaJoinTest() { + //generate actual dataset and variables + double[][] A = { + {1, 2, 3}, + {4, 7, 8}, + {1, 3, 6}, + {4, 3, 5}, + {5, 8, 9} + }; + double[][] B = { + {1, 2, 9}, + 
{3, 7, 6}, + {2, 8, 5}, + {4, 7, 8}, + {4, 5, 10} + }; + int colA = 1; + int colB = 1; + + // Expected output matrix + double[][] Y = { + {1, 2, 3, 1, 2, 9}, + {4, 7, 8, 4, 7, 8}, + {4, 7, 8, 4, 5, 10}, + {1, 3, 6, 1, 2, 9}, + {4, 3, 5, 4, 7, 8}, + {4, 3, 5, 4, 5, 10}, + }; + runRaJoinTest(A, colA, B, colB, Y); + } + + @Test + public void testRaJoinTestwithDifferentColumn() { + // Generate actual dataset and variables + double[][] A = { + {1, 5, 3}, + {2, 6, 8}, + {3, 7, 6}, + {4, 8, 5}, + {5, 9, 9} + }; + double[][] B = { + {1, 9, 2}, + {2, 8, 7}, + {3, 7, 6}, + {4, 5, 4}, + {5, 6, 1} + }; + int colA = 2; + int colB = 3; + + // Expected output matrix + double[][] Y = { + {2, 6, 8, 3, 7, 6}, + {3, 7, 6, 2, 8, 7} + }; + runRaJoinTest(A, colA, B, colB, Y); + } + + @Test + public void testRaJoinTestwithDifferentColumn2() { + // Generate actual dataset and variables + double[][] A = { + {1, 2, 3, 4, 5}, + {6, 7, 8, 9, 10}, + {11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20}, + {21, 22, 23, 24, 25} + }; + double[][] B = { + {3, 5, 100}, + {1, 10, 200}, + {50, 25, 500} + }; + int colA = 5; // Joining on the 5th column of A + int colB = 2; // Joining on the 2nd column of B + + // Expected output matrix + double[][] Y = { + {1, 2, 3, 4, 5, 3, 5, 100}, + {6, 7, 8, 9, 10, 1, 10, 200}, + {21, 22, 23, 24, 25, 50, 25, 500} + }; + runRaJoinTest(A, colA, B, colB, Y); + } + + @Test + public void testRaJoinTestwithNoMatchingRows() { + // Generate actual dataset and variables + double[][] A = { + {1, 2, 3}, + {2, 3, 4}, + {3, 4, 5} + }; + double[][] B = { + {4, 5, 6}, + {5, 6, 7}, + {6, 7, 8} + }; + int colA = 1; + int colB = 1; + + // Expected output matrix (no matching rows) + double[][] Y = {}; + runRaJoinTest(A, colA, B, colB, Y); + } + + @Test + public void testRaJoinTestwithAllMatchingRows() { + // Generate actual dataset and variables + double[][] A = { + {1, 2, 3}, + {2, 3, 4}, + {3, 4, 5} + }; + double[][] B = { + {1, 2, 6}, + {2, 3, 7}, + {3, 4, 8} + }; + int colA = 1; + int 
colB = 1; + + // Expected output matrix (all rows match) + double[][] Y = { + {1, 2, 3, 1, 2, 6}, + {2, 3, 4, 2, 3, 7}, + {3, 4, 5, 3, 4, 8} + }; + runRaJoinTest(A, colA, B, colB, Y); + } + + private void runRaJoinTest(double [][] A, int colA, double [][] B, int colB, double [][] Y) + { + ExecMode platformOld = setExecMode(ExecMode.SINGLE_NODE); + + try + { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{"-stats", "-args", + input("A"), String.valueOf(colA), input("B"), String.valueOf(colB), output("result") }; + System.out.println(Arrays.deepToString(A)); + System.out.println(colA); + //fullRScriptName = HOME + TEST_NAME + ".R"; + //rCmd = "Rscript" + " " + fullRScriptName + " " + // + inputDir() + " " + col + " " + expectedDir(); + + writeInputMatrixWithMTD("A", A, true); + writeInputMatrixWithMTD("B", B, true); + + // run dmlScript and RScript + runTest(true, false, null, -1); + //runRScript(true); + + //compare matrices + HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("result"); + HashMap<CellIndex, Double> expectedOutput = TestUtils.convert2DDoubleArrayToHashMap(Y); + //HashMap<CellIndex, Double> rfile = readRMatrixFromExpectedDir("result"); + TestUtils.compareMatrices(dmlfile, expectedOutput, eps, "Stat-DML", "Expected"); + } + finally { + rtplatform = platformOld; + } + } +} diff --git a/src/test/scripts/functions/builtin/raGroupby.dml b/src/test/scripts/functions/builtin/raGroupby.dml new file mode 100644 index 0000000000..b93f9add14 --- /dev/null +++ b/src/test/scripts/functions/builtin/raGroupby.dml @@ -0,0 +1,28 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1) +col = as.integer($2) + +result = raGroupby(X, col, "nested-loop"); +write(result, $3); +print(toString(result)) + diff --git a/src/test/scripts/functions/builtin/raJoin.dml b/src/test/scripts/functions/builtin/raJoin.dml new file mode 100644 index 0000000000..08483b8ea8 --- /dev/null +++ b/src/test/scripts/functions/builtin/raJoin.dml @@ -0,0 +1,29 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +A= read($1) +colA = as.integer($2) +B = read($3) +colB = as.integer($4) + +result = raJoin(A, colA, B, colB, "nested-loop"); +write(result, $5); +