This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 2f3e381 [SYSTEMDS-2902] Minor built-ins for cleaning pipelines
2f3e381 is described below
commit 2f3e3816635a0248f18dde6dea8594b8e27ca2ed
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Mar 17 21:44:26 2021 +0100
[SYSTEMDS-2902] Minor built-ins for cleaning pipelines
---
scripts/builtin/imputeByMode.dml | 60 +++++++++++++++
scripts/builtin/splitBalanced.dml | 89 ++++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 2 +
.../builtin/BuiltinSplitBalancedTest.java | 84 ++++++++++++++++++++
.../scripts/functions/builtin/splitBalanced.dml | 36 +++++++++
5 files changed, 271 insertions(+)
diff --git a/scripts/builtin/imputeByMode.dml b/scripts/builtin/imputeByMode.dml
new file mode 100644
index 0000000..0d55de5
--- /dev/null
+++ b/scripts/builtin/imputeByMode.dml
@@ -0,0 +1,60 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
+
+# impute the data by mode value
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE     DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X      Double   ---      Data Matrix (Recoded Matrix for categorical features)
+# ---------------------------------------------------------------------------------------------
+
+
+# OUTPUT(S):
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE     DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X      Double   ---      Imputed dataset (missing values replaced by the column mode)
+
+
+m_imputeByMode = function(Matrix[Double] X)
+return(Matrix[Double] X)
+{
+  # remember where values are missing, then set those cells to 0 so that they are
+  # filtered out below together with every other value smaller than 1
+  naMask = is.na(X)
+  X = replace(target=X, pattern=NaN, replacement=0)
+  modes = matrix(0, 1, ncol(X))
+  for(j in 1:ncol(X)) {
+    valid = removeEmpty(target=X[, j], margin="rows", select=(X[, j] >= 1))
+    if(sum(valid) != 0) {
+      freq = table(valid, 1, nrow(valid), 1)  # occurrences per category id
+      modes[1, j] = as.scalar(rowIndexMax(t(freq)))  # id with the highest count
+    }
+    else
+      modes[1, j] = 1  # fallback when the remaining values sum to 0
+  }
+  X = X + naMask * modes  # add each column's mode to its (zeroed) missing cells
+}
+
diff --git a/scripts/builtin/splitBalanced.dml b/scripts/builtin/splitBalanced.dml
new file mode 100644
index 0000000..4428443
--- /dev/null
+++ b/scripts/builtin/splitBalanced.dml
@@ -0,0 +1,89 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
+
+# Split input data X and Y into a train and a test set, splitting every class by the same ratio
+# ------------------------------------------------------------------------------
+# NAME        TYPE     DEFAULT  MEANING
+# ------------------------------------------------------------------------------
+# X           Matrix   ---      Input feature matrix
+# Y           Matrix   ---      Input Labels (used as row ids in table(), i.e. assumed to be positive integers)
+# splitRatio  Double   0.7      Train set fraction [0,1]
+# verbose     Boolean  FALSE    Print row count, class counts and per-class train/test sizes
+# ------------------------------------------------------------------------------
+# X_train     Matrix   ---      Train split of feature matrix
+# X_test      Matrix   ---      Test split of feature matrix
+# y_train     Matrix   ---      Train split of label matrix
+# y_test      Matrix   ---      Test split of label matrix
+# ------------------------------------------------------------------------------
+
+m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio = 0.7, Boolean verbose = FALSE)
+return (Matrix[Double] X_train, Matrix[Double] y_train, Matrix[Double] X_test,
+  Matrix[Double] y_test)
+{
+  # sort the rows by label so that every class occupies one contiguous row range
+  XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE, index.return=FALSE)
+  # get the class count
+  classes = table(XY[, 1], 1)
+  split = floor(nrow(X) * splitRatio)
+  start_class = 1
+  train_row_s = 1
+  test_row_s = 1
+  train_row_e = 0
+  test_row_e = 0
+  end_class = 0
+  # allocate one spare row per class; rows that stay all-zero are removed at the end
+  outTrain = matrix(0, split+nrow(classes), ncol(XY))
+  outTest = matrix(0, (nrow(X) - split)+nrow(classes), ncol(XY))
+  # per class: floor(count*splitRatio) rows go to train, the remaining rows go to test
+  classes_ratio_train = floor(classes*splitRatio)
+  classes_ratio_test = classes - classes_ratio_train
+  if(verbose) {
+    print("rows "+nrow(X))
+    print("classes \n"+toString(classes))
+    print("train ratio \n"+toString(classes_ratio_train))
+    print("test ratio \n"+toString(classes_ratio_test))
+  }
+  for(i in 1:nrow(classes))
+  {
+    n_train = as.scalar(classes_ratio_train[i])
+    n_test = as.scalar(classes_ratio_test[i])
+    end_class = end_class + as.scalar(classes[i])
+    train_row_e = train_row_e + n_train
+    test_row_e = test_row_e + n_test
+    # skip empty shares (unused label id or very small class): a range s:e with e < s is not a valid index
+    if(n_train > 0)
+      outTrain[train_row_s:train_row_e, ] = XY[start_class:(start_class+n_train-1), ]
+    if(n_test > 0)
+      outTest[test_row_s:test_row_e, ] = XY[(start_class+n_train):end_class, ]
+    train_row_s = train_row_e + 1
+    test_row_s = test_row_e + 1
+    start_class = end_class + 1
+  }
+  outTrain = removeEmpty(target = outTrain, margin = "rows")
+  outTest = removeEmpty(target = outTest, margin = "rows")
+  # the first column holds the labels, the remaining columns hold the features
+  y_train = outTrain[, 1]
+  X_train = outTrain[, 2:ncol(outTrain)]
+  y_test = outTest[, 1]
+  X_test = outTest[, 2:ncol(outTest)]
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 8854f69..353791c 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -132,6 +132,7 @@ public enum Builtins {
IMG_BRIGHTNESS("img_brightness", true),
IMPUTE_BY_MEAN("imputeByMean", true),
IMPUTE_BY_MEDIAN("imputeByMedian", true),
+ IMPUTE_BY_MODE("imputeByMode", true),
IMG_CROP("img_crop", true),
IMPUTE_FD("imputeByFD", true),
INTERQUANTILE("interQuantile", false),
@@ -221,6 +222,7 @@ public enum Builtins {
SMOTE("smote", true),
SOLVE("solve", false),
SPLIT("split", true),
+ SPLIT_BALANCED("splitBalanced", true),
STATSNA("statsNA", true),
SQRT("sqrt", false),
SUM("sum", false),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java
new file mode 100644
index 0000000..7f47495
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Runs splitBalanced.dml in CP and Spark mode and asserts that the script wrote a boolean TRUE. */
+public class BuiltinSplitBalancedTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "splitBalanced";
+	private final static String TEST_DIR = "functions/builtin/";
+	// built from this test's own class name so that outputs do not land in BuiltinSplitTest's directory
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinSplitBalancedTest.class.getSimpleName() + "/";
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		// "O" is the output that runSplitTest passes to the script and reads back
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"O"}));
+	}
+
+	public double eps = 0.00001;
+	public int cols = 10;
+	public int rows = 150;
+
+	@Test
+	public void test_CP1() {
+		runSplitTest(0.7, LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void test_CP2() {
+		runSplitTest(0.8, LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void test_Spark() {
+		runSplitTest(0.8, LopProperties.ExecType.SPARK);
+	}
+
+	/**
+	 * Runs the test script with the given split ratio and checks the boolean it writes.
+	 * @param splitRatio train fraction, passed to the script as the "split" argument
+	 * @param instType execution type handed to setExecMode
+	 */
+	private void runSplitTest(double splitRatio, ExecType instType) {
+		ExecMode platformOld = setExecMode(instType);
+		try {
+			setOutputBuffering(true);
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[] {"-nvargs", "cols=" + cols, "rows=" + rows, "split="+splitRatio, "out="+output("O")};
+			runTest(true, false, null, -1);
+			Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
+		}
+		finally {
+			// put back the execution mode that setExecMode returned
+			rtplatform = platformOld;
+		}
+	}
+}
diff --git a/src/test/scripts/functions/builtin/splitBalanced.dml b/src/test/scripts/functions/builtin/splitBalanced.dml
new file mode 100644
index 0000000..9fa3215
--- /dev/null
+++ b/src/test/scripts/functions/builtin/splitBalanced.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# random features; labels are 1 plus a sparse 0/1 indicator, so class 2 covers roughly a (1-split) fraction
+X = rand(rows = $rows, cols=$cols, seed=1)
+Y = ceil(rand(rows = $rows, cols=1, seed=13, sparsity= (1-$split))) + 1
+
+classes = table(Y, 1)
+[Xtrain, Ytrain, Xtest, Ytest] = splitBalanced(X=X,Y=Y, splitRatio=$split, verbose=FALSE)
+classCountTrain = table(Ytrain, 1)
+classCountTest = table(Ytest, 1)
+
+# expected per-class counts: floor(count*split) rows in train and the remainder in test; the remainder is
+# derived from the same floor() because ceil(count*(1-split)) can be off by one under floating-point rounding
+expTrain = floor(classes * $split)
+verify = (sum(classCountTrain != expTrain) == 0) & (sum(classCountTest != (classes - expTrain)) == 0)
+write(verify, $out, format="text")
\ No newline at end of file