This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 6a295da [MINOR] Cleaning Pipelines - Test added for logical
enumeration
6a295da is described below
commit 6a295dacf6507d7be527fd40877bb037034f10b1
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Sat Jul 17 18:40:32 2021 +0200
[MINOR] Cleaning Pipelines - Test added for logical enumeration
---
scripts/builtin/bandit.dml | 9 +-
scripts/builtin/topk_cleaning.dml | 5 +-
.../BuiltinTopkCleaningClassificationTest.java | 9 +-
...cationTest.java => BuiltinTopkLogicalTest.java} | 45 +++--
.../functions/pipelines/topkLogicalTest.dml | 194 +++++++++++++++++++++
5 files changed, 239 insertions(+), 23 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 0f99ff6..2254cb2 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -381,9 +381,9 @@ extractTopK = function(Frame[Unknown] pipeline,
Matrix[Double] hyperparam,
# remove the zero rows, identifiers of unique records
dup = removeEmpty(target = dup, margin="rows")
# get the counts of duplicate tuples with their tuple id
- countDist = table(dup, 1) > 0
- countDist = countDist * seq(1, nrow(countDist))
- countsVal = removeEmpty(target= countDist, margin="rows")
+ dist = table(dup, 1) > 0
+ dist = dist * seq(1, nrow(dist))
+ countsVal = removeEmpty(target= dist, margin="rows")
indexes = table(seq(1, nrow(countsVal)),countsVal,1,nrow(countsVal),
cols=nrow(forDedup))
# for each duplicate record just take the one record and strip the others
@@ -476,7 +476,8 @@ return (Matrix[Double] features)
# OHE features
OHE = sum(colMaxs(X) * mask)
features[1, 10] = OHE
- distVal = countDistinct(Y)
+ tab = table(Y, 1)
+ distVal = nrow(tab)
if(nrow(Y) > 1 & distVal <= 10)
{
ctab = table(Y, 1)
diff --git a/scripts/builtin/topk_cleaning.dml
b/scripts/builtin/topk_cleaning.dml
index 2ff7c8d..7ef6545 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -121,8 +121,9 @@ s_topk_cleaning = function(Frame[Unknown] data,
Frame[Unknown] metaData = as.fra
"4", "MVI", "OTLR", "MVI", "SCALE"
], rows=8, cols=5)
-
- if((nrow(Y) > 0 & countDistinct(Y) < 10))
+ tab = table(Y, 1)
+ dist = nrow(tab)
+ if((nrow(Y) > 0 & dist < 10))
logical = logicalSeedCI
else
logical = logicalSeedNoCI
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index dae2918..33d2b96 100644
---
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -47,7 +47,14 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
@Test
public void testFindBestPipeline() {
- runtopkCleaning(0.1, 3,5, TEST_NAME1,
Types.ExecMode.SINGLE_NODE);
+ runtopkCleaning(0.1, 3,5,
+ TEST_NAME1, Types.ExecMode.SINGLE_NODE);
+ }
+
+ @Ignore
+ public void testFindBestPipelineHybrid() {
+ runtopkCleaning(0.1, 3,5,
+ TEST_NAME1, Types.ExecMode.HYBRID);
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
similarity index 64%
copy from
src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
copy to
src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
index dae2918..975836d 100644
---
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
@@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
+
package org.apache.sysds.test.functions.pipelines;
import org.apache.sysds.common.Types;
@@ -23,13 +24,13 @@ import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
-public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
- private final static String TEST_NAME1 =
"topkcleaningClassificationTest";
- private final static String TEST_CLASS_DIR = SCRIPT_DIR +
BuiltinTopkCleaningClassificationTest.class.getSimpleName() + "/";
+public class BuiltinTopkLogicalTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "topkLogicalTest";
+ private final static String TEST_CLASS_DIR = SCRIPT_DIR +
BuiltinTopkLogicalTest.class.getSimpleName() + "/";
- private final static String TEST_DIR = "functions/pipelines/";
private static final String RESOURCE =
SCRIPT_DIR+"functions/pipelines/";
private static final String DATA_DIR = DATASET_DIR+ "pipelines/";
@@ -39,29 +40,43 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
private static final String PARAM_DIR =
"./scripts/pipelines/properties/";
private final static String PARAM = PARAM_DIR + "param.csv";
private final static String PRIMITIVES = PARAM_DIR +
"testPrimitives.csv";
+ private final static String OUTPUT =
RESOURCE+"intermediates/logical.csv";
+ private final static double dirtyScore = 0.7;
@Override
public void setUp() {
- addTestConfiguration(TEST_NAME1,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
+ addTestConfiguration(TEST_NAME,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"R"}));
+ }
+
+ @Test
+ public void testLogical1() {
+ runTestLogical(10, 5, 2, Types.ExecMode.SINGLE_NODE);
}
@Test
- public void testFindBestPipeline() {
- runtopkCleaning(0.1, 3,5, TEST_NAME1,
Types.ExecMode.SINGLE_NODE);
+ public void testLogical2() {
+ runTestLogical(2, 3, 2,
+ Types.ExecMode.SINGLE_NODE);
}
+ @Test
+ public void testLogicalHybrid() {
+ runTestLogical(3, 3, 2,
+ Types.ExecMode.HYBRID);
+ }
- private void runtopkCleaning(Double sample, int topk, int resources,
String testName, Types.ExecMode et) {
+ private void runTestLogical(int max_iter, int num_inst, int num_exec,
Types.ExecMode et) {
- setOutputBuffering(true);
+ // setOutputBuffering(true);
+ String HOME = SCRIPT_DIR+"functions/pipelines/" ;
Types.ExecMode modeOld = setExecMode(et);
- String HOME = SCRIPT_DIR + TEST_DIR;
try {
- loadTestConfiguration(getTestConfiguration(testName));
- fullDMLScriptName = HOME + testName + ".dml";
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[] {"-stats", "-exec",
"singlenode", "-nvargs", "dirtyData="+DIRTY,
- "metaData="+META, "primitives="+PRIMITIVES,
"parameters="+PARAM, "topk="+ topk, "rv="+ resources,
- "sample="+sample, "O="+output("O")};
+ "metaData="+META, "primitives="+PRIMITIVES,
"parameters="+PARAM, "max_iter="+ max_iter,
+ "num_inst="+ num_inst, "num_exec="+ num_exec,
+ "dirtyScore="+dirtyScore, "output="+OUTPUT,
"O="+output("O")};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
@@ -72,6 +87,4 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
resetExecMode(modeOld);
}
}
-
-
}
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
new file mode 100644
index 0000000..688999d
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
+
+
+# read the inputs
+X = read($dirtyData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+dirtyScore = $dirtyScore
+
+max_iter = $max_iter
+num_inst = $num_inst
+num_exec = $num_exec
+
+
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
+
+# encode the categorical data
+if(sum(getMask) > 0)
+{
+ # always recode the label
+ index = vectorToCsv(getMask)
+ jspecR = "{ids:true, recode:["+index+"]}"
+ [eX, X_meta] = transformencode(target=X, spec=jspecR);
+ # change the schema to reflect the encoded values
+ getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+ getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+}
+# if no categorical value exist then just cast the frame into matrix
+else
+ eX = as.matrix(X)
+
+# extract the class label
+eY = eX[, ncol(eX)]
+eX = eX[, 1:ncol(eX) - 1]
+
+print("y classes \n"+toString(table(eY, 1)))
+getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
+getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
+
+metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0))
+
+logical = frame([
+ "1", "MVI", "0", "0", "0", "0",
+ # "1", "OTLR", "0", "0", "0", "0",
+ # "1", "CI", "0", "0", "0", "0",
+ # "2", "MVI", "CI", "0", "0", "0",
+ # "2", "MVI", "OTLR", "0", "0", "0",
+ # "2", "MVI", "SCALE", "0", "0", "0",
+ # "3", "MVI", "SCALE", "OTLR", "0", "0",
+ # "4", "MVI", "OTLR", "CI", "SCALE", "0",
+ # "4", "OTLR", "MVI", "CI", "SCALE", "0",
+ "5", "MVI", "OTLR", "MVI", "CI", "SCALE"
+ ], rows=2, cols=6)
+
+
+categories = frame(["MVI", "OTLR", "SCALE"], rows=1, cols=3)
+cmr = matrix("4 0.7 1", rows=1, cols=3)
+[bestLogical, score, T] = lg::enumerateLogical(X=eX, y=eY, cmr=cmr,
cat=categories, population=logical,
+ max_iter=max_iter, metaList = metaList,
evaluationFunc="evalClassification", evalFunHp=matrix("1 1e-3 1e-9 100",
rows=1, cols=4),
+ primitives=primitives, param=param , num_inst=num_inst, num_exec=num_exec,
isTailed=TRUE, verbose=TRUE)
+
+print("score of pipeline: "+toString(score)+" in "+(T/60000)+" mins")
+print("bestLogical "+toString(bestLogical))
+
+result = dirtyScore < score
+print("result satisfied ------------"+result)
+
+write(result , $O)
+
+
+
+# UDF for evaluation
+# choice of parameters provided by API, X, Y, clone_X, evalFunHp
(hyper-param), trainML (boolean for optimizing hp internally or passed by
externally )
+evalClassification = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xorig, List[Unknown] metaList,
+ Matrix[Double] evalFunHp, Integer trainML=0)
+
+return(Matrix[Double] output)
+{
+ cv = 2
+ mask = as.matrix(metaList['mask'])
+ print("min and max of y in eval: "+min(Y)+" "+max(Y))
+ if(max(Y) == min(Y)) {
+ print("Y contains only one class")
+ accuracy = as.double(0)
+ }
+ else {
+ if(trainML == 1)
+ {
+ # do the gridsearch for hyper-parameters
+ params = list("icpt", "reg", "tol", "maxii")
+ paramRanges = list(seq(0, 2, 1), 10^seq(1,-4), 10^seq(1,-6),
10^seq(1,3));
+
+ trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=-1, maxi=100, maxii=-1,
verbose=FALSE);
+ [B1, opt] = utils::topk_gridSearch(X=X, y=Y, train="multiLogReg",
predict="W", numB=ncol(X)+1, cv=TRUE, cvk=cv,
+ params=params, paramValues=paramRanges, trainArgs=trainArgs,
verbose=FALSE);
+ evalFunHp = as.matrix(opt)
+ }
+
+ # do the k = 3 cross validations
+ # evalFunHpM = as.matrix(evalFunHp)
+ [accuracyMatrix] = crossV(X, Y, cv, evalFunHp, FALSE)
+ accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+ score = mean(accuracyMatrix)
+ print(cv +" validation accuracy "+score)
+ }
+ output = cbind(as.matrix(score), evalFunHp)
+}
+
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of
the
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
+
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] MLhp, Boolean isWeighted)
+return (Matrix[Double] accuracyMatrix)
+{
+ accuracyMatrix = matrix(0, k, 1)
+ dataList = list()
+ testL = list()
+ data = order(target = cbind(y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
+ classes = table(data[, 1], 1)
+ ins_per_fold = classes/k
+ start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
+ fold_idxes = cbind(start_fold, ins_per_fold)
+
+ start_i = 0; end_i = 0; idx_fold = 1;;
+ for(i in 1:k)
+ {
+ fold_i = matrix(0, 0, ncol(data))
+ start=0; end=0;
+ for(j in 1:nrow(classes))
+ {
+ idx = as.scalar(classes[j, 1])
+ start = end + 1;
+ end = end + idx
+ class_j = data[start:end, ]
+ start_i = as.scalar(fold_idxes[j, 1]);
+ end_i = as.scalar(fold_idxes[j, 2])
+ fold_i = rbind(fold_i, class_j[start_i:end_i, ])
+ }
+ dataList = append(dataList, fold_i)
+ fold_idxes[, 1] = fold_idxes[, 2] + 1
+ fold_idxes[, 2] += ins_per_fold
+ }
+
+ for(i in seq(1,k))
+ {
+ [trainList, hold_out] = remove(dataList, i)
+ trainset = rbind(trainList)
+ testset = as.matrix(hold_out)
+ trainX = trainset[, 2:ncol(trainset)]
+ trainy = trainset[, 1]
+ testX = testset[, 2:ncol(testset)]
+ testy = testset[, 1]
+ beta = multiLogReg(X=trainX, Y=trainy, icpt=as.scalar(MLhp[1,1]),
reg=as.scalar(MLhp[1,2]), tol=as.scalar(MLhp[1,3]),
+ maxi=as.scalar(MLhp[1,4]), maxii=50, verbose=FALSE);
+ [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
+ accuracy = getAccuracy(testy, yhat, isWeighted)
+ accuracyMatrix[i] = accuracy
+ }
+}