This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 6a295da [MINOR] Cleaning Pipelines - Test added for logical
enumeration
6a295da is described below
commit 6a295dacf6507d7be527fd40877bb037034f10b1
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Sat Jul 17 18:40:32 2021 +0200
[MINOR] Cleaning Pipelines - Test added for logical enumeration
---
scripts/builtin/bandit.dml | 9 +-
scripts/builtin/topk_cleaning.dml | 5 +-
.../BuiltinTopkCleaningClassificationTest.java | 9 +-
...cationTest.java => BuiltinTopkLogicalTest.java} | 45 +++--
.../functions/pipelines/topkLogicalTest.dml | 194 +++++++++++++++++++++
5 files changed, 239 insertions(+), 23 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 0f99ff6..2254cb2 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -381,9 +381,9 @@ extractTopK = function(Frame[Unknown] pipeline,
Matrix[Double] hyperparam,
# remove the zero rows, identifiers of unique records
dup = removeEmpty(target = dup, margin="rows")
# get the counts of duplicate tuples with their tuple id
- countDist = table(dup, 1) > 0
- countDist = countDist * seq(1, nrow(countDist))
- countsVal = removeEmpty(target= countDist, margin="rows")
+ dist = table(dup, 1) > 0
+ dist = dist * seq(1, nrow(dist))
+ countsVal = removeEmpty(target= dist, margin="rows")
indexes = table(seq(1, nrow(countsVal)),countsVal,1,nrow(countsVal),
cols=nrow(forDedup))
# for each duplicate record just take the one record and strip the others
@@ -476,7 +476,8 @@ return (Matrix[Double] features)
# OHE features
OHE = sum(colMaxs(X) * mask)
features[1, 10] = OHE
- distVal = countDistinct(Y)
+ tab = table(Y, 1)
+ distVal = nrow(tab)
if(nrow(Y) > 1 & distVal <= 10)
{
ctab = table(Y, 1)
diff --git a/scripts/builtin/topk_cleaning.dml
b/scripts/builtin/topk_cleaning.dml
index 2ff7c8d..7ef6545 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -121,8 +121,9 @@ s_topk_cleaning = function(Frame[Unknown] data,
Frame[Unknown] metaData = as.fra
"4", "MVI", "OTLR", "MVI", "SCALE"
], rows=8, cols=5)
-
- if((nrow(Y) > 0 & countDistinct(Y) < 10))
+ tab = table(Y, 1)
+ dist = nrow(tab)
+ if((nrow(Y) > 0 & dist < 10))
logical = logicalSeedCI
else
logical = logicalSeedNoCI
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index dae2918..33d2b96 100644
---
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -47,7 +47,14 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
@Test
public void testFindBestPipeline() {
- runtopkCleaning(0.1, 3,5, TEST_NAME1,
Types.ExecMode.SINGLE_NODE);
+ runtopkCleaning(0.1, 3,5,
+ TEST_NAME1, Types.ExecMode.SINGLE_NODE);
+ }
+
+ @Ignore
+ public void testFindBestPipelineHybrid() {
+ runtopkCleaning(0.1, 3,5,
+ TEST_NAME1, Types.ExecMode.HYBRID);
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
similarity index 64%
copy from
src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
copy to
src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
index dae2918..975836d 100644
---
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkLogicalTest.java
@@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
+
package org.apache.sysds.test.functions.pipelines;
import org.apache.sysds.common.Types;
@@ -23,13 +24,13 @@ import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
-public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
- private final static String TEST_NAME1 =
"topkcleaningClassificationTest";
- private final static String TEST_CLASS_DIR = SCRIPT_DIR +
BuiltinTopkCleaningClassificationTest.class.getSimpleName() + "/";
+public class BuiltinTopkLogicalTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "topkLogicalTest";
+ private final static String TEST_CLASS_DIR = SCRIPT_DIR +
BuiltinTopkLogicalTest.class.getSimpleName() + "/";
- private final static String TEST_DIR = "functions/pipelines/";
private static final String RESOURCE =
SCRIPT_DIR+"functions/pipelines/";
private static final String DATA_DIR = DATASET_DIR+ "pipelines/";
@@ -39,29 +40,43 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
private static final String PARAM_DIR =
"./scripts/pipelines/properties/";
private final static String PARAM = PARAM_DIR + "param.csv";
private final static String PRIMITIVES = PARAM_DIR +
"testPrimitives.csv";
+ private final static String OUTPUT =
RESOURCE+"intermediates/logical.csv";
+ private final static double dirtyScore = 0.7;
@Override
public void setUp() {
- addTestConfiguration(TEST_NAME1,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
+ addTestConfiguration(TEST_NAME,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"R"}));
+ }
+
+ @Test
+ public void testLogical1() {
+ runTestLogical(10, 5, 2, Types.ExecMode.SINGLE_NODE);
}
@Test
- public void testFindBestPipeline() {
- runtopkCleaning(0.1, 3,5, TEST_NAME1,
Types.ExecMode.SINGLE_NODE);
+ public void testLogical2() {
+ runTestLogical(2, 3, 2,
+ Types.ExecMode.SINGLE_NODE);
}
+ @Test
+ public void testLogicalHybrid() {
+ runTestLogical(3, 3, 2,
+ Types.ExecMode.HYBRID);
+ }
- private void runtopkCleaning(Double sample, int topk, int resources,
String testName, Types.ExecMode et) {
+ private void runTestLogical(int max_iter, int num_inst, int num_exec,
Types.ExecMode et) {
- setOutputBuffering(true);
+ // setOutputBuffering(true);
+ String HOME = SCRIPT_DIR+"functions/pipelines/" ;
Types.ExecMode modeOld = setExecMode(et);
- String HOME = SCRIPT_DIR + TEST_DIR;
try {
- loadTestConfiguration(getTestConfiguration(testName));
- fullDMLScriptName = HOME + testName + ".dml";
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[] {"-stats", "-exec",
"singlenode", "-nvargs", "dirtyData="+DIRTY,
- "metaData="+META, "primitives="+PRIMITIVES,
"parameters="+PARAM, "topk="+ topk, "rv="+ resources,
- "sample="+sample, "O="+output("O")};
+ "metaData="+META, "primitives="+PRIMITIVES,
"parameters="+PARAM, "max_iter="+ max_iter,
+ "num_inst="+ num_inst, "num_exec="+ num_exec,
+ "dirtyScore="+dirtyScore, "output="+OUTPUT,
"O="+output("O")};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
@@ -72,6 +87,4 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
resetExecMode(modeOld);
}
}
-
-
}
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
new file mode 100644
index 0000000..688999d
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
+
+
+# read the inputs
+X = read($dirtyData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+dirtyScore = $dirtyScore
+
+max_iter = $max_iter
+num_inst = $num_inst
+num_exec = $num_exec
+
+
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
+
+# encode the categorical data
+if(sum(getMask) > 0)
+{
+ # always recode the label
+ index = vectorToCsv(getMask)
+ jspecR = "{ids:true, recode:["+index+"]}"
+ [eX, X_meta] = transformencode(target=X, spec=jspecR);
+ # change the schema to reflect the encoded values
+ getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+ getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+}
+# if no categorical value exist then just cast the frame into matrix
+else
+ eX = as.matrix(X)
+
+# extract the class label
+eY = eX[, ncol(eX)]
+eX = eX[, 1:ncol(eX) - 1]
+
+print("y classes \n"+toString(table(eY, 1)))
+getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
+getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
+
+metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0))
+
+logical = frame([
+ "1", "MVI", "0", "0", "0", "0",
+ # "1", "OTLR", "0", "0", "0", "0",
+ # "1", "CI", "0", "0", "0", "0",
+ # "2", "MVI", "CI", "0", "0", "0",
+ # "2", "MVI", "OTLR", "0", "0", "0",
+ # "2", "MVI", "SCALE", "0", "0", "0",
+ # "3", "MVI", "SCALE", "OTLR", "0", "0",
+ # "4", "MVI", "OTLR", "CI", "SCALE", "0",
+ # "4", "OTLR", "MVI", "CI", "SCALE", "0",
+ "5", "MVI", "OTLR", "MVI", "CI", "SCALE"
+ ], rows=2, cols=6)
+
+
+categories = frame(["MVI", "OTLR", "SCALE"], rows=1, cols=3)
+cmr = matrix("4 0.7 1", rows=1, cols=3)
+[bestLogical, score, T] = lg::enumerateLogical(X=eX, y=eY, cmr=cmr,
cat=categories, population=logical,
+ max_iter=max_iter, metaList = metaList,
evaluationFunc="evalClassification", evalFunHp=matrix("1 1e-3 1e-9 100",
rows=1, cols=4),
+ primitives=primitives, param=param , num_inst=num_inst, num_exec=num_exec,
isTailed=TRUE, verbose=TRUE)
+
+print("score of pipeline: "+toString(score)+" in "+(T/60000)+" mins")
+print("bestLogical "+toString(bestLogical))
+
+result = dirtyScore < score
+print("result satisfied ------------"+result)
+
+write(result , $O)
+
+
+
+# UDF for evaluation
+# choice of parameters provided by API, X, Y, clone_X, evalFunHp
(hyper-param), trainML (boolean for optimizing hp internally or passed by
externally )
+evalClassification = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xorig, List[Unknown] metaList,
+ Matrix[Double] evalFunHp, Integer trainML=0)
+
+return(Matrix[Double] output)
+{
+ cv = 2
+ mask = as.matrix(metaList['mask'])
+ print("min and max of y in eval: "+min(Y)+" "+max(Y))
+ if(max(Y) == min(Y)) {
+ print("Y contains only one class")
+ accuracy = as.double(0)
+ }
+ else {
+ if(trainML == 1)
+ {
+ # do the gridsearch for hyper-parameters
+ params = list("icpt", "reg", "tol", "maxii")
+ paramRanges = list(seq(0, 2, 1), 10^seq(1,-4), 10^seq(1,-6),
10^seq(1,3));
+
+ trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=-1, maxi=100, maxii=-1,
verbose=FALSE);
+ [B1, opt] = utils::topk_gridSearch(X=X, y=Y, train="multiLogReg",
predict="W", numB=ncol(X)+1, cv=TRUE, cvk=cv,
+ params=params, paramValues=paramRanges, trainArgs=trainArgs,
verbose=FALSE);
+ evalFunHp = as.matrix(opt)
+ }
+
+ # do the k = 3 cross validations
+ # evalFunHpM = as.matrix(evalFunHp)
+ [accuracyMatrix] = crossV(X, Y, cv, evalFunHp, FALSE)
+ accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+ score = mean(accuracyMatrix)
+ print(cv +" validation accuracy "+score)
+ }
+ output = cbind(as.matrix(score), evalFunHp)
+}
+
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of
the
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
+
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] MLhp, Boolean isWeighted)
+return (Matrix[Double] accuracyMatrix)
+{
+ accuracyMatrix = matrix(0, k, 1)
+ dataList = list()
+ testL = list()
+ data = order(target = cbind(y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
+ classes = table(data[, 1], 1)
+ ins_per_fold = classes/k
+ start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
+ fold_idxes = cbind(start_fold, ins_per_fold)
+
+ start_i = 0; end_i = 0; idx_fold = 1;;
+ for(i in 1:k)
+ {
+ fold_i = matrix(0, 0, ncol(data))
+ start=0; end=0;
+ for(j in 1:nrow(classes))
+ {
+ idx = as.scalar(classes[j, 1])
+ start = end + 1;
+ end = end + idx
+ class_j = data[start:end, ]
+ start_i = as.scalar(fold_idxes[j, 1]);
+ end_i = as.scalar(fold_idxes[j, 2])
+ fold_i = rbind(fold_i, class_j[start_i:end_i, ])
+ }
+ dataList = append(dataList, fold_i)
+ fold_idxes[, 1] = fold_idxes[, 2] + 1
+ fold_idxes[, 2] += ins_per_fold
+ }
+
+ for(i in seq(1,k))
+ {
+ [trainList, hold_out] = remove(dataList, i)
+ trainset = rbind(trainList)
+ testset = as.matrix(hold_out)
+ trainX = trainset[, 2:ncol(trainset)]
+ trainy = trainset[, 1]
+ testX = testset[, 2:ncol(testset)]
+ testy = testset[, 1]
+ beta = multiLogReg(X=trainX, Y=trainy, icpt=as.scalar(MLhp[1,1]),
reg=as.scalar(MLhp[1,2]), tol=as.scalar(MLhp[1,3]),
+ maxi=as.scalar(MLhp[1,4]), maxii=50, verbose=FALSE);
+ [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
+ accuracy = getAccuracy(testy, yhat, isWeighted)
+ accuracyMatrix[i] = accuracy
+ }
+}