This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 10c2165e13 [SYSTEMDS-3376] Adding apply_pipeline() builtin for
cleaning pipelines API - fit_pipeline() executes the best pipeline on train
and test data while apply_pipeline() transforms the test data using the
internal states of best pipeline without re-executing the best pipeline and
keeping the train data around.
10c2165e13 is described below
commit 10c2165e13faccf5ddf398d8d0ecabc953e583ab
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu May 12 13:54:25 2022 +0200
[SYSTEMDS-3376] Adding apply_pipeline() builtin for cleaning pipelines API
- fit_pipeline() executes the best pipeline on train and test data
while apply_pipeline() transforms the test data using the internal
states of
best pipeline without re-executing the best pipeline and keeping the
train data around.
---
scripts/builtin/apply_pipeline.dml | 213 +++++++++++++++++++++
scripts/builtin/bandit.dml | 5 +-
scripts/builtin/executePipeline.dml | 16 +-
scripts/builtin/fit_pipeline.dml | 26 +--
scripts/builtin/frameSort.dml | 3 +-
scripts/builtin/topk_cleaning.dml | 63 +++---
scripts/pipelines/scripts/enumerateLogical.dml | 50 ++++-
scripts/pipelines/scripts/utils.dml | 3 +-
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../instructions/cp/VariableCPInstruction.java | 7 +-
.../BuiltinTopkCleaningClassificationTest.java | 4 +-
.../functions/pipelines/fit_pipelineTest.dml | 17 +-
.../intermediates/classification/applyFunc.csv | 6 +-
.../intermediates/classification/bestAcc.csv | 6 +-
.../intermediates/classification/dirtyScore.csv | 2 +-
.../pipelines/intermediates/classification/hp.csv | 6 +-
.../pipelines/intermediates/classification/pip.csv | 6 +-
.../intermediates/regression/applyFunc.csv | 10 +-
.../pipelines/topkcleaningClassificationTest.dml | 5 +-
.../pipelines/topkcleaningRegressionTest.dml | 2 +-
20 files changed, 356 insertions(+), 95 deletions(-)
diff --git a/scripts/builtin/apply_pipeline.dml
b/scripts/builtin/apply_pipeline.dml
new file mode 100644
index 0000000000..d7cee4da3d
--- /dev/null
+++ b/scripts/builtin/apply_pipeline.dml
@@ -0,0 +1,213 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script will read the dirty and clean data, then it will apply the best
pipeline on the dirty data
+# and then classify both cleaned datasets and check if the cleaned dataset
is performing the same as the original dataset
+# in terms of classification accuracy
+
+# INPUT PARAMETERS:
+#
----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#
----------------------------------------------------------------------------------------------------------------------
+# trainData Frame[Unknown] ---
+# testData Frame[Unknown] ---
+# metaData Frame[Unknown] as.frame("NULL")
+# lp Frame[Unknown] ---
+# pip Frame[Unknown] ---
+# hp Frame[Unknown] ---
+# evaluationFunc String ---
+# evalFunHp Matrix[Double] ---
+# isLastLabel Boolean TRUE
+# correctTypos Boolean FALSE
+#
+#
----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+#
----------------------------------------------------------------------------------------------------------------------
+# scores Matrix[Double] ---
+#
----------------------------------------------------------------------------------------------------------------------
+
+
+source("scripts/builtin/topk_cleaning.dml") as topk;
+
+s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData =
as.frame("NULL"), Frame[Unknown] pip,
+ Frame[Unknown] applyFunc, Matrix[Double] hp, Boolean isLastLabel =
TRUE,List[Unknown] exState, List[Unknown] iState, Boolean correctTypos=FALSE)
+ return (Matrix[Double] eXtest)
+{
+ no_of_flag_vars = 5
+ [schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData)
+ pip = removeEmpty(target=pip, margin="cols")
+ applyFunc = removeEmpty(target=applyFunc, margin="cols")
+ metaList = list(mask=mask, schema=schema, fd=fdMask,
applyFunc=as.frame("NULL"))
+ ctx = list(prefix="----"); #TODO include seed
+ # separate the label
+ [Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
+
+ # always recode the label
+ if(maskY == 1) {
+ M = as.frame(exState[1])
+ eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}",
meta=M);
+ }
+ else
+ {
+ eYtest = as.matrix(Ytest)
+ }
+ # # # when the evaluation function is called first we also compute and
keep hyperparams of target application
+ ctx = list(prefix="apply Pipeline")
+
+ [Xtest, Xt] = topk::runStringPipeline(Xtest, Xtest, schema, mask, FALSE,
correctTypos, ctx)
+
+ # # # if mask has 1s then there are categorical features
+ M = as.frame(exState[2])
+ index = vectorToCsv(mask)
+ jspecR = "{ids:true, recode:["+index+"]}"
+ eXtest = transformapply(target=Xtest, spec=jspecR, meta=M);
+ metaList["applyFunc"] = applyFunc
+
+ no_of_param = as.scalar(hp[1, 1]) + 1
+ hp_width= hp[1, 2:no_of_param]
+ hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
+ pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
+ for(i in 1:length(iState)) {
+ op = as.scalar(pip[1,i])
+ XtestClone = eXtest
+ applyOp = toString(as.scalar(applyFunc[1,i]))
+ dataFlag = as.scalar(hp_matrix[i, ncol(hp_matrix)])
+ [iState, L] = remove(iState, 1)
+ [eXtest, executeFlag] = getDataFromFlag(eXtest, mask, dataFlag)
+ L2 = list(eXtest)
+ L = as.list(L)
+ for(k in 1:length(L)) {
+ L2 = append(L2, L[k])
+ }
+ if(executeFlag == 1 & applyOp != "NA") {
+ eXtest = eval(applyOp, L2);
+ eXtest = confirmDataFromMask (eXtest, XtestClone, mask, dataFlag)
+ eXtest = confirmMetaFromMask (eXtest, mask)
+ }
+ else {
+ print("not applying "+op+" executeFlag = 0")
+ }
+ }
+
+}
+
+
+getDataFromFlag = function(Matrix[Double] X, Matrix[Double] mask, Integer
dataFlag)
+return(Matrix[Double] X,Integer executeFlag)
+{
+ executeFlag = 1
+ if(dataFlag == 0)
+ {
+ if(sum(mask) == ncol(mask))
+ executeFlag = 0
+ else {
+ # take numerics out and remove categorical
+ X = removeEmpty(target=X, margin = "cols", select = (mask == 0))
+ }
+ }
+ else if(dataFlag == 1)
+ {
+ if(sum(mask) == 0)
+ executeFlag = 0
+ else {
+ # take categorical out and remove numerics
+ X = removeEmpty(target=X, margin = "cols", select = mask)
+ }
+ }
+ else X = X
+}
+
+confirmMetaFromMask = function(Matrix[Double] X, Matrix[Double] mask)
+return (Matrix[Double] X)
+{
+ if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
+ {
+ # get the max + 1 for nan replacement
+ nanMask = is.na(X)
+ # replace nan
+ X = replace(target = X, pattern = NaN, replacement = 9999)
+ # take categorical out
+ cat = removeEmpty(target=X, margin="cols", select = mask)
+ # round categorical (if there is any floating point)
+ cat = round(cat)
+ less_than_1_mask = cat < 1
+ less_than_1 = less_than_1_mask * 9999
+ cat = (cat * (less_than_1_mask == 0)) + less_than_1
+ # reconstruct original X
+ X = X * (mask == 0)
+ q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
+ select=t(mask)), ncol(cat), ncol(X))
+ X = (cat %*% q) + X
+
+ # put nan back
+ nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
+ X = X + nanMask
+ }
+}
+
+
+confirmDataFromMask = function(Matrix[Double] nX, Matrix[Double] originalX,
Matrix[Double] mask, Integer dataFlag)
+return (Matrix[Double] X)
+{
+
+ if(dataFlag == 0 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
+ {
+ maxDummy = max(replace(target=nX, pattern=NaN, replacement=0)) + 1
+ nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
+ # X without numerics
+ Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
+ nanMask = is.na(Xcat)
+ Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111)
+
+ # reconstruct the original matrix
+ p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
+ select=t(mask==0)), ncol(nX), ncol(originalX))
+ q = table(seq(1, ncol(Xcat)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
+ select=t(mask)), ncol(Xcat), ncol(originalX))
+ X = (nX %*% p) + (Xcat %*% q)
+
+ X = replace(target = X, pattern = maxDummy, replacement = NaN)
+ X = replace(target = X, pattern = -1111, replacement = NaN)
+ }
+ else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
+ {
+ maxDummy = max(replace(target=nX, pattern=NaN, replacement=0)) + 1
+ nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
+ # X without categorical
+ Xnum = removeEmpty(target=originalX, margin="cols", select=(mask==0))
+ nanMask = is.na(Xnum)
+ Xnum = replace(target = Xnum, pattern = NaN, replacement = -1111)
+ # reconstruct the original matrix
+ p = table(seq(1, ncol(Xnum)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
+ select=t(mask==0)), ncol(Xnum), ncol(originalX))
+ q = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
+ select=t(mask)), ncol(nX), ncol(originalX))
+ X = (nX %*% q) + (Xnum %*% p)
+ X = replace(target = X, pattern = maxDummy, replacement = NaN)
+ X = replace(target = X, pattern = -1111, replacement = NaN)
+
+ }
+ else X = nX
+
+}
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 504b5bb53f..fa1ff1137d 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,7 +53,7 @@
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp,
Matrix[Double] lpHp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k
= 3,
- Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref
= 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE,
String output="")
+ Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref
= 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE)
# return(Boolean perf)
return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,
Matrix[Double] bestAccuracy, Frame[String] applyFunc)
{
@@ -290,7 +290,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i = 1, Matrix[Do
hp = hp[, 2:totalVals]
applyFunctions = allApplyFunctions[i]
no_of_res = nrow(hp)
- # print("PIPELINE EXECUTION START ... "+toString(op))
+ print("PIPELINE EXECUTION START ... "+toString(op))
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
metaList2 = metaList; #ensure metaList is no result var
@@ -564,6 +564,7 @@ return (Double accuracy, Matrix[Double] evalFunHp,
Matrix[Double] hpForPruning,
allChanges = min(allChanges)
changesByOp = colMaxs(cvChanges)
accuracy = mean(accuracyMatrix)
+ print("mean: \n"+toString(accuracyMatrix))
print("cv accuracy: "+toString(accuracy))
}
diff --git a/scripts/builtin/executePipeline.dml
b/scripts/builtin/executePipeline.dml
index a606df9a46..9eae1a8a74 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,9 +57,9 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList,
Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test
= FALSE, Boolean verbose)
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest,
Matrix[Double] Ytest,
- Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double
changesAll)
+ Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double
changesAll, List[Unknown] internalStates)
{
-
+ internalStates = list()
mask=as.matrix(metaList['mask'])
FD = as.matrix(metaList['fd'])
applyFunc = as.frame(metaList['applyFunc'])
@@ -76,7 +76,7 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
for(i in 1:ncol(pipeline)) {
op = as.scalar(pipeline[1,i])
applyOp = toString(as.scalar(applyFunc[1,i]))
-
+ # print("op: "+op)
Xclone = Xtrain
XtestClone = Xtest
[hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask,
FD, hyperParameters[i], flagsCount, op)
@@ -86,11 +86,14 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
Xtrain = as.matrix(O)
if(applyOp != "NA") {
[Xtest, executeFlag] = applyDataFlag(Xtest, mask, dataFlag)
+ internalStates = append(internalStates, L)
L = append(L, list(X=Xtest));
Xtest = eval(applyOp, L);
- Xtest = confirmData(Xtest, XtestClone, mask, dataFlag, yFlag)
+ # print("L \n"+toString(L, rows=3))
+ Xtest = confirmData(Xtest, XtestClone, mask, dataFlag)
}
- Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag, yFlag)
+ else internalStates = append(internalStates, as.frame("NA"))
+ Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag)
# dataFlag 0 = only on numeric, 1 = on whole data
if(yFlag) {
@@ -98,6 +101,7 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
Ytrain = as.matrix(Y)
}
Xtrain = confirmMeta(Xtrain, mask)
+ Xtest = confirmMeta(Xtest, mask)
}
else {
print("not applying "+op+" executeFlag = 0")
@@ -225,7 +229,7 @@ return (Matrix[Double] X)
}
-confirmData = function(Matrix[Double] nX, Matrix[Double] originalX,
Matrix[Double] mask, Integer dataFlag, Integer yFlag)
+confirmData = function(Matrix[Double] nX, Matrix[Double] originalX,
Matrix[Double] mask, Integer dataFlag)
return (Matrix[Double] X)
{
diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml
index dae96a2a05..d67af62739 100644
--- a/scripts/builtin/fit_pipeline.dml
+++ b/scripts/builtin/fit_pipeline.dml
@@ -44,7 +44,7 @@
#
----------------------------------------------------------------------------------------------------------------------
# NAME TYPE MEANING
#
----------------------------------------------------------------------------------------------------------------------
-# result Matrix[Double] ---
+# scores Matrix[Double] ---
#
----------------------------------------------------------------------------------------------------------------------
source("scripts/pipelines/scripts/utils.dml") as utils;
@@ -54,10 +54,12 @@ source("scripts/builtin/bandit.dml") as bandit;
s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData,
Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, String
evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
-return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double]
cleanTest)
+return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double]
cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
+ externalState = list()
no_of_flag_vars = 5
[schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData)
+
pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask,
applyFunc=as.frame("NULL"))
@@ -70,6 +72,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain,
Matrix[Double] cleanTe
if(maskY == 1) {
[eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true,
recode:[1]}");
eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}",
meta=M);
+ externalState = append(externalState, M)
}
else
{
@@ -83,14 +86,11 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain,
Matrix[Double] cleanTe
[Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask,
FALSE, correctTypos, ctx)
# # # if mask has 1s then there are categorical features
- [eXtrain, eXtest] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
+ [eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE,
"recode")
+ externalState = append(externalState, M1)
# # # do the early dropping
- [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList,
FALSE)
+ # [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList,
FALSE)
metaList["applyFunc"] = applyFunc
- # construct the parameter list for best hyper-parameters if the oversampling
technique is part of
- # pipeline then take it out because oversampling is not applied on test
dataset
- # this condition is unnecessary here in this case because the input dataset
is balanced and
- # instead of diving the dataset into train/test I am doing cross validations
no_of_param = as.scalar(hp[1, 1]) + 1
hp_width= hp[1, 2:no_of_param]
@@ -98,7 +98,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain,
Matrix[Double] cleanTe
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
# # # now test accuracy
- [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(pipeline=pip,
Xtrain=eXtrain, Ytrain=eYtrain,
+ [eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] =
executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix,
flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
@@ -110,10 +110,10 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain,
Matrix[Double] cleanTe
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest,
Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])
- result = matrix(0, rows=1, cols=3)
- result[1, 1] = dirtyScore
- result[1, 2] = trainAccuracy
- result[1, 3] = testAccuracy
+ scores = matrix(0, rows=1, cols=3)
+ scores[1, 1] = dirtyScore
+ scores[1, 2] = trainAccuracy
+ scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
}
diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml
index 3cfeec7bca..cf447b4282 100644
--- a/scripts/builtin/frameSort.dml
+++ b/scripts/builtin/frameSort.dml
@@ -37,10 +37,9 @@
# f_odered Frame[String] sorted dataset by column 1 in
decreasing order
#
----------------------------------------------------------------------------------------------------------------------
-s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc
= TRUE )
+s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc
= TRUE)
return (Frame[String] f_odered)
{
- # idx[1,1] = 0 # to save accuracy column from encoding
index = vectorToCsv(mask)
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
diff --git a/scripts/builtin/topk_cleaning.dml
b/scripts/builtin/topk_cleaning.dml
index 33ed3ddf37..b322eaf6b1 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -58,8 +58,7 @@ source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest =
as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown]
primitives,
Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String
evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer
resource_val = 20, Integer max_iter = 10,
- Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean
cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean
correctTypos=FALSE, Boolean enablePruning = FALSE,
- String output)
+ Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean
cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean
correctTypos=FALSE, Boolean enablePruning = FALSE)
return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams,
Matrix[Double] topKScores,
Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc)
{
@@ -104,7 +103,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
print("---- feature transformations to numeric matrix");
[eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
# # # do the early dropping
- [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
+ # [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
@@ -148,7 +147,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
# stop("end of enumlp")
[topKPipelines, topKHyperParams, topKScores, applyFunc] =
bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest,
metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical,
lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed,
enablePruning = enablePruning, output=output, verbose=TRUE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed,
enablePruning = enablePruning, verbose=TRUE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines:
"+(t7-t6)/1e9+"s");
}
@@ -239,7 +238,7 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
}
recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest,
Matrix[Double] mask, Boolean cv, String code)
-return(Matrix[Double] eXtrain, Matrix[Double] eXtest)
+return(Matrix[Double] eXtrain, Matrix[Double] eXtest, Frame[Unknown] X_meta)
{
if(sum(mask) > 0)
{
@@ -257,31 +256,31 @@ return(Matrix[Double] eXtrain, Matrix[Double] eXtest)
}
}
-featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest,
List[Unknown] metaList, Boolean cv)
-return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList)
-{
- mask = as.matrix(metaList['mask'])
- fdMask = as.matrix(metaList['fd'])
- schema = as.frame(metaList['schema'])
- # # 1. if 90% of the column is empty
- # # # 2. if the column has only single value
- # # # have all unique values
- Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
- nullMask = is.na(eXtrain)
- singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) |
(colMaxs(Xtmp) == colMins(Xtmp))
- allmostEmpty = colSums(nullMask)
- allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9)
- allSum = singleValuesCol | allmostEmptyRatio
- if(sum(allSum) > 0) {
- eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum ==
0))
- if(!cv)
- eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum ==
0))
- mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
- fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0))
- schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0))
- metaList['mask'] = mask
- metaList['schema'] = schema
- metaList['fd'] = fdMask
- }
-}
+# featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest,
List[Unknown] metaList, Boolean cv)
+# return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList)
+# {
+ # mask = as.matrix(metaList['mask'])
+ # fdMask = as.matrix(metaList['fd'])
+ # schema = as.frame(metaList['schema'])
+ # # # 1. if 90% of the column is empty
+ # # # # 2. if the column has only single value
+ # # # # have all unique values
+ # Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
+ # nullMask = is.na(eXtrain)
+ # singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) |
(colMaxs(Xtmp) == colMins(Xtmp))
+ # allmostEmpty = colSums(nullMask)
+ # allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9)
+ # allSum = singleValuesCol | allmostEmptyRatio
+ # if(sum(allSum) > 0) {
+ # eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum ==
0))
+ # if(!cv)
+ # eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum ==
0))
+ # mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
+ # fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum ==
0))
+ # schema = removeEmpty(target=schema, margin="cols", select = (allSum ==
0))
+ # metaList['mask'] = mask
+ # metaList['schema'] = schema
+ # metaList['fd'] = fdMask
+ # }
+# }
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml
b/scripts/pipelines/scripts/enumerateLogical.dml
index d4f9891b27..a6d8459e06 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -87,7 +87,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp,
boolean converged, Do
pipelines = rbind(ref, pipelines)
population = pipelines
populationSize = nrow(pipelines)
- transitions = sample(3, (populationSize * max_iter), TRUE, seed)
+ transitions = sample(4, (populationSize * max_iter), TRUE, seed)
opToAdd = sample(nrow(allOps), (populationSize * max_iter), TRUE, seed)
# opToRemove = sample(max_iter, (populationSize * max_iter), TRUE, seed)
refChangesInternal = 0
@@ -117,18 +117,39 @@ return (Frame[Unknown] outputPip, Matrix[Double]
outputHp, boolean converged, Do
finalOutput = append(finalOutput, sortedPipelines)
finalOutputHp = append(finalOutputHp, sortedHp)
# # # if converged then stop otherwise generate new population
- children = frame(0, rows=populationSize, cols=ncol(sortedPipelines))
+ children = frame(0, rows=populationSize,
cols=ncol(sortedPipelines)+(ncol(sortedPipelines)/2))
sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)]
+ start = 1;
+ end = 0;
+ topk = frame(0, rows=round((populationSize/2)) * length(finalOutput) ,
cols=populationLength + 2)
+ for(i in 1:length(finalOutput))
+ {
+ pipFrame = as.frame(finalOutput[i])
+ end = end + nrow(pipFrame)
+ topk[start:end, 1:ncol(pipFrame)] = pipFrame
+ start = end + 1
+ }
+ sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1,
cols=ncol(topk) - 2))
+ topk = removeEmpty(target=topk, margin="rows")
+ topk = frameSort(topk, sort_mask, TRUE)
+ topk = topk[, 3:ncol(topk)]
# # randomly pick the pipelines for transitions
pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed)
if(!converged) {
parfor(i in 1:nrow(children), check=0) {
idxR = (nrow(children) * (iter - 1)) + i
idx = as.scalar(pipRand[i])
- top = removeEmpty(target=sortedPipelines[idx], margin="cols")
- tail = top[, ncol(top)]
- if(sum(mask) > 0)
+ top = removeEmpty(target=topk[idx], margin="cols")
+ # top = removeEmpty(target=sortedPipelines[idx], margin="cols")
+ idx2 = min(max(pipRand), idx + 1)
+ top2 = removeEmpty(target=topk[idx2], margin="cols")
+ # top2 = removeEmpty(target=sortedPipelines[idx2], margin="cols")
+ if(sum(mask) > 0) {
+ tail = top[, ncol(top)]
+ tail2 = top2[, ncol(top2)]
top = top[, 1:ncol(top) - 1]
+ top2 = top2[, 1:ncol(top2) - 1]
+ }
random = ifelse(ncol(top) <=2, 1, as.scalar(transitions[idxR]))
if(random == 1)
@@ -137,6 +158,8 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp,
boolean converged, Do
c1 = mutation(top, seed)
else if(random == 3)
c1 = removal(top, seed)
+ else if(random == 4)
+ c1 = crossover(top, top2, seed)
if(sum(mask) > 0)
c1 = cbind(c1, tail)
@@ -171,11 +194,9 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp,
boolean converged, Do
refChanges = as.double(as.scalar(outputPip[nrow(outputPip), 2]))
acc = outputPip[, 1]
+ print(toString(outputPip))
outputPip = outputPip[,3:ncol(outputPip)]
- print(toString(outputHp))
-
-
}
addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd)
@@ -214,6 +235,19 @@ return (Frame[Unknown] child)
}
}
+crossover = function(Frame[Unknown] p1, Frame[Unknown] p2, Integer seed)
+return(Frame[Unknown] child)
+{
+  # # randomly select the lengths to be appended
+ lp1 = as.scalar(sample(ncol(p1), 1, FALSE, seed))
+ lp2 = as.scalar(sample(ncol(p2), 1, FALSE, seed))
+ child = cbind(p1[, 1:lp1], p2[, lp2:ncol(p2)])
+ print("p1 "+toString(p1))
+ print("p2 "+toString(p2))
+ print("child "+toString(child))
+}
+
+
getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist,
Integer n, Integer minValue)
return (Frame[String] allOps, Frame[String] refSol) {
diff --git a/scripts/pipelines/scripts/utils.dml
b/scripts/pipelines/scripts/utils.dml
index 8f0a60dc5a..7fb95297df 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -59,9 +59,10 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY,
Double ratio, Boolean
return (Matrix[Double] sampledX, Matrix[Double] sampledY)
{
MIN_SAMPLE = 1000
- sampled = floor(nrow(eX) * ratio)
sampledX = eX
sampledY = eY
+ ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
+ sampled = floor(nrow(eX) * ratio)
if(sampled > MIN_SAMPLE & ratio != 1.0)
{
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index e42d7f8bad..5e9509696b 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -46,6 +46,7 @@ public enum Builtins {
ALS_DS("alsDS", true),
ALS_PREDICT("alsPredict", true),
ALS_TOPK_PREDICT("alsTopkPredict", true),
+ APPLY_PIPELINE("apply_pipeline", true),
ARIMA("arima", true),
ASIN("asin", false),
ATAN("atan", false),
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
index 08e8e3f741..eb4065ccfc 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
@@ -610,9 +610,10 @@ public class VariableCPInstruction extends CPInstruction
implements LineageTrace
case CastAsListVariable:
ListObject lobj = ec.getListObject(getInput1());
if( lobj.getLength() != 1 || !(lobj.getData(0)
instanceof ListObject) )
- throw new RuntimeException("as.list() expects a
list input with one nested list: "
- + "length(list)="+lobj.getLength()+",
dt(list[0])="+lobj.getData(0).getDataType() );
- ec.setVariable(output.getName(), lobj.getData(0));
+ ec.setVariable(output.getName(), lobj);
+// throw new RuntimeException("as.list() expects a
list input with one nested list: "
+// + "length(list)="+lobj.getLength()+",
dt(list[0])="+lobj.getData(0).getDataType() );
+ else ec.setVariable(output.getName(), lobj.getData(0));
break;
case CastAsDoubleVariable:
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index de880724a2..5ef02bef1f 100644
---
a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -48,13 +48,13 @@ public class BuiltinTopkCleaningClassificationTest extends
AutomatedTestBase {
@Ignore
public void testFindBestPipelineCompany() {
runtopkCleaning(DATA_DIR+ "company.csv", RESOURCE+
"meta/meta_company.csv", 1.0, 3,5,
- .0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
+ 5.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
}
@Test
public void testFindBestPipelineCensus() {
runtopkCleaning(DATA_DIR+ "dirty.csv", RESOURCE+
"meta/meta_census.csv", 1.0, 3,5,
- 27.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
+ 2.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
}
// this test is ignored due to it long running time in Git actions
diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
index 889b82c6e1..f0cb72656a 100644
--- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
+++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
@@ -59,18 +59,25 @@ trainData = F[1:split,]
testData = F[split+1:nrow(F),]
-result = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,],
hp[1,], "evalClassification", evalHp, TRUE, FALSE)
+print("pipeline: "+toString(pip[1]))
+[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData,
metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE,
FALSE)
+eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,],
TRUE, exState, iState, FALSE)
+
-header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
result = as.frame(result)
+resultBool = as.scalar(result[1, 3] > result[1, 1])
+eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
+tsX = replace(target=tsX, pattern=NaN, replacement=0)
+
+resApply = sum(eXtest - tsX[, 1:ncol(eXtest)]) == 0
+resultBool = resultBool & resApply
+write(resultBool, $6)
+header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
writeRes = rbind(header, result)
print(toString(writeRes))
-result = as.scalar(result[1, 3] > result[1, 1])
-write(result, $6)
-
# UDF for evaluation
# choice of parameters provided by API, X, Y, clone_X, evalFunHp
(hyper-param), trainML (boolean for optimizing hp internally or passed by
externally )
evalClassification = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index b11da3e9ee..457f972871 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-scaleApply,dummycodingApply,0,0,0
-NA,scaleApply,NA,dummycodingApply,0
-winsorizeApply,NA,scaleApply,dummycodingApply,0
+NA,dummycodingApply,0
+NA,dummycodingApply,0
+NA,dummycodingApply,0
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index 746303da87..50c70d1152 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-93.69369369369369
-93.69369369369369
-93.69369369369369
+73.73188405797102
+70.1086956521739
+68.29710144927536
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
index d70d1d1953..4e5b1a5042 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
@@ -1 +1 @@
-71.17117117117117
\ No newline at end of file
+61.050724637681164
\ No newline at end of file
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 0f59fbc7a5..643e5d3472 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-16.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,1.0,0.2,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.10554249238742949,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 0080afe1c1..86a68a13a4 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-scale,dummycoding,0,0,0
-underSampling,scale,underSampling,dummycoding,0
-winsorize,underSampling,scale,dummycoding,0
+underSampling,dummycoding,0
+underSampling,dummycoding,0
+underSampling,dummycoding,0
diff --git
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
index a0ecc4ac21..3ce56a2c04 100644
---
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
@@ -1,5 +1,5 @@
-miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,winsorizeApply,normalizeApply,scaleApply,0,0,0
-winsorizeApply,forward_fill,miceApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,normalizeApply,winsorizeApply,scaleApply,0,0,0
+winsorizeApply,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
+miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0
+miceApply,imputeByMeanApply,forward_fill,normalizeApply,scaleApply,0,0
+normalizeApply,miceApply,forward_fill,scaleApply,0,0,0
+normalizeApply,miceApply,forward_fill,scaleApply,0,0,0
diff --git
a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 3f4b7a0dc5..296165c029 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -56,9 +56,10 @@ if(nrow(metaInfo) < 2)
metaInfo = metaInfo[, 2:ncol(metaInfo)]
# # # split in train/test 70/30
-[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp,
applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData,
metaData=metaInfo, primitives=primitives, parameters=param, refSol =
frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
+[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp,
applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData,
metaData=metaInfo, primitives=primitives, parameters=param,
+ refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK,
resource_val=resources, enablePruning=TRUE,
- expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV,
cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
+ expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV,
cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE)
write(topKPipelines, output+"/pip.csv", format="csv")
write(topKHyperParams, output+"/hp.csv", format="csv")
diff --git
a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
index cdb4a155fa..4f2dbf3106 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
@@ -50,7 +50,7 @@ else {
#matrix("1 1e-6 1e-9 1000", rows=1, cols=4)
[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp,
applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData,
primitives=primitives, parameters=param, evaluationFunc=evalFunc,
evalFunHp=as.matrix(NaN),
- topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample,
isLastLabel=TRUE, correctTypos=FALSE, output=output)
+ topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample,
isLastLabel=TRUE, correctTypos=FALSE)
write(topKPipelines, output+"/pip.csv", format="csv")
write(topKHyperParams, output+"/hp.csv", format="csv")