This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6840bfe [SYSTEMDS-3299] Seed value in genetic Algorithm - This
commit introduces a seed parameter in the genetic algorithm to control the
random additions and transitions in pipelines.
6840bfe is described below
commit 6840bfec8e6d2cf1418cbfb6841dcf1979abe292
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu Feb 24 12:03:24 2022 +0100
[SYSTEMDS-3299] Seed value in genetic Algorithm
- This commit introduces a seed parameter in the genetic algorithm to control
the random additions and transitions in pipelines.
---
scripts/builtin/bandit.dml | 320 ++++++---------------
scripts/builtin/executePipeline.dml | 42 +--
scripts/builtin/mice.dml | 6 +-
scripts/builtin/topk_cleaning.dml | 48 ++--
scripts/pipelines/scripts/enumerateLogical.dml | 108 +++----
scripts/pipelines/scripts/utils.dml | 17 +-
.../apache/sysds/runtime/util/UtilFunctions.java | 8 +-
.../functions/pipelines/applyEvaluateTest.dml | 2 +-
.../intermediates/classification/applyFunc.csv | 6 +-
.../intermediates/classification/bestAcc.csv | 6 +-
.../intermediates/classification/evalHp.csv | 2 +-
.../pipelines/intermediates/classification/hp.csv | 6 +-
.../pipelines/intermediates/classification/pip.csv | 6 +-
.../intermediates/regression/applyFunc.csv | 10 +-
.../functions/pipelines/topkLogicalTest.dml | 2 +-
.../pipelines/topkcleaningClassificationTest.dml | 2 +-
16 files changed, 222 insertions(+), 369 deletions(-)
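For orientation, both new knobs surface at the top-level topk_cleaning call. Below is a minimal sketch of an invocation with a fixed seed; the data frames, evaluation function name, and output path are hypothetical, and only parameters visible in this diff are assumed:

  # hypothetical inputs: trainData/testData/metaInfo/primitives/param already loaded
  result = topk_cleaning(dataTrain=trainData, dataTest=testData,
    metaData=metaInfo, primitives=primitives, parameters=param,
    evaluationFunc="evalClassification", evalFunHp=as.matrix(NaN),
    topK=5, resource_val=20, max_iter=10, sample=1.0,
    expectedIncrease=1.0, seed=42,   # fixed seed -> reproducible additions/transitions
    cv=TRUE, cvk=3, isLastLabel=TRUE, correctTypos=FALSE,
    enablePruning=FALSE, output="output/exp1")

With the same seed, repeated runs draw the same random additions and transitions during logical-pipeline enumeration; the default seed=-1 presumably retains the previous non-deterministic sampling.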
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index e24e851..06fa57a 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,13 +53,14 @@
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp,
Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3,
- Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Boolean
verbose = TRUE, String output="")
+ Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref
= 0, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="")
return(Boolean perf)
# return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,
Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter)
{
print("Starting optimizer")
- NUM_FEATURES = 14
+ totalPruneCount = 0
FLAG_VARIABLE = 5
+ pipelines_executed = 0
HYPERPARAM_LENGTH = (ncol(lp) * FLAG_VARIABLE * 3) + 1 ## num of col in
logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
bestPipeline = frame("", rows=1, cols=1)
bestHyperparams = as.matrix(0)
@@ -67,19 +68,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# initialize bandit variables
# variable names follow publication where algorithm is introduced
eta = 2 # the halving ratio is fixed to 2
- s_max = floor(log(R,eta));
- B = (s_max + 1) * R;
-
+ s_max = floor(log(R,eta)) - 1;
# initialize output variables
hparam = matrix(0, rows=k*(s_max+1), cols=HYPERPARAM_LENGTH)
pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
- startOut=0; endOut=0;
- feaFrameOuter = frame(data=["#MissingValues", "MinVla", "MaxVal",
"AverageMin", "AverageMax",
- "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers",
"#OHEfeatures", "#Classes",
- "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in
ms", "CV time in ms"],
- rows = 1, cols = NUM_FEATURES + 4 )
+ endIdx = matrix(k, rows=(s_max+1), cols=1)
+ endIdx = cumsum(endIdx)
+ startIdx = (endIdx - k) + 1
- for(s in s_max:0) { # TODO convert to parfor
+ n = ifelse(s_max >= nrow(lp), nrow(lp), ceil(nrow(lp)/(s_max + 1)));
+
+ for(s in s_max:0, check=0) { # TODO convert to parfor
# result variables
bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH)
@@ -87,21 +86,14 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
start=1; end=0;
# # compute the number of initial pipelines n
- n = ceil(floor(B/R/(s+1)) * eta^s);
r = R * eta^(-s);
- # get the physical pipelines, the pipelines, pipelines are recoded
- # [configurations, n] = get_physical_configurations(lp, n, primitives)
- n = min(nrow(lp), n)
- configurations = lp[1:n]
+ configurations = lp[1:(min(n, nrow(lp)))]
# append configuration keys for extracting the pipeline later on
id = seq(1, nrow(configurations))
configurations = cbind(as.frame(id), configurations)
# save the original configuration as a lookup table
lookup = configurations
-
- if(verbose)
- print("n "+ n +"\nR "+ R +"\ns_max "+ s_max +"\nB "+ B +"\nn "+ n +"\nr
"+ r)
-
+
for(i in 0:s) {
# successive halving
n_i = min(max(as.integer(floor(n * eta^(-i))), 1), nrow(configurations));
@@ -112,10 +104,11 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
print("no of resources --------------"+r_i)
print("iteration ---------------------"+i+" out of "+s)
}
-
configurations = configurations[1:n_i, ]
- [outPip,outHp, feaFrameOuter] =
run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train,
Xtest=X_test, Ytest=Y_test,
- metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp,
param=param, featureFrameOuter=feaFrameOuter, cv=cv, cvk=cvk)
+ pipelines_executed = pipelines_executed + (n_i * r_i)
+ [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations,
r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
+ evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param,
cv=cv, cvk=cvk, ref=ref, enablePruning=enablePruning)
+ totalPruneCount = totalPruneCount + pruneCount
# sort the pipelines by order of accuracy decreasing
a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -123,7 +116,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# maintain the brackets results
end = end + rowIndex
- bracket_pipel[start:end, ] = a[1:rowIndex,]
+ bracket_pipel[start:end, 1:ncol(a)] = a[1:rowIndex,]
bracket_hp[start:end, 1:ncol(b)] = b[1:rowIndex,]
start = end + 1
@@ -133,20 +126,19 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
configurations = frameSort(cbind(avergae_perf, configurations),
cbind(as.matrix(0), sortMask), TRUE)
configurations = configurations[, 2:ncol(configurations)]
}
+ if(n < nrow(lp))
+ lp = lp[n+1:nrow(lp),]
bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows")
bracket_hp = removeEmpty(target=bracket_hp, margin="rows")
# keep the best k results for each bracket
[bracket_bestPipeline, bracket_bestHyperparams] =
extractBracketWinners(bracket_pipel, bracket_hp, k, lookup)
# optimize by the features
- startOut = endOut + 1
- endOut = endOut + nrow(bracket_bestPipeline)
+ startOut = as.scalar(startIdx[s+1])
+ endOut = min(as.scalar(endIdx[s+1]), (startOut +
nrow(bracket_bestPipeline) - 1))
pipeline[startOut:endOut, ] = bracket_bestPipeline
-
hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] =
bracket_bestHyperparams
}
-
[bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam,
baseLineScore, k)
-
bestAccuracy = as.matrix(bestPipeline[,1])
bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)]
bestPipeline = bestPipeline[, 2:ncol(bestPipeline)]
@@ -166,11 +158,13 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
print("topk scores: \n"+toString(bestAccuracy))
print("evalHp: \n"+toString(evalFunHp))
print("performance improvement "+ imp)
+ print("total physical pipelines to be executed: "+pipelines_executed)
+ print("prune count: "+totalPruneCount)
+ print("actual executed pipelines: "+(pipelines_executed - totalPruneCount))
}
write(bestPipeline, output+"/pip.csv", format="csv")
write(bestHyperparams, output+"/hp.csv", format="csv")
write(bestAccuracy, output+"/bestAcc.csv", format="csv")
- write(feaFrameOuter, output+"/featureFrame.csv", format="csv")
write(baseLineScore, output+"/dirtyScore.csv", format="csv")
write(evalFunHp, output+"/evalHp.csv", format="csv")
write(applyFunc, output+"/applyFunc.csv", format="csv")
@@ -179,7 +173,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# this method will extract the physical pipelines for a given logical pipelines
get_physical_configurations = function(Frame[String] logical, Scalar[int]
numConfigs = 10,
Frame[Unknown] primitives)
- return(Frame[String] physical, Double min)
+ return(Frame[String] physical)
{
# load the primitives
physical = as.frame("NaN")
@@ -193,7 +187,7 @@ get_physical_configurations = function(Frame[String]
logical, Scalar[int] numCon
dim = primitives[, 8]
operator = frame(0, rows=nrow(primitives), cols=ncol(logical)) # combine
all logical primitives
- parfor(j in 1:ncol(logical))
+ parfor(j in 1:ncol(logical), check = 0)
{
# extract the physical primitives
if(as.scalar(logical[1,j]) == "ED")
@@ -208,60 +202,23 @@ get_physical_configurations = function(Frame[String]
logical, Scalar[int] numCon
operator[, j] = ci;
else if(as.scalar(logical[1,j]) == "DIM")
operator[, j] = dim;
- else if(as.scalar(logical[1,j]) == "DUMMY")
- operator[, j] = dummy;
+ else if(as.scalar(logical[1,j]) == "DUMMY")
+ operator[, j] = dummy;
else if(as.scalar(logical[1,j]) == "SCALE")
operator[, j] = scale;
else print("invalid operation "+as.scalar(logical[1,j]))
}
-
- idx = matrix(1, rows=1, cols=ncol(logical))
- # get the indexes of columns for recode transformation
- index = vectorToCsv(idx)
- # recode logical pipelines for easy handling
- jspecR = "{ids:true, recode:["+index+"]}";
- [X, M] = transformencode(target=operator, spec=jspecR);
- X = replace(target= X, pattern = NaN, replacement = 0)
-
- paramLens = matrix(0, ncol(logical), 1);
- parfor( j in 1:ncol(logical)) {
- vect = removeEmpty(target = X[,j], margin = "rows");
- paramLens[j,1] = nrow(vect);
- }
- min = prod(paramLens)
- numConfigs = ifelse(numConfigs == 0, min, numConfigs)
- sample = ifelse(min > numConfigs, TRUE, FALSE)
- paramVals = matrix(0, ncol(logical), max(paramLens));
- parfor( j in 1:ncol(logical) ) {
- vector = removeEmpty(target = X[,j], margin = "rows");
- paramVals[j,1:nrow(vector)] = t(vector);
- }
- cumLens = rev(cumprod(rev(paramLens))/rev(paramLens));
- XI = table(seq(1,nrow(cumLens)), sample(nrow(cumLens),nrow(cumLens)))
- cumLens = XI %*% cumLens
- # materialize hyper-parameter combinations
- HP = matrix(0, min(numConfigs, min), ncol(logical));
- pip = seq(1,nrow(HP))
- if(sample)
- pip = sample(nrow(HP),numConfigs)
- XI = table(seq(1,nrow(pip)), sample(nrow(pip),nrow(pip)))
- pip = XI %*% pip
-
- for( i in 1:nrow(HP)) {
- for( j in 1:ncol(logical) ) {
- HP[i,j] =
paramVals[j,as.scalar((as.scalar(pip[i,1])/cumLens[j,1])%%paramLens[j,1]+1)];
- }
- }
-
- physical = transformdecode(target=HP, spec=jspecR, meta=M);
+ physical = operator
}
# this method will call the execute pipelines with their hyper-parameters
run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i,
Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String
evaluationFunc, Matrix[Double] evalFunHp,
- Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv, Integer
cvk = 2, Boolean default = FALSE)
- return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam,
Frame[Unknown] featureFrameOuter)
+ Frame[Unknown] param, Boolean cv, Integer cvk = 2, Double ref = 0, Boolean
enablePruning = FALSE, Boolean default = FALSE)
+ return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam,
Integer pruneCount, Matrix[Double] changesByPipMatrix)
{
+ changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1)
+ pruneCount = 0
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)-1) * 5 * 3)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
@@ -274,15 +231,13 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
id = as.matrix(ph_pip[, 1])
ph_pip = ph_pip[, 2:ncol(ph_pip)]
evalFunOutput = as.matrix(0)
- feaVec = gatherStats(X, Y, as.matrix(metaList['mask']))
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
op = removeEmpty(target=ph_pip[i], margin="cols")
- [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op,
param, r_i, default)
- if(ncol(featureFrameOuter) > 1)
- feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
+ print("PIPELINE EXECUTION START ... "+toString(op))
+ [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op,
param, r_i, default, enablePruning)
pip_toString = pipToString(op)
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
@@ -297,25 +252,28 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
indexes = table(indexes, 1, 1, nrow(hp), 1)
hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
# # check if the pruning could be applied to avoid unnecessary executions
- executionSingnal = pruningSignal(op, hp_matrix, hpForPruning,
changesByOp)
-
+ pruneSignal = pruningSignal(op, hp_matrix, hpForPruning, changesByOp)
+ executionSingnal = ifelse(enablePruning, pruneSignal, TRUE)
+ ref = ifelse(enablePruning, ref, 0)
if(executionSingnal)
{
t1 = time()
-
if(cv)
{
pipList = list(ph = op, hp = hp_matrix, flags = no_of_flag_vars)
- [accuracy, evalHp, hpForPruning, changesByOp] = crossV(X=X, y=Y,
cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList,
hpForPruning=hpForPruning,
- changesByOp=changesByOp, evalFunc=evaluationFunc)
+ [accuracy, evalHp, hpForPruning, changesByOp, changesByPip] =
crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp,
+ pipList=pipList, metaList=metaList, hpForPruning=hpForPruning,
+ changesByOp=changesByOp, evalFunc=evaluationFunc, ref=ref)
}
else
{
- [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] =
executePipeline(pipeline=op,
+ [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp,
changesByPip] = executePipeline(pipeline=op,
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
hyperParameters=hp_matrix, hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE,
verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
+ else if(changesByPip < ref)
+ print("prunning alert 2: no training the model due to minimum
changes")
else
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain,
Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
accuracy = as.scalar(evalFunOutput[1, 1])
@@ -332,31 +290,26 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
Y = clone_Y
Xtest = clone_Xtest
Ytest = clone_Ytest
- if(ncol(featureFrameOuter) > 1) {
- feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
- feaFrame[r, (ncol(feaVec)+1)] = pip_toString
- feaFrame[r, (ncol(feaVec)+2)] = accuracy
- feaFrame[r, (ncol(feaVec)+3)] = accT #Tr
- feaFrame[r, (ncol(feaVec)+4)] = accT
- }
}
- else print("prunningAlert: not executing instance : "+r)
+ else
+ {
+ pruneCount = pruneCount + 1
+ print("prunningAlert: not executing instance : "+r+"
pruneCount"+pruneCount)
+ }
+ changesByPipMatrix[index] = changesByPip
index = index + 1
}
-
X = clone_X
Y = clone_Y
Xtest = clone_Xtest
Ytest = clone_Ytest
- if(ncol(featureFrameOuter) > 1)
- featureFrameOuter = rbind(featureFrameOuter, feaFrame)
}
output_hyperparam = removeEmpty(target=cbind(output_accuracy, output_hp),
margin="rows")
output_operator = removeEmpty(target=cbind(output_accuracy,
output_pipelines), margin="rows")
}
# extract the hyper-parameters for pipelines
-getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res, Boolean default)
+getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res, Boolean default, Boolean enablePruning)
return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer
no_of_res, Integer NUM_META_FLAGS)
{
@@ -384,6 +337,7 @@ getHyperparam = function(Frame[Unknown] pipeline,
Frame[Unknown] hpList, Intege
paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res,
cols=max(paramCount)+NUM_META_FLAGS+1)
for(i in 1:ncol(pipeline)) {
+ op = as.scalar(pipeline[1, i])
index = as.scalar(indexes[i])
no_of_param = as.integer(as.scalar(paramCount[i]))
# extract hasY and verbose flags
@@ -434,6 +388,8 @@ getHyperparam = function(Frame[Unknown] pipeline,
Frame[Unknown] hpList, Intege
typeIdx = typeIdx + 1
}
}
+ if((op == "outlierBySd" | op == "outlierByIQR" | op == "imputeByFd") &
no_of_res > 1 & enablePruning)
+ OpParam = order(target=OpParam, by = 1, decreasing = FALSE,
index.return = FALSE)
# hyper-parameter vector contains no. of hp, values of hp, and flag
values
OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam,
attachMask,
attachFD, attachY, isVerbose, dataFlag)
@@ -454,50 +410,8 @@ extractTopK = function(Frame[Unknown] pipeline,
Matrix[Double] hyperparam,
Double baseLineScore, Integer k)
return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
{
- # # # take out the accuracy from pipelines
- pipeline = pipeline[, 2:ncol(pipeline)]
- idx = vectorToCsv(seq(1, ncol(pipeline)))
- jspecDC = "{ids:true, recode:["+idx+"]}";
- # OHE of categorical features
- [dpipeline, dM] = transformencode(target=pipeline, spec=jspecDC);
- # bind the pipelines and hyper-parameters into one matrix
- forDedup = cbind(dpipeline, hyperparam)
- # perform the similarity based deduplication
- dup = mdedup(cbind(pipeline, as.frame(hyperparam)), matrix(seq(2,
ncol(forDedup)), 1,
- ncol(forDedup)-1), matrix(1,1,ncol(forDedup)-1), as.matrix(1),
as.matrix(1), FALSE)
-
- if(sum(dup) > 0)
- {
- # take out the unique tuples
- uniqueTuples = removeEmpty(target=forDedup, margin="rows", select=(dup==0))
- # remove the zero rows, identifiers of unique records
- dup = removeEmpty(target=dup, margin="rows")
- # get the counts of duplicate tuples with their tuple id
- countDist = table(dup, 1) > 0
- countDist = countDist * seq(1, nrow(countDist))
- countsVal = removeEmpty(target=countDist, margin="rows")
- indexes = table(seq(1, nrow(countsVal)),countsVal,1,nrow(countsVal),
cols=nrow(forDedup))
-
- # for each duplicate record just take the one reocrd and strip the others
- deduplicates = indexes %*% forDedup
-
- # combine the deduplicated tuples and unique tuples again
- forDedup = rbind(uniqueTuples, deduplicates)
- }
-
- # decode the pipelines
- decoded = transformdecode(target=forDedup[, 1:ncol(pipeline)], meta=dM,
spec=jspecDC)
- # separate the pipelines and hyper-parameters
- pipeline = decoded[, 1:ncol(pipeline)]
- hyperparam = forDedup[, ncol(pipeline)+1:ncol(forDedup)]
-
- # sort results
- # # add accuracy back
- pipeline = cbind(as.frame(forDedup[, ncol(pipeline)+1]), pipeline)
hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE,
index.return=FALSE)
pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1,
cols=ncol(pipeline) - 1)), TRUE)
-
-
# remove the row with accuracy less than test accuracy
mask = (hyperparam[, 1] < baseLineScore) == 0
hyperparam = removeEmpty(target = hyperparam, margin = "rows", select = mask)
@@ -522,9 +436,8 @@ extractBracketWinners = function(Matrix[Double] pipeline,
Matrix[Double] hyperpa
bestPipeline = frame(data="|", rows=nrow(pipeline), cols=ncol(conf))
parfor(i in 1: nrow(pipeline)) {
index = as.scalar(pipeline[i, 3])
- out = conf[index, 2:ncol(conf)]
+ bestPipeline[i] = conf[index]
bestPipeline[i, 1] = as.frame(pipeline[i, 1])
- bestPipeline[i, 2:ncol(bestPipeline)] = out
}
}
@@ -539,90 +452,6 @@ return (Frame[Unknown] maxperconf)
maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
}
-
-###############################################################################################
-# The function will collect the features like statistics and pipelines and
accuracy
-# so that they could be used for training a model and predicting pipelines
without enumeration
-###############################################################################################
-gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask)
-return (Matrix[Double] features)
-{
-
- features = matrix(0, rows = 1, cols= 14)
- features[1, 1]= sum(is.na(X)) # number of missing values
- X = replace(target= X, pattern = NaN, replacement = 0)
- num = removeEmpty(target=X, margin="cols", select=(mask == 0))
- # get the stats
- features[1, 2] = min(num) # minimum value
- features[1, 3] = max(num)
- features[1, 4] = mean(colMins(num)) # average minimum value
- features[1, 5] = mean(colMaxs(num)) # average maximum value
- features[1, 6] = sum(mask) # number of categorical features
- features[1, 7] = sum(mask == 0) # number of numerical features
- features[1, 8] = mean(num) # mean value
- colSd = colSds(num)
- count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
- count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
- outliers = count3sdplus + count3sdminus
- features[1, 9] = outliers
- # OHE features
- OHE = sum(colMaxs(X) * mask)
- features[1, 10] = OHE
-
- if(nrow(Y) > 1 & min(Y) >= 1)
- {
- ctab = table(Y, 1)
- features[1, 11] = nrow(ctab) # number of classes
- minCat = min(ctab) / nrow(ctab)
- maxCat = max(ctab) / nrow(ctab)
- # class imabalance 1=YES, 0=NO
- features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
- }
- else
- {
- features[1, 11] = 0
- features[1, 12] = 0
- }
- features[1, 13] = nrow(X)
- features[1, 14] = ncol(X)
-
-}
-
-
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-compareValue = function(Matrix[double] dirtyX, Matrix[double] fixedX,
Matrix[Double] cleanX, Matrix[Double] mask)
-return (Double precision, Double T)
-{
- t1 = time()
- DEFAULT = 404
- mv = is.na(dirtyX)
- correctionsRequired = 0
- mv = is.na(fixedX)
- dirtyX = replace(target= dirtyX, pattern=NaN, replacement=DEFAULT)
- cleanX = replace(target= cleanX, pattern=NaN, replacement=DEFAULT)
- fixedX = replace(target= fixedX, pattern=NaN, replacement=DEFAULT)
- diffCleanDirty = sum((abs(cleanX - dirtyX) < 0.001) < 1) #sum(cleanX ==
dirtyX) #
- print("dirty != clean: "+diffCleanDirty)
- correctionsRequired = (abs(cleanX - dirtyX) < 0.001) < 1#dirtyX != cleanX
- print("corrections required: "+sum(correctionsRequired))
- correctionsMade = sum(dirtyX != fixedX)
- print("corrections made: "+correctionsMade)
- dim = nrow(dirtyX) * ncol(dirtyX)
- match = (abs(cleanX - fixedX) < 0.001) * correctionsRequired
- print("total matches "+sum(match))
- # print("total matches \n"+toString(match))
- precision = max(0.001, sum(match) / max(1, correctionsMade))
- T = floor((time() - t1) / 1e+6)
- print("Precision: "+toString(precision) + " in "+T+" ms")
-}
-
pipToString = function(Frame[String] F)
return (String s)
{
@@ -632,16 +461,20 @@ return (String s)
}
-
crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk,
Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList,
- Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp =
as.matrix(0), String evalFunc)
-return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double]
hpForPruning, Matrix[Double] changesByOp)
+ Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp =
as.matrix(0), String evalFunc, Double ref = 0)
+return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double]
hpForPruning, Matrix[Double] changesByOp, Double allChanges)
{
+
+ # # in the condition below we compute the hyper-parameters via the cv method on the train dataset
if(is.na(as.scalar(evalFunHp[1,1]))) {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
evalFunHp = forEvalHp[1, 2:ncol(forEvalHp)]
- }
+ }
+ changesByPip = 0
+ cvChanges = matrix(0, rows=cvk, cols=ncol(changesByOp))
accuracyMatrix = matrix(0, cvk, 1)
+ allChanges = matrix(0, cvk, 1)
#create empty lists
dataset_X = list(); #empty list
dataset_y = list();
@@ -655,28 +488,33 @@ return (Double accuracy, Matrix[Double] evalFunHp,
Matrix[Double] hpForPruning,
beta_list = list();
#keep one fold for testing in each iteration
- for (i in seq(1, cvk), check=0) {
+ for (i in seq(1, cvk)) {
[tmpX, testX] = remove(dataset_X, i);
[tmpy, testy] = remove(dataset_y, i);
trainX = rbind(tmpX);
trainy = rbind(tmpy);
testX = as.matrix(testX)
testy = as.matrix(testy)
- if(as.scalar(pipList['flags']) != 0)
+ if(as.scalar(pipList['flags']) != 0) # this flag is zero when CV is
called from the dirtyScore function, i.e., only accuracy calculation and no
pipeline execution
{
- [trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp] =
executePipeline(pipeline=as.frame(pipList['ph']),
+ [trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp,
changesByPip] = executePipeline(pipeline=as.frame(pipList['ph']),
Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy,
metaList=metaList, hyperParameters=as.matrix(pipList['hp']),
hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']),
test=TRUE, verbose=FALSE)
+ cvChanges[cvk] = changesByOp
+ allChanges[i] = changesByPip
+ }
+ if(changesByPip < ref)
+ print("prunning alert 2: no training the model due to minimum changes")
+ else {
+ res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
+ accuracyMatrix[i] = res[1, 1]
}
- res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
- accuracyMatrix[i] = res[1, 1]
+
}
-
- print("----- cv mean accuracy ---")
- print(toString(accuracyMatrix))
+ allChanges = min(allChanges)
+ changesByOp = colMaxs(cvChanges)
accuracy = mean(accuracyMatrix)
- print("mean: "+toString(accuracy))
- # output = cbind(accuracy, evalFunHp)
+ print("cv accuracy: "+toString(accuracy))
}
pruningSignal = function(Frame[Unknown] ph_pip, Matrix[Double] hp_matrix,
Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -705,6 +543,8 @@ return(Boolean execute)
getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double]
paramCount)
{
+ # print("pipeline in meta "+toString(pipeline))
+ # while(FALSE){}
indexes = matrix(0, rows= ncol(pipeline), cols=1)
paramCount = matrix(0, rows= ncol(pipeline), cols=1)
applyList = hpList[, 1]
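As context for the bracket changes above: the bandit now splits the logical pipelines evenly across brackets via s_max = floor(log(R, eta)) - 1 instead of deriving the hyperband budget B. A self-contained sketch of the resulting successive-halving schedule, where nLogical is an assumed count of logical pipelines:

  eta = 2
  R = 50
  s_max = floor(log(R, eta)) - 1                    # brackets s_max .. 0
  nLogical = 12                                     # assumed no. of logical pipelines
  n = ifelse(s_max >= nLogical, nLogical, ceil(nLogical / (s_max + 1)))
  for(s in s_max:0) {
    r = R * eta^(-s)                                # initial resources in bracket s
    for(i in 0:s) {
      n_i = max(as.integer(floor(n * eta^(-i))), 1) # surviving configurations
      r_i = as.integer(floor(r * eta^i))            # resources per configuration
      print("bracket "+s+", round "+i+": n_i="+n_i+", r_i="+r_i)
    }
  }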
diff --git a/scripts/builtin/executePipeline.dml
b/scripts/builtin/executePipeline.dml
index 05761a7..641823b 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,17 +57,17 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList,
Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test
= FALSE, Boolean verbose)
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest,
Matrix[Double] Ytest,
- Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+ Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double
changesAll)
{
mask=as.matrix(metaList['mask'])
FD = as.matrix(metaList['fd'])
applyFunc = as.frame(metaList['applyFunc'])
-
+ changesAll = 0.0
+ d = ncol(Xtrain)
testRow = nrow(Xtest)
- Xout = Xtrain
+ Xorig = Xtest
t1 = time()
- print("PIPELINE EXECUTION START ... "+toString(pipeline))
if(verbose) {
print("checks rows in X = "+nrow(Xtrain)+" rows in Y = "+nrow(Ytrain)+"
cols in X = "+ncol(Xtrain)+" col in Y = "+ncol(Ytrain))
print("pipeline in execution "+toString(pipeline))
@@ -84,14 +84,7 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
L = evalList(op, hp)
[L, O] = remove(L, 1);
Xtrain = as.matrix(O)
- if(nrow(as.matrix(hp[1])) == nrow(Xtrain) & ncol(as.matrix(hp[1])) ==
ncol(Xtrain)) {
- changes = sum(abs(replace(target=Xtrain, pattern=NaN, replacement=0) -
replace(target=as.matrix(hp[1]), pattern=NaN, replacement=0)) > 0.001)
- print("# of changes values: "+toString(changes))
- }
- Xout = Xtrain
if(applyOp != "NA") {
- print("op: "+op)
- # print("dataFlag: "+dataFlag)
[Xtest, executeFlag] = applyDataFlag(Xtest, mask, dataFlag)
L = append(L, list(X=Xtest));
Xtest = eval(applyOp, L);
@@ -109,11 +102,13 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
else {
print("not applying "+op+" executeFlag = 0")
}
+ if(ncol(Xtest) == d) {
+ changesSingle = sum(abs(replace(target=Xtest, pattern=NaN,
replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) >
0.001 )
+ changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0)
- replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 )
-
- if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1, i])
== "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
- changes = sum(abs(replace(target=Xout, pattern=NaN, replacement=0) -
replace(target=as.matrix(hp[1]), pattern=NaN, replacement=0)) > 0.001 )
- [hpForPruning, changesByOp] = storeDataForPrunning(pipeline,
hyperParameters, hpForPruning, changesByOp, changes, i)
+ if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1,
i]) == "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
+ [hpForPruning, changesByOp] = storeDataForPrunning(pipeline,
hyperParameters, hpForPruning, changesByOp, changesSingle, i)
+ }
}
}
@@ -121,8 +116,6 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
if(nrow(Xtest) != testRow)
stop("executePipeline: test rows altered")
t2 = floor((time() - t1) / 1e+6)
-
- print("PIPELINE EXECUTION ENDED: "+t2+" ms")
}
# This function will convert the matrix row-vector into list
@@ -228,7 +221,6 @@ return (Matrix[Double] X)
# put nan back
nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
X = X + nanMask
- # print("X less than equal to zero "+sum(cat <= 0))
}
}
@@ -245,7 +237,6 @@ return (Matrix[Double] X)
Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
nanMask = is.na(Xcat)
Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111)
- # print("unchanged data \n"+toString(originalX, rows=10))
# reconstruct the original matrix
p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
@@ -276,7 +267,6 @@ return (Matrix[Double] X)
}
else X = nX
- # print("recreated data \n"+toString(X, rows = 20))
}
@@ -390,9 +380,8 @@ return (Matrix[Double] X, Matrix[Double] Y)
minClass = min(classes)
maxClass = max(classes)
diff = (maxClass - minClass)/sum(classes)
- if(diff > 0.3)
+ if(diff > 0.2)
{
- #print("initiating oversampling")
XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
synthesized = matrix(0,0,0) # initialize variable
start_class = 1
@@ -403,7 +392,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
outSet = matrix(0, 0, ncol(XY))
remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100
- (remainingRatio%%100)),
remainingRatio-(remainingRatio%%100))
- #print("remaining ratio: "+remainingRatio)
for(i in 1: nrow(k), check=0) {
end_class = end_class + as.scalar(classes[i])
class_t = XY[start_class:end_class, ]
@@ -419,9 +407,8 @@ return (Matrix[Double] X, Matrix[Double] Y)
Y = XY[, 1]
X = XY[, 2:ncol(XY)]
}
- else {
- print("smote not applicable")
-
+ else {
+ str = "smote not applicable"
}
}
}
@@ -477,7 +464,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
{
Xcor = removeEmpty(target = X, margin = "rows", select = (inc==0))
Ycor = removeEmpty(target = Y, margin = "rows", select = (inc==0))
- # print("inc vector "+toString(inc))
Xinc = removeEmpty(target = X, margin = "rows", select = inc)
Yinc = removeEmpty(target = Y, margin = "rows", select = inc)
yhat = removeEmpty(target = yhat, margin = "rows", select = inc)
@@ -497,8 +483,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
}
}
classes = table(Y, 1)
- print("class distribution after flipLabels")
- print(toString(classes))
}
# # # # wrapper for normalize
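The executePipeline changes above make it return changesAll, a count of test-set cells the operator chain actually modified; the caller compares this against the ref threshold to prune low-impact pipelines. A minimal standalone sketch of that cell-change count on two illustrative matrices:

  Xorig = matrix("1 NaN 3 4", rows=2, cols=2)      # data before the pipeline
  Xout  = matrix("1 2 3 9", rows=2, cols=2)        # data after the pipeline
  changesAll = sum(abs(replace(target=Xout, pattern=NaN, replacement=0)
    - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001)
  print("changed cells: "+changesAll)              # 2 cells differ by > 0.001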
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index aaa41c7..acef187 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -49,7 +49,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
Double threshold = 0.8, Boolean verbose = FALSE)
return(Matrix[Double] output, Matrix[Double] meta, Double threshold,
Frame[String] dM, List[Unknown] betaList)
{
-
if(ncol(X) < 2)
stop("MICE can not be applied on single vectors.
expected number of columns > 1 found: "+ncol(X))
@@ -76,7 +75,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
X1 = X + (Mask1 * imputationVec)
d = ncol(X1)
n = nrow(X1)
-
# compute index of categorical features
index = vectorToCsv(cMask)
# specifications for one-hot encoding of categorical features
@@ -147,10 +145,10 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
prob = matrix(1, nrow(test_Y), 1)
}
else {
- beta = multiLogReg(X=train_X, Y=train_Y, icpt = 2, tol = 0.0001, reg
= 0.00001,
+ beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg
= 0.00001,
maxi = 100, maxii=50, verbose=FALSE)
# predicting missing values
- [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
+ [prob, pred, acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
prob = rowMaxs(prob)
}
validThreshold = prob > threshold
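For reference on the icpt switch above: in multiLogReg, icpt=0 fits no intercept, icpt=1 adds an intercept, and icpt=2 additionally shifts and rescales the features. A small self-contained sketch of the updated call on synthetic data (sizes and seeds are arbitrary):

  X = rand(rows=100, cols=4, seed=7)
  Y = round(rand(rows=100, cols=1, min=0.51, max=2.49, seed=7))  # labels in {1,2}
  beta = multiLogReg(X=X, Y=Y, icpt=1, tol=0.0001, reg=0.00001,
    maxi=100, maxii=50, verbose=FALSE)
  [prob, pred, acc] = multiLogRegPredict(X=X, B=beta, Y=Y)
  print("train accuracy: "+acc)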
diff --git a/scripts/builtin/topk_cleaning.dml
b/scripts/builtin/topk_cleaning.dml
index 37cc1dc..028f6c6 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -58,7 +58,7 @@ source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest =
as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown]
primitives,
Frame[Unknown] parameters, String evaluationFunc, Matrix[Double] evalFunHp,
Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double
sample = 1.0,
- Double expectedIncrease=1.0, Boolean cv=TRUE, Integer cvk = 2, Boolean
isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
+ Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk
= 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean
enablePruning = FALSE, String output)
return(Boolean perf)
# return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams,
Matrix[Double] topKScores, Frame[Unknown] bestLogical,
# Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp)
@@ -95,16 +95,33 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
print("-- Cleaning - Get Dirty Score: ");
[dirtyScore, evalFunHp] = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest,
Ytest=eYtest, evaluationFunc=evaluationFunc,
metaList=metaList, sample=sample, cv=cv, cvk=cvk, evalFunHp=evalFunHp,
ctx=ctx)
- t4 = time(); print("---- finalized in: "+(t4-t3)/1e9+"s");
-
+ t4 = time(); print("---- finalized in: "+(t4-t3)/1e9+"s");
+
# # do the string processing
print("-- Cleaning - Data Preparation (strings, transform, sample): ");
[Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, cv,
correctTypos, ctx)
-
# # if mask has 1s then there are categorical features
print("---- feature transformations to numeric matrix");
[eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
-
+ # # # do the early dropping of columns:
+ # # # 1. if 70% of the column is empty
+ # # # 2. if the column has only a single value
+ # # # 3. if the column has all unique values
+ Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
+ nullMask = is.na(eXtrain)
+ singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) |
(colMaxs(Xtmp) == colMins(Xtmp))
+ allmostEmpty = colSums(nullMask)
+ allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.7)
+ allSum = singleValuesCol | allmostEmptyRatio
+ if(sum(allSum) > 0) {
+ eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum ==
0))
+ if(!cv)
+ eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum ==
0))
+ mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
+ fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0))
+ schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0))
+ metaList = list(mask=mask, schema=schema, fd=fdMask,
applyFunc=as.frame("null"), distY=0)
+ }
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
@@ -137,25 +154,19 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
}
metaList['distY'] = dist
- if(sum(mask) > 0)
- {
- dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1)
- logical = cbind(logical, dummyEncode)
- }
-
print("-- Cleaning - Enum Logical Pipelines: ");
- [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain,
Xtest=eXtest, ytest=eYtest,
- seed=logical, max_iter=max_iter, metaList = metaList,
+ [bestLogical, con, refChanges] = lg::enumerateLogical(X=eXtrain, y=eYtrain,
Xtest=eXtest, ytest=eYtest,
+ initial_population=logical, seed = seed, max_iter=max_iter, metaList =
metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives,
param=parameters,
dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE,
ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
- # bestLogical = frame(["MVI", "OTLR", "DUMMY"], rows=1, cols=3)
+
topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0);
topKScores = matrix(0,0,0); features = as.frame("NULL")
# # [topKPipelines, topKHyperParams, topKScores, features] =
perf = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest,
Y_test=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical,
primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, output=output, verbose=TRUE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, enablePruning =
enablePruning, output=output, verbose=TRUE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines:
"+(t7-t6)/1e9+"s");
}
@@ -164,7 +175,6 @@ return(Frame[String] schema, Matrix[Double] mask,
Matrix[Double] fdMask, Integer
{
if(as.scalar(metaData[1, 1]) == "NULL")
{
- print("creating meta data")
r1 = detectSchema(data)
r2 = matrix(0, rows=1, cols=ncol(data))
for(i in 1 : ncol(r1))
@@ -204,7 +214,7 @@ runStringPipeline = function(Frame[Unknown] Xtrain,
Frame[Unknown] Xtest, Frame[
return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
{
if(cv)
- Xtrain = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask,
schema=schema, CorrectTypos=correctTypos, ctx=ctx)
+ Xtrain = utils::stringProcessing(train=Xtrain, test=matrix(0,0,0),
mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
else
{
# # # binding train and test to use same dictionary for both
@@ -231,10 +241,9 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
eXtest = replace(target=eXtest, pattern=NaN, replacement = 1)
print(prefix+" sample from train data and dummy code");
[eXtrain, Ytrain] = utils::doSample(eXtrain, Y, sample, TRUE)
- sliceX = eXtrain
[eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask,
cv, "dummycode")
pipList = list(lp = as.frame("NULL"), ph = as.frame("NULL"), hp =
as.matrix(0), flags = 0)
- print(prefix+" hyper-parameter tuning");
+ print(prefix+" hyper-parameter tuning and dirtyscore computation");
if(cv) {
[dirtyScore, evalFunHp] = bandit::crossV(X=eXtrain, y=Ytrain, cvk=cvk,
evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
@@ -246,7 +255,6 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
evalFunHp = res[1, 2:ncol(res)]
print("Dirty Accuracy holdout: "+dirtyScore)
}
-
}
recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest,
Matrix[Double] mask, Boolean cv, String code)
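The early dropping added above removes columns that are constant (or 0/1-valued) and columns with at least 70% missing values before enumeration starts. A condensed standalone sketch of the same masks on a small illustrative matrix:

  X = matrix("1 0 2 1 0 7 1 0 NaN 1 0 5", rows=4, cols=3)
  Xtmp = replace(target=X, pattern=NaN, replacement=0)
  nullMask = is.na(X)
  singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp))
  almostEmptyRatio = colSums(nullMask) >= (nrow(X) * 0.7)
  allSum = singleValuesCol | almostEmptyRatio
  if(sum(allSum) > 0)
    X = removeEmpty(target=X, margin="cols", select=(allSum == 0))  # keeps only column 3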
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml
b/scripts/pipelines/scripts/enumerateLogical.dml
index 235f8ae..2cacd65 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -52,10 +52,10 @@
source("scripts/builtin/bandit.dml") as bandit;
enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double]
Xtest, Matrix[Double] ytest,
- Frame[Unknown] seed, Integer max_iter=10, List[Unknown] metaList, String
evaluationFunc, Matrix[Double] evalFunHp,
+ Frame[Unknown] initial_population, Integer seed = -1, Integer max_iter=10,
List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79,
Boolean cv=FALSE, Boolean cvk=3,
Boolean verbose, List[Unknown] ctx=list(prefix="----"))
-return (Frame[Unknown] output, boolean converged)
+return (Frame[Unknown] output, boolean converged, Double refChanges)
{
finalOutput = list()
@@ -65,43 +65,43 @@ return (Frame[Unknown] output, boolean converged)
iter = 1
populationLength = 0
converged = FALSE
- # get the physical instances from logical ones
- # unrolled by physical pipelines
- pipelines = frame(0, rows=nrow(primitives)^ncol(primitives), cols=ncol(seed))
start = 1;
end = 0;
- allOps = param[, 2]
- dist = as.scalar(metaList['distY'])
- if(nrow(y) > 0 & min(y) >= 1 & dist <= 15)
- allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\"))?x:\"0\"")
- else
- allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"mice\") & !x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\") &
- !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
- # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
- allOps = removeEmpty(target=allOps, margin="rows")
- for(i in 1:nrow(seed)) {
- pconf = bandit::get_physical_configurations(seed[i], 0, primitives)
+ [allOps, ref] = getOps(param[, 2], as.scalar(metaList['distY']), nrow(y),
min(y))
+
+ # unrolled by physical pipelines
+ pipelines = frame(0, rows=nrow(primitives)^ncol(primitives),
cols=max(ncol(initial_population), ncol(ref)))
+ for(i in 1:nrow(initial_population)) {
+ pconf = bandit::get_physical_configurations(initial_population[i], 0,
primitives)
end = end + nrow(pconf)
pipelines[start:end, 1:ncol(pconf)] = pconf
start = end + 1
}
+
pipelines = removeEmpty(target = pipelines, margin="rows")
+ if(sum(mask) > 0)
+ {
+ dummyEncode = frame("dummycoding", rows=nrow(pipelines), cols=1)
+ pipelines[, 2] = dummyEncode
+ }
+ pipelines = rbind(ref, pipelines)
population = pipelines
populationSize = nrow(pipelines)
-
+ randomOps = sample(3, (populationSize * max_iter), TRUE, seed)
+ transitions = sample(nrow(allOps), (populationSize * max_iter), TRUE, seed)
+ refChangesInternal = 0
while(!converged & iter <= max_iter)
{
populationLength = max(populationLength, ncol(population))
id = seq(1, nrow(population))
print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" );
# # # execute the physical pipelines
- [outPip, outHp, feaFrameOuter] =
bandit::run_with_hyperparam(cbind(as.frame(id), population),
- num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp,
param, as.frame(""), cv, cvk, TRUE)
+ [outPip, outHp, p, refChanges] =
bandit::run_with_hyperparam(cbind(as.frame(id), population),
+ num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp,
param, cv, cvk, 0, FALSE, TRUE)
# # sort the configurations score-wise
- actPip = cbind(as.frame(outPip[, 1]), population)
- sort_mask = cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(population)))
+ actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges))
+ actPip = cbind(actPip, population)
+ sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1,
cols=ncol(population)))
sortedPipelines = frameSort(actPip, sort_mask, TRUE)
converged = as.double(as.scalar(sortedPipelines[1, 1])) > dirtyScore
if(converged)
@@ -111,21 +111,22 @@ return (Frame[Unknown] output, boolean converged)
sortedPipelines = sortedPipelines[1:diR]
finalOutput = append(finalOutput, sortedPipelines)
# # # if converged then stop otherwise generate new population
- sortedPipelines = sortedPipelines[, 2:ncol(sortedPipelines)]
- children = frame(0, rows=populationSize, cols=ncol(sortedPipelines) + 1)
+ children = frame(0, rows=populationSize, cols=ncol(sortedPipelines))
+ sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)]
# # randomly pick the pipelines for transitions
pipRand = sample(nrow(sortedPipelines), populationSize, TRUE)
if(!converged) {
- parfor(i in 1:nrow(children), check=0) {
+ for(i in 1:nrow(children), check=0) {
+ idxR = (nrow(children) * (iter - 1)) + i
idx = as.scalar(pipRand[i])
top = removeEmpty(target=sortedPipelines[idx], margin="cols")
tail = top[, ncol(top)]
if(sum(mask) > 0)
top = top[, 1:ncol(top) - 1]
- random = ifelse(ncol(top) <=2, 1, as.scalar(sample(3, 1)))
+ random = ifelse(ncol(top) <=2, 1, as.scalar(randomOps[idxR]))
if(random == 1)
- c1 = addition(top, allOps)
+ c1 = addition(top, allOps[as.scalar(transitions[idxR])])
else if(random == 2)
c1 = mutation(top)
else if(random == 3)
@@ -143,8 +144,7 @@ return (Frame[Unknown] output, boolean converged)
print(prefix+" EnumLP did not converge after "+(iter - 1)+" / "+max_iter+"
iterations")
}
# # # prepare the final frame output
- output = frame(0, rows=round((populationSize/2)) * length(finalOutput) ,
cols=populationLength + 1)
- print("rows in output: "+nrow(output))
+ output = frame(0, rows=round((populationSize/2)) * length(finalOutput) ,
cols=populationLength + 2)
start = 1;
end = 0;
for(i in 1:length(finalOutput))
@@ -154,31 +154,19 @@ return (Frame[Unknown] output, boolean converged)
output[start:end, 1:ncol(pipFrame)] = pipFrame
start = end + 1
}
- sort_mask = cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(output) - 1))
+ sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1,
cols=ncol(output) - 2))
output = removeEmpty(target=output, margin="rows")
- output = frameSort(output, sort_mask, TRUE)
- print("final Pipelines")
- print(toString(output, rows=150))
- output = output[, 2:ncol(output)]
+ output = frameSort(output, sort_mask, FALSE)
+ refChanges = as.double(as.scalar(output[nrow(output), 2]))
+ output = output[, 3:ncol(output)]
}
-
addition = function(Frame[Unknown] top, Frame[Unknown] allOps)
return (Frame [Unknown] child)
{
- c = as.scalar(sample(nrow(allOps), 1))
- # place_to_add = as.scalar(sample(ncol(top), 1))
- # if(place_to_add == 1)
- child = cbind(allOps[c, 1], top)
- # else
- # {
- # start = top[, 1:place_to_add-1]
- # end = top[, place_to_add:ncol(top)]
- # child = cbind(cbind(start, allOps[c, 1]), end)
- # }
+ child = cbind(allOps, top)
}
-
mutation = function(Frame[Unknown] child)
return (Frame [Unknown] mChild)
{
@@ -198,14 +186,32 @@ removal = function(Frame[Unknown] child)
return (Frame[Unknown] child)
{
random = as.scalar(rand(rows=1, cols=1))
- print("before removal")
- print(toString(child))
if(ncol(child) >= 2)
{
idx = as.scalar(sample(ncol(child), 1))
child[1, idx] = as.frame(0)
child = removeEmpty(target=child, margin="cols")
}
- print("after removal")
- print(toString(child))
}
+
+getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer
minValue)
+ return (Frame[String] allOps, Frame[String] ref) {
+
+ # # # TODO fix the following hard-coded condition by taking a file input
+ # # allOps are the operations which are randomly added to a population; for
now I am reusing the param file
+ # # so the map condition removes the operations which should not be added
twice in a pipeline, i.e., dummycoding
+ # # for regression, class imbalance operators are also removed
+ if(n > 0 & minValue >= 1 & dist <= 15) {
+ allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\"))?x:\"0\"")
+ ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1,
cols=4)
+ }
+ else {
+ allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\") &
+ !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
+ # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
+ ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
+ }
+ allOps = removeEmpty(target=allOps, margin="rows")
+}
\ No newline at end of file
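The heart of the seeding change in enumerateLogical is that all per-child random draws (which action to apply, and which operator an addition inserts) are pre-sampled once from the user-provided seed, then addressed by a fixed index per iteration and child. A minimal standalone sketch of that indexing scheme with illustrative sizes:

  seed = 42
  populationSize = 10
  max_iter = 3
  nOps = 7                                          # assumed no. of candidate operators
  randomOps = sample(3, populationSize * max_iter, TRUE, seed)      # 1=addition, 2=mutation, 3=removal
  transitions = sample(nOps, populationSize * max_iter, TRUE, seed) # operator drawn for additions
  for(it in 1:max_iter) {
    for(i in 1:populationSize) {
      idxR = (populationSize * (it - 1)) + i        # fixed position -> reproducible draw
      print("iter "+it+", child "+i+": action="+as.scalar(randomOps[idxR])+", opIdx="+as.scalar(transitions[idxR]))
    }
  }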
diff --git a/scripts/pipelines/scripts/utils.dml
b/scripts/pipelines/scripts/utils.dml
index b0e55bb..b4dd7df 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -157,7 +157,6 @@ return(Frame[Unknown] train, Frame[Unknown] test,
Matrix[Double] M)
# step 1 do the case transformations
print(prefix+" convert strings to lower case");
train = map(train, "x -> x.toLowerCase()")
-
# step 2 fix invalid lengths
# q0 = 0.05
# q1 = 0.95
@@ -201,7 +200,21 @@ return(Frame[Unknown] train, Frame[Unknown] test,
Matrix[Double] M)
test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
}
}
-
+ # # step 7 convert date to decimal
+ isDate = map(train[1:10], "x -> UtilFunctions.isDateColumn(x)")
+ isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0)
+ isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1,
cols=ncol(schema)))
+ if(sum(isDate) > 0) {
+ print(prefix+" changing date to timestamp")
+ dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))),
margin="cols")
+ for(i in 1:ncol(dateColIdx))
+ {
+ idx = as.scalar(dateColIdx[i])
+ train[, idx] = map(train[, idx], "x -> UtilFunctions.getTimestamp(x)",
margin=2)
+ if(length(test) > 0)
+ test[, idx] = map(test[, idx], "x -> UtilFunctions.getTimestamp(x)",
margin=2)
+ }
+ }
# TODO add deduplication
print(prefix+" deduplication via entity resolution");
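The new date-handling step in stringProcessing probes the first rows of each column with UtilFunctions.isDateColumn and, for matching STRING columns, rewrites the values to timestamps via UtilFunctions.getTimestamp. A small sketch on a one-column frame, assuming the illustrative values match one of the registered DATE_FORMATS patterns:

  F = frame(["01/21/2020", "03/11/2021", "12/01/2019"], rows=3, cols=1)
  isDate = map(F[1:3], "x -> UtilFunctions.isDateColumn(x)")
  isDate = replace(target=as.matrix(isDate), pattern=NaN, replacement=0)
  if(sum(isDate) > 0)
    F = map(F, "x -> UtilFunctions.getTimestamp(x)", margin=2)
  print(toString(F))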
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index 17db8f6..cb7d19b 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
@@ -493,6 +493,8 @@ public class UtilFunctions {
}
public static double objectToDoubleSafe(ValueType vt, Object in) {
+ if(vt == ValueType.STRING && in == null)
+ return 0.0;
if(vt == ValueType.STRING && !NumberUtils.isCreatable((String)
in)) {
return 1.0;
} else return objectToDouble(vt, in);
@@ -909,11 +911,13 @@ public class UtilFunctions {
if (maxMatches <= 0 || dateCol < 0){
//ERROR - no date column found
- throw new DMLRuntimeException("No date column found.");
+ System.out.println("No date column in the dataset");
}
return dateCol;
}
-
+ public static String isDateColumn (String values) {
+ return DATE_FORMATS.keySet().parallelStream().anyMatch(e ->
values.toLowerCase().matches(e))?"1":"0";
+ }
public static String[] getDominantDateFormat (String[] values) {
String[] output = new String[values.length];
Map<String, String> date_formats = DATE_FORMATS;
diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
index 9f8a681..320eb12 100644
--- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
+++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
@@ -95,7 +95,7 @@ return(Matrix[Double] output, Matrix[Double] error)
}
else {
beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]),
reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
- maxi=as.scalar(evalFunHp[1,4]), maxii=0, verbose=FALSE);
+ maxi=1000, maxii=0, verbose=FALSE);
[prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
error = yhat != Ytest
a = getAccuracy(Ytest, yhat, TRUE)
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index fd464fe..5a57be4 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-NA,dummycodingApply
-NA,dummycodingApply
-NA,dummycodingApply
+NA,dummycodingApply,0,0
+NA,dummycodingApply,0,0
+imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index ae312ae..274dcca 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-73.73188405797102
-69.7463768115942
-69.02173913043478
+70.83333333333334
+69.38405797101449
+68.65942028985508
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
index dcb46fe..ec20472 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
@@ -1 +1 @@
-2.0,10.0,0.001,1000.0
+2.0,10.0,0.001
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index ef64dd0..231e789 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-14.0,1.0,0.2750943835009122,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-14.0,1.0,0.4614295314769764,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-14.0,1.0,0.49358019629519945,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.44724177618347905,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.3017247635995244,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.016068274841623598,0.9737026111609255,0,0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index bdfc48a..d6f15ed 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-underSampling,dummycoding
-underSampling,dummycoding
-underSampling,dummycoding
+underSampling,dummycoding,0,0
+underSampling,dummycoding,0,0
+imputeByMean,winsorize,scale,dummycoding
diff --git
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
index d7b7ef0..46d28d3 100644
---
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
@@ -1,5 +1,5 @@
-imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0
-imputeByMeanApply,imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0
-imputeByFdApply,outlierBySdApply,dummycodingApply,dummycodingApply,0,0,0,0,0
-imputeByFdApply,outlierBySdApply,dummycodingApply,dummycodingApply,0,0,0,0,0
-imputeByMeanApply,imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0
+outlierBySdApply,fillDefaultApply,outlierByIQRApply,scaleApply,0,0,0
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
+miceApply,normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index 02c1429..890dd9b 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -92,7 +92,7 @@ testY = eY[split+1:nrow(eX),]
[bestLogical, converged] = lg::enumerateLogical(X=trainX, y=trainY,
Xtest=testX, ytest=testY,
- seed=logical, max_iter=max_iter, metaList = metaList,
evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
+ initial_population=logical, seed = 42, max_iter=max_iter, metaList =
metaList, evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
param=param,
cv=FALSE, verbose=TRUE)
diff --git
a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 74bea3d..56a82c8 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -59,7 +59,7 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)]
# [topKPipelines, topKHyperParams, topKScores, bestLogical, features,
dirtyScore, evalHp] =
result = topk_cleaning(dataTrain=trainData, dataTest=testData,
metaData=metaInfo, primitives=primitives, parameters=param,
evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK,
resource_val=resources,
- expectedIncrease=expectedIncrease, max_iter=max_iter, cv=testCV, cvk=cvk,
sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
+ expectedIncrease=expectedIncrease, seed = 42, max_iter=max_iter, cv=testCV,
cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
write(result, $O)