This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new ecee97f [MINOR] Various cleanups in cleaning pipelines 1. FDs are
passed as meta data 2. All meta variables are passed via a list 3. Test
added for classification and compare target applications 4. warnings are fixed
by initializing variables 5. Possible deduplication of tasks in separate
namespaces TODO: Add tests for clustering and regression. Fix logical
pipelines
ecee97f is described below
commit ecee97f56a4b93b1cf7dd08934f04a8795c710ac
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Mon Mar 29 18:00:08 2021 +0200
[MINOR] Various cleanups in cleaning pipelines
1. FDs are passed as meta data
2. All meta variables are passed via a list
3. Test added for classification and compare target applications
4. warnings are fixed by initializing variables
5. Possible deduplication of tasks in separate namespaces
TODO: Add tests for clustering and regression. Fix logical pipelines
---
scripts/builtin/bandit.dml | 442 ++++++++-------------
scripts/builtin/discoverFD.dml | 1 +
scripts/builtin/executePipeline.dml | 132 +++++-
scripts/builtin/mice.dml | 6 +-
scripts/builtin/multiLogRegPredict.dml | 2 +-
scripts/builtin/pca.dml | 7 +-
scripts/builtin/splitBalanced.dml | 1 +
scripts/pipelines/properties/param.csv | 32 +-
scripts/pipelines/properties/primitives.csv | 3 +-
scripts/pipelines/scripts/logicalFunc.dml | 79 ++--
scripts/pipelines/scripts/utils.dml | 156 +++++++-
...ngTest.java => CleaningTestClassification.java} | 23 +-
...{CleaningTest.java => CleaningTestCompare.java} | 51 +--
.../functions/pipelines/compareAccuracy.dml | 57 ++-
.../functions/pipelines/intermediates/acc.csv | 5 -
.../functions/pipelines/intermediates/hp.csv | 5 -
.../functions/pipelines/intermediates/pip.csv | 5 -
.../scripts/functions/pipelines/mainScript.dml | 394 +++++++++---------
.../functions/pipelines/meta/meta_census.csv | 2 +-
.../functions/pipelines/testClassification.dml | 203 ++++++++++
.../scripts/functions/pipelines/testCompare.dml | 138 +++++++
21 files changed, 1097 insertions(+), 647 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index feb16c4..a2399af 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -18,12 +18,13 @@
# under the License.
#
#-------------------------------------------------------------
+source("scripts/pipelines/scripts/utils.dml") as utils;
-m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
Matrix[Double] mask, Matrix[Double] MLhp,
- Frame[Unknown] schema, Frame[Unknown] lp, Frame[Unknown] primitives,
Frame[Unknown] param, Integer k = 3,
- Double testAccuracy = 0.8, Boolean isWeighted, Integer R=50, Integer cv=3,
Boolean verbose = TRUE)
- return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,
Matrix[Double] bestAccuracy)
+m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
List[Unknown] metaList, List[Unknown] targetList,
+ Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer
k = 3, Integer R=50, Boolean verbose = TRUE)
+ return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,
Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter)
{
+ NUM_FEATURES = 14
print("null in data "+sum(is.na(X_train)))
bestPipeline = frame("", rows=1, cols=1)
bestHyperparams = as.matrix(0)
@@ -38,6 +39,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
hparam = matrix(0, rows=k*(s_max+1), cols=55)
pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
startOut=0; endOut=0;
+ feaFrameOuter = frame("", rows = 1, cols = NUM_FEATURES + ncol(lp) + 1 )
+
for(s in s_max:0, check = 0) {
# result variables
@@ -50,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
r = R * eta^(-s);
# get the physical pipelines, the pipelines, pipelines are recoded
[configurations, n] = get_physical_configurations(lp, n, primitives)
-
+
# append configuration keys for extracting the pipeline later on
id = seq(1, nrow(configurations))
configurations = cbind(as.frame(id), configurations)
@@ -72,13 +75,13 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
}
configurations = configurations[1:n_i, ]
- [outPip,outHp] = run_with_hyperparam(configurations, r_i, X_train,
Y_train, mask,
- MLhp, schema, param, isWeighted, testAccuracy, cv, verbose)
+ [outPip,outHp, feaFrameOuter] = run_with_hyperparam(configurations, r_i,
X_train, Y_train, metaList,
+ targetList, param, feaFrameOuter, verbose)
# sort the pipelines by order of accuracy decreasing
a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
rowIndex = ifelse(nrow(a) >= k, k, nrow(a))
-
+
# maintain the brackets results
end = end + rowIndex
bracket_pipel[start:end, ] = a[1:rowIndex,]
@@ -86,9 +89,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
start = end + 1
# sort the configurations fro successive halving
- avergae_perf = getMaxPerConf(outPip) #as.frame(aggregate(target=a[,
1], groups=a[, 2], fn="mean"))
- print("configurations "+toString(configurations))
- while(FALSE){}
+ avergae_perf = getMaxPerConf(outPip)
configurations = frameSort(cbind(avergae_perf, configurations))
configurations = configurations[, 2:ncol(configurations)]
}
@@ -96,27 +97,15 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
bracket_hp = removeEmpty(target=bracket_hp, margin="rows")
# keep the best k results for each bracket
[bracket_bestPipeline, bracket_bestHyperparams] =
extractBracketWinners(bracket_pipel, bracket_hp, k, lookup)
-
- # print("after "+i+" bracket ")
- # print(toString(bracket_bestPipeline))
- # print("------------------")
- # print(toString(bracket_bestHyperparams))
- # while(FALSE){}
-
+ # optimize by the features
+
startOut = endOut + 1
endOut = endOut + nrow(bracket_bestPipeline)
pipeline[startOut: endOut, ] = bracket_bestPipeline
hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] =
bracket_bestHyperparams
}
-
- # print("after all brackets ")
- # while(FALSE){}
- # print(toString(pipeline))
- # print("------------------")
- # print(toString(hparam))
- # while(FALSE){}
- # extract best top k from all iterations
- [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam,
testAccuracy, k)
+
+ [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam,
as.scalar(targetList['dirAcc']), k)
bestAccuracy = as.matrix(bestPipeline[,1])
bestPipeline = bestPipeline[,2:ncol(bestPipeline)]
@@ -126,7 +115,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
print("best pipeline"+ toString(bestPipeline))
print("best hyper-parameters \n"+ toString(bestHyperparams))
print("best accuracy \n"+ toString(bestAccuracy))
- print("dirty accuracy "+testAccuracy)
+ if(as.scalar(targetList['target']) != "compare")
+ print("dirty accuracy "+as.scalar(targetList['dirAcc']))
}
}
@@ -207,42 +197,59 @@ get_physical_configurations = function(Frame[String]
logical, Scalar[int] numCon
}
# this method will call the execute pipelines with their hyper-parameters
-run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i,
Matrix[Double] X, Matrix[Double] Y,
- Matrix[Double] mask, Matrix[Double] MLhp, Frame[Unknown] schema,
Frame[Unknown] param, Boolean isWeighted,
- Double testAccuracy, Integer cv=3, Boolean verbose)
- return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam) {
+run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i,
Matrix[Double] X, Matrix[Double] Y, List[Unknown] metaList,
+ List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown]
featureFrameOuter, Boolean verbose)
+ return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam,
Frame[Unknown] featureFrameOuter) {
output_hp = matrix(0, nrow(ph_pip)*r_i, 50)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
-
+
# rows in validation set
clone_X = X
clone_Y = Y
index = 1
id = as.matrix(ph_pip[, 1])
ph_pip = ph_pip[, 2:ncol(ph_pip)]
+
+ feaVec = utils::gatherStats(X, Y, as.matrix(metaList['mask']),
as.scalar(targetList['target']))
+
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
- hp = getHyperparam(ph_pip[i], param, r_i)
- for(r in 1:r_i)
- {
- [X, Y] = executePipeline(ph_pip[i], X, Y, mask, hp, r, FALSE)
- accuracy = fclassify(X, Y, mask, MLhp, testAccuracy, isWeighted, cv)
- hp_vec = listToVector(hp, FALSE)
+ [hp, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
+ feaFrame = frame("", rows = no_of_res, cols = ncol(feaVec) + ncol(ph_pip)
+ 1)
+ for(r in 1:no_of_res)
+ {
+ # as the matrix first block of r rows belongs to first operator and r+1
block of rows to second operator
+ # we need to extract a row from each block
+ indexes = matrix(no_of_res, rows=ncol(ph_pip), cols=1)
+ indexes[1, 1] = r
+ indexes = cumsum(indexes)
+ indexes = table(indexes, 1, 1, nrow(hp), 1)
+ hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
+ [X, Y] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']),
as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
+ if(as.scalar(targetList['target']) == "compare")
+ accuracy = utils::compareValue(clone_X, X,
as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
+ else
+ accuracy = fclassify(X, Y, as.matrix(metaList['mask']),
as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']),
+ as.scalar(targetList['wAccuracy']), as.scalar(targetList['cv']))
+ matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix))
+ hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1,
cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE))
output_accuracy[index, 1] = accuracy
output_hp[index, 1:ncol(hp_vec)] = hp_vec
output_pipelines[index, ] = cbind(as.matrix(i), id[i,1])
X = clone_X
Y = clone_Y
- while(FALSE){}
- index = index + 1
- # hp = getHyperparam(ph_pip[i,], param)
+ index = index + 1
+ feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
+ feaFrame[r, ncol(feaVec)+1:ncol(feaVec)+ncol(ph_pip[1])] = ph_pip[i]
+ feaFrame[r, ncol(feaFrame)] = accuracy
}
X = clone_X
Y = clone_Y
+ featureFrameOuter = rbind(featureFrameOuter, feaFrame)
}
output_hyperparam = removeEmpty(target=cbind(output_accuracy, output_hp),
margin="rows")
output_operator = removeEmpty(target=cbind(output_accuracy,
output_pipelines) ,margin="rows")
@@ -250,13 +257,17 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
# extract the hyper-parameters for pipelines
getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res)
- return (List[Unknown] paramList)
+ return (Matrix[Double] paramMatrix, Integer no_of_res, Integer
NUM_META_FLAGS)
{
+
+ allParam = 0;
+ START_INDEX = 8 # value from where the hyper-params starts after skipping
meta flags
+ NUM_META_FLAGS = 5
# load the hyper-parameters values
paramList = list()
- allParam = 0;
# store the row indexes of the operator matches
indexes = matrix(0, rows= ncol(pipeline), cols=1)
+ paramCount = matrix(0, rows= ncol(pipeline), cols=1)
for(k in 1:ncol(pipeline))
{
op = as.scalar(pipeline[1,k])
@@ -268,104 +279,71 @@ getHyperparam = function(Frame[Unknown] pipeline,
Frame[Unknown] hpList, Intege
index = m_hasParam * seq(1, nrow(m_hasParam))
index = as.scalar(removeEmpty(target = index, margin = "rows"))
indexes[k] = index
- no_of_param = as.integer(as.scalar(hpList[index, 2]))
- allParam = no_of_param + allParam
+ paramCount[k] = as.integer(as.scalar(hpList[index, 2]))
}
# if there are no hyper-parameters than change the values of resources
# so that the pipeline is only executed once and no resource are wasted,
saving looping
- no_of_res = ifelse(allParam > 0, no_of_res, 1)
-
+ no_of_res = ifelse(sum(paramCount) > 0, no_of_res, 1)
+ # the below matrix stores the different combinations of hyper-parameter
value for each pipeline
+ # if the resource value is greater than zero this means for 1 pipeline it
will store r rows where each row store set
+ # of hyperparameter values for ith pipeline. If resource value rv = 10 and
ncol(pip) = 3 then the output matrix will have
+ # 10*3= 30 rows and 1:10 hyper-paramters for i-the pipeline 11:20 for
(i+1)-th pipeline and so on
+ # this matrix stores no. of hps, values of hps, and flags
+ paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res,
cols=max(paramCount)+NUM_META_FLAGS+1)
+
for(i in 1:ncol(pipeline)) {
index = as.scalar(indexes[i])
- no_of_param = as.integer(as.scalar(hpList[index, 2]))
-
+ no_of_param = as.integer(as.scalar(paramCount[i]))
# extract hasY and verbose flags
attachMask = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
- attachY = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
- isVerbose = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
- dataFlag = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
+ attachFD = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
+ attachY = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
+ isVerbose = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
+ dataFlag = matrix(as.scalar(hpList[index, 7]), rows=no_of_res, cols=1)
if(no_of_param > 0) {
- start = 7
- t = 7
- OpParam = matrix(0, no_of_res, no_of_param)
+ paramIdx = START_INDEX
+ typeIdx = START_INDEX
+ OpParam = matrix(0, rows=no_of_res, cols=max(paramCount))
+
for(j in 1:no_of_param) {
- type = as.scalar(hpList[index, t])
- paramValIndex = (no_of_param) + start
+ type = as.scalar(hpList[index, typeIdx])
+ paramValIndex = (no_of_param) + paramIdx
minVal = as.scalar(hpList[index, paramValIndex])
maxVal = as.scalar(hpList[index, paramValIndex + 1])
if(type == "FP") {
- val = rand(rows=no_of_res, cols=1, min=minVal,
- max=maxVal, pdf="uniform");
- OpParam[, j] = val
- }
- else if(type == "INT") {
- # val = ifelse(minVal == maxVal , minVal, as.scalar(sample(maxVal,
1)));
- val = sample(maxVal, no_of_res, TRUE)
- less_than_min = val < minVal
- val = (less_than_min * minVal) + val
+ val = rand(rows=no_of_res, cols=1, min=minVal,max=maxVal,
pdf="uniform");
+ OpParam[, j] = val;
+ } else if(type == "INT") {
+ val = sample(maxVal, no_of_res, TRUE);
+ less_than_min = val < minVal;
+ val = (less_than_min * minVal) + val;
OpParam[, j] = val
- }
- else if(type == "BOOL") {
+ } else if(type == "BOOL") {
if(maxVal == 1) {
- s = sample(2, no_of_res, TRUE)
- b = s - 1
- OpParam[, j] = b
- }
- else OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
- }
- else {
+ s = sample(2, no_of_res, TRUE);
+ b = s - 1;
+ OpParam[, j] = b;
+ } else
+ OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
+ } else {
# TODO handle string set something like {,,}
print("invalid data type")
}
- start = start + 2
- t = t + 1
+ paramIdx = paramIdx + 2
+ typeIdx = typeIdx + 1
}
- OpParam = cbind(OpParam, attachMask, attachY, isVerbose, dataFlag)
+ # hyper-parameter vector contains no. of hp, values of hp, and flag
values
+ OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam,
attachMask,
+ attachFD, attachY, isVerbose, dataFlag)
}
else {
- OpParam = cbind(attachMask, attachY)
+ # no hyper-parameters, so create a dummy matrix of zeros so flags are
always aligned
+ dummy = matrix(0, rows=no_of_res, cols=max(paramCount)+1)
+ OpParam = cbind(dummy, attachMask, attachFD, attachY)
OpParam = cbind(OpParam, isVerbose, dataFlag)
}
- while(FALSE){}
- paramList = append(paramList, OpParam)
- }
-}
-
-
-# method to convert the operators from a list to a vector representation
-# so that the could be append in an output matrix
-listToVector = function(List[Unknown] hp, Boolean verbose)
-return (Matrix[Double] hp_vec)
-{
- hp_vec = matrix(0,1,1)
- len = length(hp)
- for(k in 1:len) {
- mat = as.matrix(hp[k])
- hpy = cbind(as.matrix(ncol(mat)), mat)
- hp_vec = cbind(hp_vec, hpy)
- }
- hp_vec = hp_vec[1, 2:ncol(hp_vec)]
-}
-
-# function to classify the data using cross validation
-fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Matrix[Double] MLhp,
- Double testAccuracy, Boolean isWeighted, Integer cv=3)
- return (Double accuracy)
-{
-
- if(max(Y) == min(Y)) {
- print("Y contains only one class")
- accuracy = as.double(0)
- }
- else {
- print("STARTING "+cv+" CROSS VALIDATIONS")
- # do the k = 3 cross validations
- accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
- accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
- acc = colMeans(accuracyMatrix)
- accuracy = as.scalar(acc[1,1])
- print("validation accuracy "+accuracy)
+ paramMatrix[((i-1)*no_of_res)+1:i*no_of_res, 1:ncol(OpParam)] = OpParam
}
}
@@ -453,176 +431,104 @@ extractBracketWinners = function(Matrix[Double]
pipeline, Matrix[Double] hyperpa
}
-
-
-# smote wrapper for doing relative over-sampling
-SMOTE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Integer remainingRatio, Boolean verbose)
-return (Matrix[Double] XY)
-{
- XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
- # get the class count
- classes = table(XY[, 1], 1)
- print("before smote")
- print(toString(classes))
- while(FALSE){}
- start_class = 1
- end_class = 0
- k = table(XY[, 1], 1)
- getMax = max(k)
- maxKIndex = as.scalar(rowIndexMax(t(k)))
- outSet = matrix(0, 0, ncol(XY))
- print("remaining ration before "+remainingRatio)
- remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100 -
(remainingRatio%%100)),
- remainingRatio-(remainingRatio%%100))
- print("remaining ration after "+remainingRatio)
- for(i in 1: nrow(k)) {
- end_class = end_class + as.scalar(classes[i])
- class_t = XY[start_class:end_class, ]
- # remainingRatio = (round(getMax/nrow(class_t)) - 1) * 100
- if((i != maxKIndex)) {
- synthesized = smote(class_t[, 2:ncol(XY)], mask, remainingRatio, 1,
FALSE)
- synthesized = cbind(matrix(as.scalar(class_t[2,1]), nrow(synthesized),
1), synthesized)
- outSet = rbind(outSet, synthesized)
- if(verbose) {
- print("max value: "+getMax)
- print("values of i: "+i)
- print("remaining ratio: "+remainingRatio)
- }
- }
- start_class = end_class + 1
- }
-
- XY = rbind(XY, synthesized)
- Y = XY[, 1]
- X = XY[, 2:ncol(XY)]
- XY = cbind(X,Y)
- classes = table(Y, 1)
- print("after smote")
- print(toString(classes))
-}
-
-# constraints over hyper parameters
-verifyHp = function(Integer index, Frame[Unknown] pip, Double minVal, Double
maxVal, Integer paraNo)
-return (Double minVal, Double maxVal) {
- op = as.scalar(pip[1,index])
- # 1. if next op is pca then current op should not leave NaNs in data
- # 2. if next op is mice then current op should not replace NaNs with zeros
-
- if((op == "outlierBySd" | op == "outlierByIQR") & index < ncol(pip) & paraNo
== 2)
- {
- nextOp = as.scalar(pip[1, index + 1])
- if(nextOp == "pca" | nextOp == "abstain" | nextOp == "SMOTE")
- {
- maxVal = 1.0
- }
- if(nextOp == "mice")
- {
- minVal = 2.0
- }
- }
- # print("now min and max val ")
- # print(minVal+" "+maxVal)
-
-}
-
-
-#####################################
-# The function will replace the null with default values
-######################################
-fillDefault = function(Matrix[Double] X)
-return(Matrix[Double] X){
- defaullt = round(colMaxs(X) - colMins(X))
- Mask = is.na(X)
- X = replace(target=X, pattern=NaN, replacement=0)
- Mask = Mask * defaullt
- X = X + Mask
-}
-
-#####################################
+###########################################################################
# The function will return the max performance by each individual pipeline
-######################################
+############################################################################
getMaxPerConf = function(Matrix[Double] pipelines)
return (Frame[Unknown] maxperconf)
{
- tab = removeEmpty(target=table(pipelines[, 2], pipelines[, 3], pipelines[,
1]), margin="cols")
+ tab = removeEmpty(target=table(pipelines[, 2], pipelines[, 3], pipelines[,
1]), margin="cols")
maxperconf = frame(0, rows=max(pipelines[, 2]), cols=1)
- maxperconf = as.frame(t(colMaxs(tab)))
-
+ maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
}
-#####################################
-# The function will check if the pipeline have zero hyper-parameters
-# then it should not use more resource iterations and should be executed once
-######################################
-isResourceOptimal = function(List[Unknown] param, Boolean verbose)
-return(Boolean validForResources)
+# function to classify the data using cross validation
+fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Matrix[Double] MLhp,
+ Double testAccuracy, Boolean isWeighted, Integer cv=3)
+ return (Double accuracy)
{
- validForResources = FALSE
-
- count = 0
- for(i in 1:length(param))
- {
- hp = as.matrix(param[i])
- if(ncol(hp) > 4)
- count += 1
+
+ if(max(Y) == min(Y)) {
+ print("Y contains only one class")
+ accuracy = as.double(0)
+ }
+ else {
+ print("STARTING "+cv+" CROSS VALIDATIONS")
+ # do the k = 3 cross validations
+ accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+ accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+ acc = colMeans(accuracyMatrix)
+ accuracy = as.scalar(acc[1,1])
+ print("validation accuracy "+accuracy)
}
- validForResources = count > 0
}
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of
the
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
+
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] mask,
+ Matrix[Double] MLhp, Boolean isWeighted)
+return (Matrix[Double] accuracyMatrix)
+{
+ accuracyMatrix = matrix(0, k, 1)
-#######################################################################
-# Wrapper of transformencode OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and mask of the columns
-# Output: OHEd matrix X
-#######################################################################
+ dataList = list()
+ testL = list()
+ data = order(target = cbind(y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
+ classes = table(data[, 1], 1)
+ ins_per_fold = classes/k
+ start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
+ fold_idxes = cbind(start_fold, ins_per_fold)
-dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] dX_train) {
- X = replace(target=X, pattern=NaN, replacement=0)
- idx = vectorToCsv(mask)
-
- # specifications for one-hot encoding of categorical features
- jspecDC = "{ids:true, dummycode:["+idx+"]}";
- # OHE of categorical features
- [dX_train, dM] = transformencode(target=as.frame(X), spec=jspecDC);
-
-}
+ start_i = 0; end_i = 0; idx_fold = 1;;
+ for(i in 1:k)
+ {
+ fold_i = matrix(0, 0, ncol(data))
+ start=0; end=0;
+ for(j in 1:nrow(classes))
+ {
+ idx = as.scalar(classes[j, 1])
+ start = end + 1;
+ end = end + idx
+ class_j = data[start:end, ]
+ start_i = as.scalar(fold_idxes[j, 1]);
+ end_i = as.scalar(fold_idxes[j, 2])
-#######################################################################
-# Wrapper of imputeByFD OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and mask of the columns and threshold value
-# Output: filled matrix X
-#######################################################################
+ fold_i = rbind(fold_i, class_j[start_i:end_i, ])
+ }
-imputeByFd = function(Matrix[Double] X, Matrix[Double] mask, Double threshold)
-return (Matrix[Double] X_filled)
-{
-
- FD = discoverFD(replace(target=X, pattern=NaN, replacement=1), mask,
threshold)
- diagonal = diag(FD)
+ dataList = append(dataList, fold_i)
+ fold_idxes[, 1] = fold_idxes[, 2] + 1
+ fold_idxes[, 2] += ins_per_fold
+ while(FALSE){}
+ }
- for(i in 1: nrow(FD))
+ for(i in seq(1,k))
{
- for(j in 1:ncol(FD)) {
- if(as.scalar(FD[i, j]) > threshold)
- X = imputeByFD(X, i, j, threshold, FALSE)
-
- }
+ [trainList, hold_out] = remove(dataList, i)
+ trainset = rbind(trainList)
+ testset = as.matrix(hold_out)
+ trainX = trainset[, 2:ncol(trainset)]
+ trainy = trainset[, 1]
+ testX = testset[, 2:ncol(testset)]
+ testy = testset[, 1]
+ beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]),
tol= 1e-9,
+ maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
+ [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
+ accuracy = getAccuracy(testy, yhat, isWeighted)
+ accuracyMatrix[i] = accuracy
}
- X_filled = X
}
-#######################################################################
-# Wrapper of na_lof to call inside eval as a function
-# Output: filled matrix X
-#######################################################################
-forward_fill = function(Matrix[Double] X, Boolean op, Boolean verbose)
-return (Matrix[Double] X_filled)
-{
- option = ifelse(op, "locf", "nocb")
- X_filled = na_locf(X=X, option=option, verbose=verbose)
-}
+# data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax",
+# "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers",
"#OHEfeatures", "#Classes",
+# "Imbalance", "#rows", "#cols", ""]
\ No newline at end of file
diff --git a/scripts/builtin/discoverFD.dml b/scripts/builtin/discoverFD.dml
index 49d013b..1bb0a21 100644
--- a/scripts/builtin/discoverFD.dml
+++ b/scripts/builtin/discoverFD.dml
@@ -41,6 +41,7 @@
m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double
threshold)
return(Matrix[Double] FD)
{
+
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1]
found "+threshold)
diff --git a/scripts/builtin/executePipeline.dml
b/scripts/builtin/executePipeline.dml
index 90a9902..3ca0240 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -20,7 +20,7 @@
#-------------------------------------------------------------
s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X,
Matrix[Double] Y, Matrix[Double] mask,
- List[Unknown] hyperParameters, Integer resource_index, Boolean verbose)
+ Matrix[Double] FD, Matrix[Double] hyperParameters, Integer flagsCount,
Boolean verbose)
return (Matrix[Double] X, Matrix[Double] Y)
{
@@ -30,16 +30,13 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] X, Matrix[D
print("checks rows in X = "+nrow(X)+" rows in Y = "+nrow(Y)+" cols in X
= "+ncol(X)+" col in Y = "+ncol(Y))
print("pipeline in execution "+toString(pipeline))
print("pipeline hps "+toString(hyperParameters))
- print("index "+toString(resource_index))
while(FALSE){}
}
for(i in 1:ncol(pipeline)) {
-
op = as.scalar(pipeline[1,i])
- [hp, withClass, dataFlag] = matrixToList(X, Y, mask,
as.matrix(hyperParameters[i]), resource_index, op)
+ [hp, withClass, dataFlag] = matrixToList(X, Y, mask, FD,
hyperParameters[i], flagsCount, op)
Xclone = X
X = eval(op, hp)
- while(FALSE){}
# dataFlag 0 = only on numeric, 1 = on whole data
X = confirmData(X, Xclone, mask, dataFlag)
if(withClass)
@@ -50,21 +47,21 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] X, Matrix[D
X = confirmMeta(X, mask)
}
- print("END OF PIPELINE"+toString(pipeline))
- while(FALSE){}
}
# This function will convert the matrix row-vector into list
-matrixToList = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double]
mask, Matrix[Double] p, Integer resource_index, String op)
+matrixToList = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double]
mask, Matrix[Double] FD,
+ Matrix[Double] p, Integer flagsCount, String op)
return (List[Unknown] l, Boolean hasY, Integer dataFlag)
{
-
+ NUM_META_FLAGS = flagsCount;
hasY = FALSE
dataFlag = as.integer(as.scalar(p[1, ncol(p)]))
hasVerbose = as.scalar(p[1, ncol(p) - 1])
yFlag = as.scalar(p[1, ncol(p) - 2])
- maskFlag = as.integer(as.scalar(p[1, ncol(p)-3]))
+ fDFlag = as.integer(as.scalar(p[1, ncol(p)-3]))
+ maskFlag = as.integer(as.scalar(p[1, ncol(p)-4]))
######################################################
# CHECK FOR DATA FLAG
@@ -90,6 +87,13 @@ matrixToList = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] mask
hasY = TRUE
}
######################################################
+ # CHECK FOR FD APPEND FLAG
+ if(fDFlag == 1)
+ {
+ l = append(l, FD)
+ }
+
+ ######################################################
# CHECK FOR MASK APPEND FLAG
if(maskFlag == 1)
{
@@ -97,20 +101,22 @@ matrixToList = function(Matrix[Double] X, Matrix[Double]
Y, Matrix[Double] mask
}
#####################################################
# POPULATE HYPER PARAM
- if(ncol(p) > 4) {
- if(op == "pca") {
- ratio = as.scalar(p[resource_index,1])
- p[resource_index, 1] = as.integer(ncol(X) - ratio)
- }
- for(i in 1:ncol(p)-4)
- l = append(l, as.scalar(p[resource_index,i]))
+ # get the number of hyper-parameters and loop till that
+ no_of_hyperparam = as.scalar(p[1,1])
+ if(no_of_hyperparam > 0) {
+ # if(op == "pca") {
+ # # convert the number parameters to a ration related to OHE columns
+ # ratio = as.scalar(p[resource_index,1])
+ # p[resource_index, 1] = as.integer(ncol(X) - ratio)
+ # }
+ for(i in 1:no_of_hyperparam)
+ l = append(l, as.scalar(p[1,(i+1)]))
}
######################################################
# CHECK FOR VERBOSE FLAG
if(hasVerbose == 1)
l = append(l, FALSE)
- # print("+++++++++++HP++++++++++++++")
- # print(toString(l, rows=2))
+
}
confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
@@ -188,6 +194,7 @@ return (Matrix[Double] X)
# print("recreated data \n"+toString(X, rows = 20))
}
+
#######################################################################
# Wrapper of transformencode OHE call, to call inside eval as a function
# Inputs: The input dataset X, and mask of the columns
@@ -205,3 +212,90 @@ return (Matrix[Double] dX_train) {
[dX_train, dM] = transformencode(target=as.frame(X), spec=jspecDC);
}
+
+
+
+#######################################################################
+# Wrapper of imputeByFD OHE call, to call inside eval as a function
+# Inputs: The input dataset X, and mask of the columns and threshold value
+# Output: filled matrix X
+#######################################################################
+
+imputeByFd = function(Matrix[Double] X, Matrix[Double] FD, Double threshold)
+return (Matrix[Double] X_filled)
+{
+
+ for(i in 1: nrow(FD))
+ {
+ for(j in 1:ncol(FD)) {
+ if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) &
(sum(FD[, j]) != nrow(FD)))
+ X = imputeByFD(X, i, j, threshold, FALSE)
+ }
+ }
+ X_filled = X
+}
+
+#######################################################################
+# Wrapper of na_lof to call inside eval as a function
+# Output: filled matrix X
+#######################################################################
+
+forward_fill = function(Matrix[Double] X, Boolean op, Boolean verbose)
+return (Matrix[Double] X_filled)
+{
+ option = ifelse(op, "locf", "nocb")
+ X_filled = na_locf(X=X, option=option, verbose=verbose)
+}
+
+
+
+# smote wrapper for doing relative over-sampling
+SMOTE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Integer remainingRatio, Boolean verbose)
+return (Matrix[Double] XY)
+{
+ XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
+ synthesized = matrix(0,0,0) # initialize variable
+ # get the class count
+ classes = table(XY[, 1], 1)
+ start_class = 1
+ end_class = 0
+ k = table(XY[, 1], 1)
+ getMax = max(k)
+ maxKIndex = as.scalar(rowIndexMax(t(k)))
+ outSet = matrix(0, 0, ncol(XY))
+ remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100 -
(remainingRatio%%100)),
+ remainingRatio-(remainingRatio%%100))
+ for(i in 1: nrow(k)) {
+ end_class = end_class + as.scalar(classes[i])
+ class_t = XY[start_class:end_class, ]
+ if((i != maxKIndex)) {
+ synthesized = smote(class_t[, 2:ncol(XY)], mask, remainingRatio, 1,
FALSE)
+ synthesized = cbind(matrix(as.scalar(class_t[2,1]), nrow(synthesized),
1), synthesized)
+ outSet = rbind(outSet, synthesized)
+ }
+ start_class = end_class + 1
+ }
+
+ XY = rbind(XY, synthesized)
+ Y = XY[, 1]
+ X = XY[, 2:ncol(XY)]
+ XY = cbind(X,Y)
+ classes = table(Y, 1)
+}
+
+
+
+
+########################################################
+# The function will replace the null with default values
+########################################################
+fillDefault = function(Matrix[Double] X)
+return(Matrix[Double] X){
+ defaullt = round(colMaxs(X) - colMins(X))
+ Mask = is.na(X)
+ X = replace(target=X, pattern=NaN, replacement=0)
+ Mask = Mask * defaullt
+ X = X + Mask
+}
+
+
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index eddbd73..3372886 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -54,7 +54,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
expected number of columns > 1 found: "+ncol(X))
if(ncol(cMask) != ncol(X))
- stop("Dimension mismatch: the columns in X != columns in mask")
+ stop("MICE Dimension mismatch: the columns in X != columns in mask")
lastIndex = ncol(X);
@@ -204,8 +204,9 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
colDist= function(Matrix[Double] X, Matrix[Double] mask)
return (Matrix[Double] dist){
+
dist = matrix(1, 1, ncol(X))
- X = replace(target=X, pattern=0, replacement=min(X))
+ X = replace(target=X, pattern=0, replacement=max(X)+1)
parfor(i in 1:ncol(X))
{
if(as.scalar(mask[,i]) == 1)
@@ -214,5 +215,6 @@ return (Matrix[Double] dist){
dist[1, i] = sum(distT != 0)
}
}
+
}
diff --git a/scripts/builtin/multiLogRegPredict.dml
b/scripts/builtin/multiLogRegPredict.dml
index a2c7e8c..a7bbc23 100644
--- a/scripts/builtin/multiLogRegPredict.dml
+++ b/scripts/builtin/multiLogRegPredict.dml
@@ -51,7 +51,7 @@ m_multiLogRegPredict = function(Matrix[Double] X,
Matrix[Double] B, Matrix[Doubl
}
if(ncol(X) < nrow(B)-1)
stop("multiLogRegPredict: mismatching ncol(X) and nrow(B): "+ncol(X)+"
"+nrow(B));
-
+ accuracy = 0.0 # initialize variable
beta = B[1:ncol(X), ];
intercept = ifelse(ncol(X)==nrow(B), matrix(0,1,ncol(B)), B[nrow(B),]);
linear_terms = X %*% beta + matrix(1,nrow(X),1) %*% intercept;
diff --git a/scripts/builtin/pca.dml b/scripts/builtin/pca.dml
index 1cd7cfd..3054d4c 100644
--- a/scripts/builtin/pca.dml
+++ b/scripts/builtin/pca.dml
@@ -26,7 +26,7 @@
# NAME TYPE DEFAULT MEANING
#
---------------------------------------------------------------------------------------------
# X Matrix --- Input feature matrix
-# K Int 2 Number of reduced dimensions (i.e., columns)
+# K Int 2 Number of reduced dimensions (i.e., columns)
# Center Boolean TRUE Indicates whether or not to center the feature matrix
# Scale Boolean TRUE Indicates whether or not to scale the feature matrix
@@ -41,6 +41,11 @@
m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean
scale=TRUE)
return (Matrix[Double] Xout, Matrix[Double] Mout, Matrix[Double] Centering,
Matrix[Double] ScaleFactor)
{
+ if(K > ncol(X)) {
+ print("PCA: invalid parameter value, the value of k should not be greater
than the no. of columns in X ")
+ print("setting k = ncol(X)")
+ K = ncol(X)
+ }
N = nrow(X);
D = ncol(X);
diff --git a/scripts/builtin/splitBalanced.dml
b/scripts/builtin/splitBalanced.dml
index 4428443..32b87d7 100644
--- a/scripts/builtin/splitBalanced.dml
+++ b/scripts/builtin/splitBalanced.dml
@@ -63,6 +63,7 @@ return (Matrix[Double] X_train, Matrix[Double] y_train,
Matrix[Double] X_test,
print("train ratio \n"+toString(classes_ratio_train))
print("test ratio \n"+toString(classes_ratio_test))
}
+
for(i in 1:nrow(classes))
{
end_class = end_class + as.scalar(classes[i])
diff --git a/scripts/pipelines/properties/param.csv
b/scripts/pipelines/properties/param.csv
index 1ab4218..c533e07 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -1,16 +1,16 @@
-name,param_no,maskFlag,yFlag,verboseFlag,dataFlag,dataType,ranges,st1,en1,st2,en1,en2,st3,en3
-outlierByIQR,3,0,0,1,0,FP,INT,INT,1,5,1,2,1,10
-outlierBySd,3,0,0,1,0,FP,INT,INT,1,5,2,2,1,10
-winsorize,0,0,0,1,0,,,,,,,,,
-imputeByMean,0,1,0,0,2,,,,,,,,,
-imputeByMedian,0,1,0,0,2,,,,,,,,,
-mice,2,1,0,1,2,INT,FP,1,3,0.5,0.9,,,
-abstain,1,0,1,1,2,FP,0.6,0.8,,,,,,
-SMOTE,1,1,1,1,2,INT,100,200,,,,,,
-downSample,0,0,1,0,2,,,,,,,,,
-pca,3,0,0,0,2,INT,BOOL,BOOL,1,10,0,1,0,0
-fillDefault,0,0,0,0,2,,,,,,,,,
-dummycoding,0,1,0,0,2,,,,,,,,,
-scale,2,0,0,0,0,BOOL,BOOL,0,1,0,1,,,
-forward_fill,1,0,0,1,0,BOOL,0,1,,,,,,
-imputeByFd,1,1,0,0,2,FP,0.7,1,,,,,,
\ No newline at end of file
+name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,dataType,ranges,st1,en1,st2,en1,en2,st3,en3
+outlierByIQR,3,0,0,0,1,0,FP,INT,INT,1,5,1,2,1,10
+outlierBySd,3,0,0,0,1,0,FP,INT,INT,1,5,2,2,1,10
+winsorize,0,0,0,0,1,0,,,,,,,,,
+imputeByMean,0,1,0,0,0,2,,,,,,,,,
+imputeByMedian,0,1,0,0,0,2,,,,,,,,,
+mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1.0,,,
+abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,
+SMOTE,1,1,0,1,1,2,INT,100,200,,,,,,
+downSample,0,0,0,1,0,2,,,,,,,,,
+pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0
+fillDefault,0,0,0,0,0,2,,,,,,,,,
+dummycoding,0,1,0,0,0,2,,,,,,,,,
+scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,
+forward_fill,1,0,0,0,1,0,BOOL,0,1,,,,,,
+imputeByFd,1,0,1,0,0,2,FP,0.55,1,,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/properties/primitives.csv
b/scripts/pipelines/properties/primitives.csv
index 19eb7d8..98f3874 100644
--- a/scripts/pipelines/properties/primitives.csv
+++ b/scripts/pipelines/properties/primitives.csv
@@ -2,5 +2,4 @@ OTLR,MVI,NR,CI,DIM,DUMMY,SCALE
winsorize,imputeByMean,abstain,SMOTE,pca,dummycoding,scale
outlierBySd,imputeByMedian,,,,,
outlierByIQR,mice,,,,,
-,fillDefault,,,,,
-,forward_fill,,,,,
\ No newline at end of file
+,fillDefault,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/logicalFunc.dml
b/scripts/pipelines/scripts/logicalFunc.dml
index 8f5365e..0ddc1bb 100644
--- a/scripts/pipelines/scripts/logicalFunc.dml
+++ b/scripts/pipelines/scripts/logicalFunc.dml
@@ -23,11 +23,10 @@
source("scripts/pipelines/scripts/utils.dml") as utils;
# incomplete implementation of automatic logical pipelines
-generateLogicalSeed = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] mask)
+generateLogicalSeed = function(Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] mask, String target)
return(Frame[String] logical){
- # detection =
- logical = as.frame("")
+ logical = frame(data=["NULL"], rows=1, cols=1, schema=["STRING"])
no_of_mv = sum(is.na(X))
X = replace(target= X, pattern = NaN, replacement = 0)
@@ -42,10 +41,13 @@ return(Frame[String] logical){
count3sdplus = sum(X > (colMean + 3*colSd ))
count3sdminus = sum(X < (colMean - 3*colSd ))
outliers = count3sdplus + count3sdminus
- ctab = table(Y, 1)
- minCatPer = min(ctab) / nrow(ctab)
- maxCat = max(ctab) / nrow(ctab)
-
+ minCat = 0.0 # initialize variables
+ maxCat = 0.0
+ if(target != "compare") {
+ ctab = table(Y, 1)
+ minCat = min(ctab)
+ maxCat = max(ctab)
+ }
mv_to_data_ratio = no_of_mv/(nrow(X) * ncol(X))
out_to_data_ratio = outliers/ (nrow(X) * ncol(X))
@@ -53,17 +55,22 @@ return(Frame[String] logical){
logical = cbind(logical, as.frame("MVI"))
if(out_to_data_ratio > 0.1)
logical = cbind(logical, as.frame("OTLR"))
- if(maxVal - minVal > 1000)
- logical = cbind(logical, as.frame("SCALE"))
- if((maxCat - minCatPer) > 0.3)
- logical = cbind(logical, as.frame("CI"))
- if(sum(mask) > 0) {
- logical = cbind(logical, as.frame("DUMMY"))
- if(sum(distinctCategories) > 5*ncol(X))
- logical = cbind(logical, as.frame("DIM"))
-
- logical = logical[, 2:ncol(logical)]
+ if(target != "compare") {
+ if(maxVal - minVal > 1000 )
+ logical = cbind(logical, as.frame("SCALE"))
+ if((maxCat - minCat) > (minCat/2))
+ logical = cbind(logical, as.frame("CI"))
+ if(sum(mask) > 0) {
+ logical = cbind(logical, as.frame("DUMMY"))
+ if(sum(distinctCategories) > 5*ncol(X))
+ logical = cbind(logical, as.frame("DIM"))
+ }
}
+
+ if(ncol(logical) == 1)
+ logical = frame(["OTLR", "MVI"], rows=1, cols=2, schema=["STRING",
"STRING"])
+ else
+ logical = logical[, 2:ncol(logical)]
}
@@ -73,26 +80,28 @@ return(Frame[Unknown] transformLogical) {
transformLogical = frame(0, rows=3, cols= ncol(seed)+2)
# case 1: MVI and OTLR
- if(as.scalar(seed[1,1]) == "MVI" & as.scalar(seed[1,2]) == "OTLR")
+ if(ncol(seed) > 1)
{
- # t1: swap MV and OTLR
- transformLogical[2,1] = seed[1,2]
- transformLogical[2,2] = seed[1,1]
- transformLogical[2, 3:ncol(seed)] = seed[1,3:ncol(seed)]
+ if(as.scalar(seed[1,1]) == "MVI" & as.scalar(seed[1,2]) == "OTLR") {
+ # t1: swap MV and OTLR
+ transformLogical[2,1] = seed[1,2]
+ transformLogical[2,2] = seed[1,1]
+ transformLogical[2, 3:ncol(seed)] = seed[1,3:ncol(seed)]
- # t2: if the sequence is MVI, OTLR then introduce an MVI after to avoid
nulls
-
- transformLogical[3,1:2] = seed[1,1:2]
- transformLogical[3,3] = seed[1,1]
- transformLogical[3, 4:ncol(seed)] = seed[1,3:ncol(seed)]
- }
- # case 2: OTLR
- else if(as.scalar(seed[1, 1]) == "OTLR")
- {
- # if first operation is OTLR then add a MVI to fill in MVs introduced by
OTLR
- transformLogical[2,1] = seed[1, 1]
- transformLogical[2,2] = "MVI"
- transformLogical[2, 3:ncol(seed)] = seed[1,2:ncol(seed)]
+
+ # t2: if the sequence is MVI, OTLR then introduce an MVI after to avoid
null
+ transformLogical[3,1:2] = seed[1,1:2]
+ transformLogical[3,3] = seed[1,1]
+ transformLogical[3, 4:ncol(seed)] = seed[1,3:ncol(seed)]
+ }
+ # case 2: OTLR
+ else if(as.scalar(seed[1, 1]) == "OTLR" & as.scalar(seed[1, 2]) != "MVI" )
+ {
+ # if first operation is OTLR then add a MVI to fill in MVs introduced by
OTLR
+ transformLogical[2,1] = seed[1, 1]
+ transformLogical[2,2] = "MVI"
+ transformLogical[2, 3:ncol(seed)] = seed[1,2:ncol(seed)]
+ }
}
transformLogical[1, 1:ncol(seed)] = seed
transformLogical = map(transformLogical, "var -> var.replace(\"0\", \"\")")
diff --git a/scripts/pipelines/scripts/utils.dml
b/scripts/pipelines/scripts/utils.dml
index 1214d73..aa9338a 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -114,11 +114,11 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY,
Double ratio)
}
}
-#######################################################################
-# Wrapper of transformencode OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and mask of the columns
-# Output: OHEd matrix X
-#######################################################################
+# #######################################################################
+# # Wrapper of transformencode OHE call, to call inside eval as a function
+# # Inputs: The input dataset X, and mask of the columns
+# # Output: OHEd matrix X
+# #######################################################################
dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
return (Matrix[Double] dX_train) {
@@ -132,6 +132,26 @@ return (Matrix[Double] dX_train) {
}
+# # function to classify the data using cross validation
+# fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double]
mask, Matrix[Double] MLhp,
+ # Double testAccuracy, Boolean isWeighted, Integer cv=3)
+ # return (Double accuracy)
+# {
+
+ # if(max(Y) == min(Y)) {
+ # print("Y contains only one class")
+ # accuracy = as.double(0)
+ # }
+ # else {
+ # print("STARTING "+cv+" CROSS VALIDATIONS")
+ # # do the k = 3 cross validations
+ # accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+ # accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+ # acc = colMeans(accuracyMatrix)
+ # accuracy = as.scalar(acc[1,1])
+ # print("validation accuracy "+accuracy)
+ # }
+# }
@@ -155,13 +175,13 @@ return (Matrix[Double] dX_train) {
# }
-# # ######################################################################
-# # # # Function for cross validation using hold out method
-# # # # Inputs: The input dataset X, Y and the value of k validation, mask of
the
-# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
-# # # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # # Output: It return a matrix having the accuracy of each fold.
-# # ######################################################################
+# # # ######################################################################
+# # # # # Function for cross validation using hold out method
+# # # # # Inputs: The input dataset X, Y and the value of k validation, mask
of the
+# # # # # dataset for OHE of categorical columns, vector of ML
hyper-parameters identified
+# # # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # # Output: It return a matrix having the accuracy of each fold.
+# # # ######################################################################
# crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] mask,
# Matrix[Double] MLhp, Boolean isWeighted)
@@ -190,7 +210,6 @@ return (Matrix[Double] dX_train) {
# end = end + idx
# class_j = data[start:end, ]
-
# start_i = as.scalar(fold_idxes[j, 1]);
# end_i = as.scalar(fold_idxes[j, 2])
@@ -218,8 +237,117 @@ return (Matrix[Double] dX_train) {
# accuracy = getAccuracy(testy, yhat, isWeighted)
# accuracyMatrix[i] = accuracy
# }
-
# }
+######################################################################
+# Compare a repaired dataset against the ground truth: computes the
+# precision of the applied corrections, i.e., the fraction of cells
+# changed by the pipeline (dirtyX vs. fixedX) that were actually in
+# need of correction and now match the clean data within a 0.1
+# tolerance. (Banner fixed: previous text described cross validation.)
+# Inputs: dirtyX - original dirty data, cleanX - ground-truth data,
+#         fixedX - pipeline-repaired data,
+#         mask - categorical mask (currently unused here)
+# Output: precision, lower-bounded by 0.001
+######################################################################
+
+compareValue = function(Matrix[double] dirtyX, Matrix[double] cleanX,
+  Matrix[Double] fixedX, Matrix[Double] mask)
+return (Double precision)
+{
+  dirtyX = replace(target= dirtyX, pattern=NaN, replacement=0)
+  cleanX = replace(target= cleanX, pattern=NaN, replacement=0)
+  fixedX = replace(target= fixedX, pattern=NaN, replacement=0)
+  correctionsRequired = dirtyX != cleanX
+  correctionsMade = (dirtyX != fixedX)
+  allCorrections_ = sum(correctionsMade)
+  match = (abs(cleanX - fixedX) < 0.1) * correctionsRequired
+  # BUGFIX: avoid division by zero when the pipeline changed no cells
+  if(allCorrections_ == 0)
+    precision = 0.001
+  else
+    precision = max(0.001, sum(match) / allCorrections_)
+  print("---------------------------------precision of corrections is "+toString(precision))
+}
+# Constraints over hyper-parameters: tightens the allowed [minVal, maxVal]
+# range of parameter paraNo of the operation at position index, depending
+# on the operation that follows it in the pipeline:
+# 1. if the next op cannot tolerate NaNs (pca/abstain/SMOTE), the current
+#    op must not leave NaNs in the data
+# 2. if the next op is mice, the current op should not replace NaNs with
+#    zeros (mice needs the NaNs to impute them)
+verifyHp = function(Integer index, Frame[Unknown] pip, Double minVal, Double maxVal, Integer paraNo)
+return (Double minVal, Double maxVal) {
+  op = as.scalar(pip[1,index])
+  isOutlierOp = (op == "outlierBySd" | op == "outlierByIQR")
+  if(isOutlierOp & index < ncol(pip) & paraNo == 2)
+  {
+    nextOp = as.scalar(pip[1, index + 1])
+    # cap the repair argument so no NaNs remain for NaN-intolerant ops
+    if(nextOp == "pca" | nextOp == "abstain" | nextOp == "SMOTE")
+      maxVal = 1.0
+    # force a NaN-preserving repair so mice can impute the values
+    if(nextOp == "mice")
+      minVal = 2.0
+  }
+}
+
+
+#####################################
+# Check whether the pipeline has at least one operation with tunable
+# hyper-parameters; if none, spending additional resource iterations
+# is wasteful and the pipeline should be executed only once.
+######################################
+isResourceOptimal = function(List[Unknown] param, Boolean verbose)
+return(Boolean validForResources)
+{
+  count = 0
+  for(i in 1:length(param))
+  {
+    hp = as.matrix(param[i])
+    # an op with more than 4 columns of hp metadata has tunable params
+    count += ifelse(ncol(hp) > 4, 1, 0)
+  }
+  validForResources = count > 0
+}
+
+
+###############################################################################################
+# Collect dataset meta-features (statistics) so that, together with pipelines
+# and their accuracies, a model could later be trained to predict good
+# pipelines without full enumeration.
+# Output: a 1 x 14 feature vector (see per-slot comments below)
+###############################################################################################
+gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
+return (Matrix[Double] features)
+{
+  features = matrix(0, rows = 1, cols= 14)
+  features[1, 1] = sum(is.na(X))            # number of missing values
+  X = replace(target= X, pattern = NaN, replacement = 0)
+  # restrict statistics to the numerical columns
+  num = removeEmpty(target=X, margin="cols", select=(mask == 0))
+  features[1, 2] = min(num)                 # global minimum
+  features[1, 3] = max(num)                 # global maximum
+  features[1, 4] = mean(colMins(num))       # average column minimum
+  features[1, 5] = mean(colMaxs(num))       # average column maximum
+  features[1, 6] = sum(mask)                # number of categorical features
+  features[1, 7] = sum(mask == 0)           # number of numerical features
+  features[1, 8] = mean(num)                # overall mean
+  # count of 3-sigma outliers over the numerical columns
+  colSd = colSds(num)
+  count3sdplus = sum(num > (colMeans(num) + 3*colSd))
+  count3sdminus = sum(num < (colMeans(num) - 3*colSd))
+  features[1, 9] = count3sdplus + count3sdminus
+  # no. of columns a one-hot encoding would produce (colMax of a recoded
+  # column equals its number of distinct categories)
+  features[1, 10] = sum(colMaxs(X) * mask)
+  if(target != "compare")
+  {
+    ctab = table(Y, 1)
+    features[1, 11] = nrow(ctab)            # number of classes
+    minCat = min(ctab) / nrow(ctab)
+    maxCat = max(ctab) / nrow(ctab)
+    # class imbalance 1=YES, 0=NO
+    features[1, 12] = ifelse((maxCat - minCat) > 0.3, 1, 0)
+  }
+  else
+  {
+    # no labels in the "compare" application
+    features[1, 11] = 0
+    features[1, 12] = 0
+  }
+  features[1, 13] = nrow(X)                 # number of rows
+  features[1, 14] = ncol(X)                 # number of columns
+}
\ No newline at end of file
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
similarity index 82%
copy from
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
copy to
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
index e74662e..0ad15e0 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
@@ -27,10 +27,10 @@ import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
-public class CleaningTest extends AutomatedTestBase {
- private final static String TEST_NAME1 = "mainScript";
+public class CleaningTestClassification extends AutomatedTestBase {
+ private final static String TEST_NAME1 = "testClassification";
private final static String TEST_NAME2 = "compareAccuracy";
- private final static String TEST_CLASS_DIR = SCRIPT_DIR +
CleaningTest.class.getSimpleName() + "/";
+ private final static String TEST_CLASS_DIR = SCRIPT_DIR +
CleaningTestClassification.class.getSimpleName() + "/";
protected static final String RESOURCE =
SCRIPT_DIR+"functions/pipelines/";
protected static final String DATA_DIR = RESOURCE+"data/";
@@ -51,10 +51,10 @@ public class CleaningTest extends AutomatedTestBase {
}
- @Ignore
+ @Test
public void testCP1() {
- runFindPipelineTest(1.0, 5,10, 2,
- true, Types.ExecMode.SINGLE_NODE);
+ runFindPipelineTest(0.5, 5,10, 2,
+ true, "classification", Types.ExecMode.SINGLE_NODE);
}
@Test
@@ -63,7 +63,7 @@ public class CleaningTest extends AutomatedTestBase {
}
private void runFindPipelineTest(Double sample, int topk, int
resources, int crossfold,
- boolean weightedAccuracy, Types.ExecMode et) {
+ boolean weightedAccuracy, String target, Types.ExecMode et) {
setOutputBuffering(true);
String HOME = SCRIPT_DIR+"functions/pipelines/" ;
@@ -71,10 +71,11 @@ public class CleaningTest extends AutomatedTestBase {
try {
loadTestConfiguration(getTestConfiguration(TEST_NAME1));
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
-
- programArgs = new String[] {"-stats", "-exec",
"singlenode", "-args", DIRTY, META, PRIMITIVES,
- PARAM, String.valueOf(sample),
String.valueOf(topk), String.valueOf(resources),
- String.valueOf(crossfold),
String.valueOf(weightedAccuracy), output("O"), OUTPUT };
+ programArgs = new String[] {"-stats", "-exec",
"singlenode", "-nvargs", "dirtyData="+DIRTY, "metaData="+META,
+ "primitives="+PRIMITIVES, "parameters="+PARAM,
"sampleSize="+String.valueOf(sample),
+ "topk="+String.valueOf(topk),
"rv="+String.valueOf(resources), "cv="+String.valueOf(crossfold),
+ "weighted="+ String.valueOf(weightedAccuracy),
"output="+OUTPUT, "target="+target, "cleanData="+CLEAN,
+ "O="+output("O")};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
diff --git
a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
similarity index 63%
rename from
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
rename to
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
index e74662e..36adfbb 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
@@ -27,10 +27,9 @@ import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
-public class CleaningTest extends AutomatedTestBase {
- private final static String TEST_NAME1 = "mainScript";
- private final static String TEST_NAME2 = "compareAccuracy";
- private final static String TEST_CLASS_DIR = SCRIPT_DIR +
CleaningTest.class.getSimpleName() + "/";
+public class CleaningTestCompare extends AutomatedTestBase {
+ private final static String TEST_NAME1 = "testCompare";
+ private final static String TEST_CLASS_DIR = SCRIPT_DIR +
CleaningTestCompare.class.getSimpleName() + "/";
protected static final String RESOURCE =
SCRIPT_DIR+"functions/pipelines/";
protected static final String DATA_DIR = RESOURCE+"data/";
@@ -47,23 +46,17 @@ public class CleaningTest extends AutomatedTestBase {
@Override
public void setUp() {
addTestConfiguration(TEST_NAME1,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
- addTestConfiguration(TEST_NAME2,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2,new String[]{"R"}));
}
- @Ignore
- public void testCP1() {
- runFindPipelineTest(1.0, 5,10, 2,
- true, Types.ExecMode.SINGLE_NODE);
- }
-
@Test
- public void testCP2() {
- runCleanAndCompareTest( Types.ExecMode.SINGLE_NODE);
+ public void testCP1() {
+ runFindPipelineTest(0.5, 5,10, 2,
+ true, "compare", Types.ExecMode.SINGLE_NODE);
}
private void runFindPipelineTest(Double sample, int topk, int
resources, int crossfold,
- boolean weightedAccuracy, Types.ExecMode et) {
+ boolean weightedAccuracy, String target, Types.ExecMode et) {
setOutputBuffering(true);
String HOME = SCRIPT_DIR+"functions/pipelines/" ;
@@ -71,35 +64,11 @@ public class CleaningTest extends AutomatedTestBase {
try {
loadTestConfiguration(getTestConfiguration(TEST_NAME1));
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
-
- programArgs = new String[] {"-stats", "-exec",
"singlenode", "-args", DIRTY, META, PRIMITIVES,
- PARAM, String.valueOf(sample),
String.valueOf(topk), String.valueOf(resources),
- String.valueOf(crossfold),
String.valueOf(weightedAccuracy), output("O"), OUTPUT };
-
- runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
-
- //expected loss smaller than default invocation
-
Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
- }
- finally {
- resetExecMode(modeOld);
- }
- }
-
- private void runCleanAndCompareTest( Types.ExecMode et) {
- setOutputBuffering(true);
- String HOME = SCRIPT_DIR+"functions/pipelines/";
- Types.ExecMode modeOld = setExecMode(et);
- try {
- loadTestConfiguration(getTestConfiguration(TEST_NAME2));
- fullDMLScriptName = HOME + TEST_NAME2 + ".dml";
-
- programArgs = new String[] {"-stats", "-exec",
- "singlenode", "-args", DIRTY, CLEAN, META,
OUTPUT, output("O")};
+ programArgs = new String[] {"-stats", "-exec",
"singlenode", "-nvargs", "dirtyData="+DIRTY, "metaData="+META,
+ "primitives="+PRIMITIVES, "parameters="+PARAM,
"topk="+String.valueOf(topk), "rv="+String.valueOf(resources),
+ "output="+OUTPUT, "target="+target,
"cleanData="+CLEAN, "O="+output("O")};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
-
- //expected loss smaller than default invocation
Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
}
finally {
diff --git a/src/test/scripts/functions/pipelines/compareAccuracy.dml
b/src/test/scripts/functions/pipelines/compareAccuracy.dml
index a8aa0d8..5cd7cda 100644
--- a/src/test/scripts/functions/pipelines/compareAccuracy.dml
+++ b/src/test/scripts/functions/pipelines/compareAccuracy.dml
@@ -49,11 +49,12 @@ O = read($2, data_type="frame", format="csv", header=FALSE,
metaInfo = read($3, data_type="frame", format="csv", header=FALSE);
input = $4
-pip = read(input+"pip.csv", data_type="frame", format="csv", header=FALSE);
-hp = read(input+"hp.csv", data_type="matrix", format="csv", header=FALSE);
+pip = read(input+"pipelines.csv", data_type="frame", format="csv",
header=FALSE);
+hp = read(input+"hyperparams.csv", data_type="matrix", format="csv",
header=FALSE);
getSchema = metaInfo[1, 2:ncol(metaInfo)]
getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
# # 1. dropInvalid function will remove the values which are not the part
# # of the column data type
@@ -80,32 +81,54 @@ eX = eX[, 1:ncol(eX) - 1]
# strip the mask of class label
getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
-
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
# construct hyper-parameters
ls = list();
i = 1; k = 1
-# take the oversampling out from the test processing
-pip1 = as.frame("")
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1),
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
+FD = FD > 0
# construct the parameter list for best hyper-parameters if the oversampling
technique is part of
# pipeline then take it out because oversampling is not applied on test dataset
# this condition is unnecessary here in this case because the input dataset is
balanced and
# instead of diving the dataset into train/test I am doing cross validations
-while(k <= ncol(pip))
-{
- end = as.integer(i+as.integer(as.scalar(hp[1,i])))
- mat = hp[1, i+1:end]
- i = end + 1
- if(as.scalar(pip[1,k]) != "SMOTE") {
- pip1 = cbind(pip1, pip[1,k] )
- ls = append(ls, mat)
- }
- k = k + 1
-}
+
+print("hp matrix")
+no_of_param = as.scalar(hp[1, 1]) + 1
+hp_width= hp[1, 2:no_of_param]
+hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
+index = 1
+# for(i in 1:ncol(pip))
+# {
+ # no_of_param = as.scalar(hp[1, index])
+ # hp_matrix[i, 1:no_of_param] = hp[1, 2:no_of_param+1]
+ # index = index + no_of_param + 2
+# }
+
+
+
+print(toString(hp_matrix))
+
+# while(k <= ncol(pip))
+# {
+ # end = as.integer(i+as.integer(as.scalar(hp[1,i])))
+ # mat = hp[1, i+1:end]
+ # i = end + 1
+ # if(as.scalar(pip[1,k]) != "SMOTE") {
+ # pip1 = cbind(pip1, pip[1,k] )
+ # ls = append(ls, mat)
+ # }
+ # k = k + 1
+# }
+
+
+print("ncol in X "+ncol(eX))
+print("ncol in mask "+ncol(getMask))
# # clean using best pipeline
-[cX , cY] = executePipeline(pip1[, 2:ncol(pip1)], eX, eY, getMask, ls, 1,
FALSE)
+[cX , cY] = executePipeline(pip[1], eX, eY, getMask, FD, hp_matrix, 5, FALSE)
if(sum(getMask) > 0)
{
diff --git a/src/test/scripts/functions/pipelines/intermediates/acc.csv
b/src/test/scripts/functions/pipelines/intermediates/acc.csv
deleted file mode 100644
index f6b666b..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/acc.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-76.14285714285714
-76.0
-75.85714285714286
-75.85714285714286
-75.85714285714286
diff --git a/src/test/scripts/functions/pipelines/intermediates/hp.csv
b/src/test/scripts/functions/pipelines/intermediates/hp.csv
deleted file mode 100644
index 385ecad..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/hp.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-4.0,1.0,0,0,2.0,6.0,0,0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,5.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,5.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,2.0,1.0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,10.0,1.0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/pip.csv
b/src/test/scripts/functions/pipelines/intermediates/pip.csv
deleted file mode 100644
index 834e793..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/pip.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
diff --git a/src/test/scripts/functions/pipelines/mainScript.dml
b/src/test/scripts/functions/pipelines/mainScript.dml
index 3999e94..5422ae6 100644
--- a/src/test/scripts/functions/pipelines/mainScript.dml
+++ b/src/test/scripts/functions/pipelines/mainScript.dml
@@ -25,16 +25,20 @@ source("scripts/pipelines/scripts/logicalFunc.dml") as
logical;
source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
# read the inputs
-F = read($1, data_type="frame", format="csv", header=FALSE,
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+sample = $sampleSize
+topK = $topk
+resources = $rv
+crossValidations = $cv
+weightedAccuracy = $weighted # accuracy flag
+targetApplicaton = $target # target application (e.g., classification/compare)
+cleanData = read($cleanData, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
-metaInfo = read($2, data_type="frame", format="csv", header=FALSE);
-primitives = read($3, data_type = "frame", format="csv", header= TRUE)
-param = read($4, data_type = "frame", format="csv", header= TRUE)
-sample = $5
-topK = $6
-resources = $7
-crossValidations = $8
-weightedAccuracy = $9 # accuracy flag
if(nrow(metaInfo) < 2)
@@ -45,12 +49,13 @@ if(nrow(metaInfo) < 2)
getSchema = metaInfo[1, 2:ncol(metaInfo)]
getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
# 1. dropInvalid function will remove the values which are not the part
# of the column data type
-X = dropInvalidType(F, getSchema)
- # X = F
+# X = dropInvalidType(F, getSchema)
+ X = F
# 2. encode the categorical data
if(sum(getMask) > 0)
@@ -58,60 +63,108 @@ if(sum(getMask) > 0)
# always recode the label
index = utils::vectorToCsv(getMask)
jspecR = "{ids:true, recode:["+index+"]}"
- [eX, X_meta] = transformencode(target=X, spec=jspecR);
+ if(targetApplicaton == "compare") {
+ [eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
+ cleanX = eX[1:nrow(cleanData)]
+ eX = eX[nrow(cleanData)+1:nrow(eX)]
+ }
+ else
+ [eX, X_meta] = transformencode(target=X, spec=jspecR);
# change the schema to reflect the encoded values
getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
-
+
+
}
# if no categorical value exist then just cast the frame into matrix
else
eX = as.matrix(X)
+
+
# 3. extract the class label
-eY = eX[, ncol(eX)]
-eX = eX[, 1:ncol(eX) - 1]
-
-
-getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
-getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
-
-
+if(targetApplicaton == "classification")
+{
+ eY = eX[, ncol(eX)]
+ eX = eX[, 1:ncol(eX) - 1]
+ getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+ getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class
label
+ getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class
label
+}
+
# get the logical seed
-lgSeed = logical::generateLogicalSeed(eX, eY, getMask)
+if(targetApplicaton == "compare")
+ lgSeed = logical::generateLogicalSeed(eX, as.matrix(0), getMask,
targetApplicaton)
+else
+ lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
allLgs = logical::transformLogical(lgSeed)
-# 4. perform the sampling
-[eX, eY] = utils::doSample(eX, eY, sample)
-
-# 5. get train test and validation set with balanced class distribution
-[X_train, y_train, X_test, y_test] = splitBalanced(eX, eY, 0.7, FALSE)
-
-# 6. find the best hyper parameters for classification algorithm
-# for now only find the best values for intercept and maximum outer iteration
-params = list("reg", "maxi");
-paramRanges = list(10^seq(0,-10), seq(10,100, 10));
-
-
-dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern
= NaN, replacement=0), getMask)
-dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
-dX_train = dX_train[1:nrow(y_train),]
-
-# [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
- # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-opt = matrix("0 100", 1, 2)
-
-# 7. get the cross validated accuracy on dirty dataset (only on training set)
-d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy,
crossValidations)
-# print("dirty accuracy is "+d_accuracy)
-# # [eX, eY] = prioritise(eX, eY, getMask)
-
+d_accuracy = 0
+# 4. perform the sampling
+if(targetApplicaton != "compare") {
+ [eX, eY] = utils::doSample(eX, eY, sample)
+
+ # 5. get train test and validation set with balanced class distribution
+ [X_train, y_train, X_test, y_test] = splitBalanced(eX, eY, 0.7, FALSE)
+
+ # 6. find the best hyper parameters for classification algorithm
+ # for now only find the best values for intercept and maximum outer iteration
+ params = list("reg", "maxi");
+ paramRanges = list(10^seq(0,-10), seq(10,100, 10));
+
+ # if(sum(getMask) > 0)
+ # {
+ # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test),
pattern = NaN, replacement=0), getMask)
+ # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
+ # dX_train = dX_train[1:nrow(y_train),]
+ # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
+ # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+# }
+ # else
+ # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
+ # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+ opt = matrix("0 100", 1, 2)
+
+ # 7. get the cross validated accuracy on dirty dataset (only on training set)
+ d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy,
crossValidations)
+ # print("dirty accuracy is "+d_accuracy)
+ # # [eX, eY] = prioritise(eX, eY, getMask)
+}
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1),
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
+FD = FD > 0
+
+logical1 = frame(["4", "MVI", "SCALE", "DUMMY", "DIM", "0", "0", "0"],
rows=1, cols=8)
+logical2 = frame(["2", "MVI", "DUMMY", "0", "0", "0", "0", "0"], rows=1,
cols=8)
+logical3 = frame(["3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0"], rows=1,
cols=8)
+logical4 = frame(["7", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"],
rows=1, cols=8)
+logical5 = frame(["7", "MVI", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM"],
rows=1, cols=8)
+logical6 = frame(["6", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM", "0"],
rows=1, cols=8)
+
+log = rbind(logical1, logical2)
+log = rbind(log, logical3)
+log = rbind(log, logical4)
+log = rbind(log, logical5)
+log = rbind(log, logical6)
+print("logical permutations "+toString(log))
+
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=crossValidations,
wAccuracy=weightedAccuracy,
+ dirtyAcc = d_accuracy, mlHp = opt, cleanData = as.matrix(0))
+
+
+# val = compareValue(replace(target=eX, pattern=NaN, replacement=0), getMask)
+parfor(i in 1:nrow(log))
+{
+ lv = as.integer(as.scalar(log[i, 1])) + 1
+ [pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY,
metaList=metaList, targetList=targetClassification, lp=log[i, 2:lv],
+ primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+}
-[pip, hp, acc] = bandit(X_train=X_train, Y_train=y_train, mask=getMask,
MLhp=opt,
- schema=getSchema, lp=allLgs, primitives=primitives, param=param, k=topK,
testAccuracy=d_accuracy,
- isWeighted=weightedAccuracy, R=resources, cv=crossValidations, verbose=TRUE);
+output = $output
+write(features, output+"/features.csv", format="csv")
if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
@@ -132,35 +185,29 @@ print(toString(hp))
print("best accuracy")
print(toString(acc))
-
-clean_accuracy = testBestPipeline(pip=pip[1,], hp=hp[1,], X_train=X_train,
y_train=y_train,
- X_test=X_test, y_test=y_test, cmask=getMask, MLhp=opt,
valAcc=as.scalar(acc[1,1]), dirAcc=d_accuracy,
- isWeighted=weightedAccuracy)
-
-
-# # # dataPath = $10
-# # # # write the preprocessing
-# # # trainset = cbind(X_train, y_train)
-# # # testset = cbind(X_test, y_test)
-# # # write(trainset, dataPath+"/train.csv" , format="csv", sep=",")
-# # # write(testset, dataPath+"/test.csv", format="csv", sep=",")
-# # # write(opt, dataPath+"/mlHp.csv", format="csv", sep=",")
-# # # write(allLgs, $11, format="csv")
+# if(targetApplicaton != "compare")
+ # clean_accuracy = testBestPipeline(pip=pip[1,], hp=hp[1,], X_train=X_train,
y_train=y_train,
+ # X_test=X_test, y_test=y_test, cmask=getMask, FD=FD, MLhp=opt,
valAcc=as.scalar(acc[1,1]), dirAcc=d_accuracy,
+ # isWeighted=weightedAccuracy)
+# else
+clean_accuracy = as.scalar(acc[1,1])
result = d_accuracy < clean_accuracy
-print("reult satisfied ------------"+result)
-write(result, $10, format="text")
+print("result satisfied ------------"+result)
+accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
-output = $11
-if(result) {
- write(pip, output+"pip.csv", format="csv")
- write(hp, output+"hp.csv", format="csv")
- write(acc, output+"acc.csv", format="csv")
-}
+
+tmp_hp = cbind(matrix(NaN, nrow(hp), 1), hp)
+writeResult = cbind(pip, as.frame(tmp_hp))
+writeResult = cbind(writeResult , as.frame(acc))
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
@@ -178,92 +225,94 @@ classifyDirty = function(Matrix[Double] Xtrain,
Matrix[Double] ytrain, Matrix[Do
{
# # classify without cleaning fill with default values 1
Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
- dX_train = utils::dummycoding(Xtrain, mask)
+ if(sum(mask) > 0)
+ Xtrain = utils::dummycoding(Xtrain, mask)
# print("rows in data ")
# print(nrow(dX_train))
# print("column in data")
# print(ncol(dX_train))
- accuracy = crossV(dX_train, ytrain, cv, mask, opt, isWeighted)
+ accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
accuracy = mean(accuracy)
print("cross validated dirty accuracy "+accuracy)
}
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] mask,
- Matrix[Double] MLhp, Boolean isWeighted)
-return (Matrix[Double] accuracyMatrix)
-{
-
- accuracyMatrix = matrix(0, k, 1)
-
- dataList = list()
- testL = list()
- data = order(target = cbind(y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
- classes = table(data[, 1], 1)
- ins_per_fold = classes/k
- start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
- fold_idxes = cbind(start_fold, ins_per_fold)
-
- start_i = 0; end_i = 0; idx_fold = 1;;
- for(i in 1:k)
- {
- fold_i = matrix(0, 0, ncol(data))
- start=0; end=0;
- for(j in 1:nrow(classes))
- {
- idx = as.scalar(classes[j, 1])
- start = end + 1;
- end = end + idx
- class_j = data[start:end, ]
- start_i = as.scalar(fold_idxes[j, 1]);
- end_i = as.scalar(fold_idxes[j, 2])
+lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
+return (Matrix[Double] loss) {
+ [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
+ loss = as.matrix(1 - (acc/100))
+ # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
+}
- fold_i = rbind(fold_i, class_j[start_i:end_i, ])
- }
- dataList = append(dataList, fold_i)
- fold_idxes[, 1] = fold_idxes[, 2] + 1
- fold_idxes[, 2] += ins_per_fold
- while(FALSE){}
- }
+# testBestPipeline = function(Frame[Unknown] pip, Matrix[Double] hp,
Matrix[Double] X_train, Matrix[Double] y_train,
+ # Matrix[Double] X_test, Matrix[Double] y_test, Matrix[Double] cmask,
Matrix[Double] FD, Matrix[Double] MLhp,
+ # Double valAcc, Double dirAcc, Boolean isWeighted)
+ # return (Double result) {
+ # print("hp "+toString(hp))
+ # lsTrain = list();
+ # lsTest = list();
+ # i = 1; k = 1
+ # trRow=nrow(X_train)
+ # # take the oversampling out from the test processing
+ # pip1 = as.frame("")
+ # # construct the parameter list for best hyper-parameters
+ # while(k <= ncol(pip))
+ # {
+ # end = as.integer(i+as.integer(as.scalar(hp[1,i])))
+ # mat = hp[1, i+1:end]
+ # i = end + 1
+ # lsTrain = append(lsTrain, mat)
+ # if(as.scalar(pip[1,k]) != "SMOTE") {
+ # pip1 = cbind(pip1, pip[1,k] )
+ # lsTest = append(lsTest, mat)
+ # }
+ # k = k + 1
+ # }
+
+ # # clean using best pipeline and train model
+ # [X_train, y_train] = executePipeline(pip, X_train, y_train, cmask, FD,
lsTrain, 1, FALSE)
+ # if(ncol(pip1) > 1)
+ # [X_test, y_test] = executePipeline(pip1[, 2:ncol(pip1)], X_test, y_test,
cmask, FD, lsTest, 1, FALSE)
+ # # X_train_clean = X_train[1:trRow, ]
+ # # y_train_clean = Y_train[1:trRow, ]
+ # # X_test_clean = X_train[trRow+1:nrow(X_train), ]
+ # # y_test_clean = Y_train[trRow+1:nrow(X_train), ]
+
+ # # classify after cleaning
+ # betas = multiLogReg(X=X_train, Y=y_train, icpt=1,
+ # reg=as.scalar(MLhp[1,1]), tol= 1e-9, maxi=as.scalar(MLhp[1,2]),
+ # maxii= 50, verbose=FALSE);
+
+ # [c_prob, c_yhat, c_accuracy] = multiLogRegPredict(X_test, betas, y_test,
FALSE)
+ # c_accuracy = getAccuracy(y_test, c_yhat, isWeighted)
+ # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=c_yhat, Y=y_test)
+
+
+ # print("Actual Records \n"+toString(cbind(X_test, y_test)))
+ # # print("Clean Records \n"+toString(cbind(X_test, y_test)))
+ # print("predictions Records \n"+toString(cbind(X_test, c_yhat)))
+ # print("accuracy of dirty data "+dirAcc)
+ # print("accuracy of val data "+valAcc)
+ # print("accuracy of test accuracy "+c_accuracy)
+ # print("clean confusion matrix \n"+toString(confusionCount_c))
+
+ # result = c_accuracy
+# }
- for(i in seq(1,k))
- {
- [trainList, hold_out] = remove(dataList, i)
- trainset = rbind(trainList)
- testset = as.matrix(hold_out)
- trainX = trainset[, 2:ncol(trainset)]
- trainy = trainset[, 1]
- testX = testset[, 2:ncol(testset)]
- testy = testset[, 1]
- beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]),
tol= 1e-9,
- maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
- [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
- accuracy = getAccuracy(testy, yhat, isWeighted)
- accuracyMatrix[i] = accuracy
- }
-}
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of
the
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
-compare = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] mask,
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k,
Matrix[Double] mask,
Matrix[Double] MLhp, Boolean isWeighted)
return (Matrix[Double] accuracyMatrix)
{
@@ -290,7 +339,6 @@ return (Matrix[Double] accuracyMatrix)
end = end + idx
class_j = data[start:end, ]
-
start_i = as.scalar(fold_idxes[j, 1]);
end_i = as.scalar(fold_idxes[j, 2])
@@ -318,68 +366,6 @@ return (Matrix[Double] accuracyMatrix)
accuracy = getAccuracy(testy, yhat, isWeighted)
accuracyMatrix[i] = accuracy
}
-
}
-lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
-return (Matrix[Double] loss) {
- [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
- loss = as.matrix(1 - (acc/100))
- # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
-}
-
-
-
-testBestPipeline = function(Frame[Unknown] pip, Matrix[Double] hp,
Matrix[Double] X_train, Matrix[Double] y_train,
- Matrix[Double] X_test, Matrix[Double] y_test, Matrix[Double] cmask,
Matrix[Double] MLhp,
- Double valAcc, Double dirAcc, Boolean isWeighted)
- return (Double result) {
- print("hp "+toString(hp))
- ls = list();
- i = 1; k = 1
- trRow=nrow(X_train)
- # take the oversampling out from the test processing
- pip1 = as.frame("")
- # construct the parameter list for best hyper-parameters
- while(k <= ncol(pip))
- {
- end = as.integer(i+as.integer(as.scalar(hp[1,i])))
- mat = hp[1, i+1:end]
- i = end + 1
- if(as.scalar(pip[1,k]) != "SMOTE") {
- pip1 = cbind(pip1, pip[1,k] )
- ls = append(ls, mat)
- }
- k = k + 1
- }
-
- # clean using best pipeline and train model
- [X_train, Y_train] = executePipeline(pip1[, 2:ncol(pip1)],
rbind(X_train,X_test), rbind(y_train,y_test), cmask, ls, 1, FALSE)
- X_train_clean = X_train[1:trRow, ]
- y_train_clean = Y_train[1:trRow, ]
- X_test_clean = X_train[trRow+1:nrow(X_train), ]
- y_test_clean = Y_train[trRow+1:nrow(X_train), ]
-
- # classify after cleaning
- betas = multiLogReg(X=X_train_clean, Y=y_train_clean, icpt=1,
- reg=as.scalar(MLhp[1,1]), tol= 1e-9, maxi=as.scalar(MLhp[1,2]),
- maxii= 50, verbose=FALSE);
-
- [c_prob, c_yhat, c_accuracy] = multiLogRegPredict(X_test_clean, betas,
y_test_clean, FALSE)
- c_accuracy = getAccuracy(y_test_clean, c_yhat, isWeighted)
- [confusionCount_c, confusionAVG_c] = confusionMatrix(P=c_yhat,
Y=y_test_clean)
-
-
- print("Actual Records \n"+toString(cbind(X_test, y_test)))
- print("Clean Records \n"+toString(cbind(X_test_clean, y_test_clean)))
- print("predictions Records \n"+toString(cbind(X_test_clean, c_yhat)))
- print("accuracy of dirty data "+dirAcc)
- print("accuracy of val data "+valAcc)
- print("accuracy of test accuracy "+c_accuracy)
- print("clean confusion matrix \n"+toString(confusionCount_c))
-
- result = c_accuracy
-}
-
-
diff --git a/src/test/scripts/functions/pipelines/meta/meta_census.csv
b/src/test/scripts/functions/pipelines/meta/meta_census.csv
index 427abbc..8ffe862 100644
--- a/src/test/scripts/functions/pipelines/meta/meta_census.csv
+++ b/src/test/scripts/functions/pipelines/meta/meta_census.csv
@@ -1,3 +1,3 @@
Scehma,FP32,STRING,INT32,INT32,STRING,FP32,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,FP32,FP32,FP32,STRING,STRING,STRING,STRING,STRING,FP32,STRING,STRING,STRING,STRING,STRING,FP32,STRING,STRING,STRING,STRING,STRING,INT32,STRING,INT32,FP32,FP32,STRING
mask,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1
-,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
+FD,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1
diff --git a/src/test/scripts/functions/pipelines/testClassification.dml
b/src/test/scripts/functions/pipelines/testClassification.dml
new file mode 100644
index 0000000..93f90ed
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/testClassification.dml
@@ -0,0 +1,203 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
+source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
+
+# read the inputs
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+sample = $sampleSize
+topK = $topk
+resources = $rv
+crossValidations = $cv
+weightedAccuracy = $weighted # accuracy flag
+targetApplicaton = $target # target application (e.g. classification/compare)
+
+
+
+if(nrow(metaInfo) < 2)
+ stop("incomplete meta info")
+
+ # Do the initial cleaning
+
+
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
+
+# 1. dropInvalid function will remove the values which are not the part
+# of the column data type
+
+X = dropInvalidType(F, getSchema)
+
+# 2. encode the categorical data
+if(sum(getMask) > 0)
+{
+ # always recode the label
+ index = utils::vectorToCsv(getMask)
+ jspecR = "{ids:true, recode:["+index+"]}"
+ [eX, X_meta] = transformencode(target=X, spec=jspecR);
+ # change the schema to reflect the encoded values
+ getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+ getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+}
+# if no categorical value exist then just cast the frame into matrix
+else
+ eX = as.matrix(X)
+
+# 3. extract the class label
+eY = eX[, ncol(eX)]
+eX = eX[, 1:ncol(eX) - 1]
+
+getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
+getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
+
+
+# get the logical seed
+lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
+allLgs = logical::transformLogical(lgSeed)
+
+
+d_accuracy = 0
+# 4. perform the sampling
+
+[eX, eY] = utils::doSample(eX, eY, sample)
+
+# 5. get train test and validation set with balanced class distribution
+# [X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY,
splitRatio=0.7, verbose=FALSE)
+X_train = eX
+y_train = eY
+# 6. find the best hyper parameters for classification algorithm
+# for now only find the best values for intercept and maximum outer iteration
+params = list("reg", "maxi");
+paramRanges = list(10^seq(0,-10), seq(10,100, 10));
+# if(sum(getMask) > 0)
+# {
+ # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test),
pattern = NaN, replacement=0), getMask)
+ # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
+ # dX_train = dX_train[1:nrow(y_train),]
+ # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
+ # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+# }
+# else
+ # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
+ # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+# hardcoded hyper-params for multilogReg
+opt = matrix("0 100", 1, 2)
+
+# 7. get the cross validated accuracy on dirty dataset (only on training set)
+d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy,
crossValidations)
+# print("dirty accuracy is "+d_accuracy)
+ # [eX, eY] = prioritise(eX, eY, getMask)
+
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1),
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
+FD = FD > 0
+
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=crossValidations,
wAccuracy=weightedAccuracy,
+ dirAcc = d_accuracy, mlHp = opt, cleanData = as.matrix(0))
+
+# # initialize output variables
+pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features =
as.frame("NULL")
+
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList,
targetList=targetClassification, lp=allLgs[1],
+ primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+
+output = $output
+write(features, output+"/features.csv", format="csv")
+
+
+if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
+ stop("warning: no best pipeline found")
+
+
+print("best pipelines")
+print(toString(pip))
+
+print("best hyperparam")
+print(toString(hp))
+
+print("best accuracy")
+print(toString(acc))
+
+
+clean_accuracy = as.scalar(acc[1,1])
+
+
+result = d_accuracy < clean_accuracy
+print("result satisfied ------------"+result)
+
+accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
+
+
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(result , $O)
+
+
+
+
+####################################################################
+# Function for classifying the dirty dataset, makes a call to crossV()
+# Inputs: takes the input dataset X, Y and the value of k validation, mask of
the
+# dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
+# via grid-search and a boolean value of (un)weighted accuracy.
+# Output: It return a matrix having the accuracy of each fold.
+####################################################################
+classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain,
Matrix[Double] opt,
+ Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
+ return (Double accuracy)
+{
+ # # classify without cleaning fill with default values 1
+ Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
+ if(sum(mask) > 0)
+ Xtrain = utils::dummycoding(Xtrain, mask)
+ # print("rows in data ")
+ # print(nrow(dX_train))
+ # print("column in data")
+ # print(ncol(dX_train))
+ accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
+ accuracy = mean(accuracy)
+ print("cross validated dirty accuracy "+accuracy)
+}
+
+
+
+
+lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
+return (Matrix[Double] loss) {
+ [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
+ loss = as.matrix(1 - (acc/100))
+ # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
+}
+
diff --git a/src/test/scripts/functions/pipelines/testCompare.dml
b/src/test/scripts/functions/pipelines/testCompare.dml
new file mode 100644
index 0000000..df110e2
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/testCompare.dml
@@ -0,0 +1,138 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
+
+# read the inputs
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+topK = $topk
+resources = $rv
+targetApplicaton = $target # target application (e.g. classification/compare)
+cleanData = read($cleanData, data_type="frame", format="csv", header=TRUE,
+ naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
+
+
+
+# take the sample of 500 rows to avoid java heap issue
+
+F = F[1:500,]
+cleanData = cleanData[1:500,]
+
+if(nrow(metaInfo) < 2)
+ stop("incomplete meta info")
+
+ # Do the initial cleaning
+
+
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for
FD computation
+
+# 1. dropInvalid function will remove the values which are not the part
+# of the column data type
+
+X = dropInvalidType(F, getSchema)
+
+
+# 2. encode the categorical data
+if(sum(getMask) > 0)
+{
+ # always recode the label
+ index = utils::vectorToCsv(getMask)
+ jspecR = "{ids:true, recode:["+index+"]}"
+
+ [eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
+ cleanX = eX[1:nrow(cleanData)]
+ eX = eX[nrow(cleanData)+1:nrow(eX)]
+
+ # change the schema to reflect the encoded values
+ getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+ getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+
+}
+# if no categorical value exist then just cast the frame into matrix
+else
+ eX = as.matrix(X)
+
+
+# get the logical seed
+lgSeed = logical::generateLogicalSeed(eX, as.matrix(0), getMask,
targetApplicaton)
+allLgs = logical::transformLogical(lgSeed)
+
+
+
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1),
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
+FD = FD > 0
+
+expectedAccuracy = 0.6
+
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=0, wAccuracy=FALSE,
+ dirAcc = expectedAccuracy, mlHp = as.matrix(0), cleanData = cleanX)
+
+
+# # initialize output variables
+pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features =
as.frame("NULL")
+
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=as.matrix(0),
metaList=metaList, targetList=targetClassification,
+ lp=allLgs, primitives=primitives, param=param, k=topK, R=resources,
verbose=TRUE);
+
+
+output = $output
+write(features, output+"/features.csv", format="csv")
+
+
+if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < expectedAccuracy)
+ stop("warning: no best pipeline found")
+
+
+print("best pipelines")
+print(toString(pip))
+
+print("best hyperparam")
+print(toString(hp))
+
+print("best accuracy")
+print(toString(acc))
+
+clean_accuracy = as.scalar(acc[1,1])
+
+
+result = expectedAccuracy <= clean_accuracy
+print("result satisfied ------------"+result)
+
+accuracies = cbind(as.matrix(expectedAccuracy), as.matrix(clean_accuracy))
+
+
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(result , $O)
\ No newline at end of file