This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6840bfe [SYSTEMDS-3299] Seed value in genetic Algorithm - This
commit introduces a seed parameter in the genetic algorithm to control the
random additions and transitions in pipelines.
6840bfe is described below
commit 6840bfec8e6d2cf1418cbfb6841dcf1979abe292
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu Feb 24 12:03:24 2022 +0100
[SYSTEMDS-3299] Seed value in genetic Algorithm
- This commit introduces a seed parameter in the genetic algorithm to control
the random additions and transitions in pipelines.
---
scripts/builtin/bandit.dml | 320 ++++++---------------
scripts/builtin/executePipeline.dml | 42 +--
scripts/builtin/mice.dml | 6 +-
scripts/builtin/topk_cleaning.dml | 48 ++--
scripts/pipelines/scripts/enumerateLogical.dml | 108 +++----
scripts/pipelines/scripts/utils.dml | 17 +-
.../apache/sysds/runtime/util/UtilFunctions.java | 8 +-
.../functions/pipelines/applyEvaluateTest.dml | 2 +-
.../intermediates/classification/applyFunc.csv | 6 +-
.../intermediates/classification/bestAcc.csv | 6 +-
.../intermediates/classification/evalHp.csv | 2 +-
.../pipelines/intermediates/classification/hp.csv | 6 +-
.../pipelines/intermediates/classification/pip.csv | 6 +-
.../intermediates/regression/applyFunc.csv | 10 +-
.../functions/pipelines/topkLogicalTest.dml | 2 +-
.../pipelines/topkcleaningClassificationTest.dml | 2 +-
16 files changed, 222 insertions(+), 369 deletions(-)
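For orientation, both new knobs surface at the top-level topk_cleaning call. Below is a minimal sketch of an invocation with a fixed seed; the data frames, evaluation function name, and output path are hypothetical, and only parameters visible in this diff are assumed:

  # hypothetical inputs: trainData/testData/metaInfo/primitives/param already loaded
  result = topk_cleaning(dataTrain=trainData, dataTest=testData,
    metaData=metaInfo, primitives=primitives, parameters=param,
    evaluationFunc="evalClassification", evalFunHp=as.matrix(NaN),
    topK=5, resource_val=20, max_iter=10, sample=1.0,
    expectedIncrease=1.0, seed=42,   # fixed seed -> reproducible additions/transitions
    cv=TRUE, cvk=3, isLastLabel=TRUE, correctTypos=FALSE,
    enablePruning=FALSE, output="output/exp1")

With the same seed, repeated runs draw the same random additions and transitions during logical-pipeline enumeration; the default seed=-1 presumably retains the previous non-deterministic sampling.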
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index e24e851..06fa57a 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,13 +53,14 @@
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp,
Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3,
- Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Boolean
verbose = TRUE, String output="")
+ Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref
= 0, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="")
return(Boolean perf)
# return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,
Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter)
{
print("Starting optimizer")
- NUM_FEATURES = 14
+ totalPruneCount = 0
FLAG_VARIABLE = 5
+ pipelines_executed = 0
HYPERPARAM_LENGTH = (ncol(lp) * FLAG_VARIABLE * 3) + 1 ## num of col in
logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
bestPipeline = frame("", rows=1, cols=1)
bestHyperparams = as.matrix(0)
@@ -67,19 +68,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# initialize bandit variables
# variable names follow publication where algorithm is introduced
eta = 2 # the halving ratio is fixed to 2
- s_max = floor(log(R,eta));
- B = (s_max + 1) * R;
-
+ s_max = floor(log(R,eta)) - 1;
# initialize output variables
hparam = matrix(0, rows=k*(s_max+1), cols=HYPERPARAM_LENGTH)
pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
- startOut=0; endOut=0;
- feaFrameOuter = frame(data=["#MissingValues", "MinVla", "MaxVal",
"AverageMin", "AverageMax",
- "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers",
"#OHEfeatures", "#Classes",
- "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in
ms", "CV time in ms"],
- rows = 1, cols = NUM_FEATURES + 4 )
+ endIdx = matrix(k, rows=(s_max+1), cols=1)
+ endIdx = cumsum(endIdx)
+ startIdx = (endIdx - k) + 1
- for(s in s_max:0) { # TODO convert to parfor
+ n = ifelse(s_max >= nrow(lp), nrow(lp), ceil(nrow(lp)/(s_max + 1)));
+
+ for(s in s_max:0, check=0) { # TODO convert to parfor
# result variables
bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH)
@@ -87,21 +86,14 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
start=1; end=0;
# # compute the number of initial pipelines n
- n = ceil(floor(B/R/(s+1)) * eta^s);
r = R * eta^(-s);
- # get the physical pipelines, the pipelines, pipelines are recoded
- # [configurations, n] = get_physical_configurations(lp, n, primitives)
- n = min(nrow(lp), n)
- configurations = lp[1:n]
+ configurations = lp[1:(min(n, nrow(lp)))]
# append configuration keys for extracting the pipeline later on
id = seq(1, nrow(configurations))
configurations = cbind(as.frame(id), configurations)
# save the original configuration as a lookup table
lookup = configurations
-
- if(verbose)
- print("n "+ n +"\nR "+ R +"\ns_max "+ s_max +"\nB "+ B +"\nn "+ n +"\nr
"+ r)
-
+
for(i in 0:s) {
# successive halving
n_i = min(max(as.integer(floor(n * eta^(-i))), 1), nrow(configurations));
@@ -112,10 +104,11 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
print("no of resources --------------"+r_i)
print("iteration ---------------------"+i+" out of "+s)
}
-
configurations = configurations[1:n_i, ]
- [outPip,outHp, feaFrameOuter] =
run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train,
Xtest=X_test, Ytest=Y_test,
- metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp,
param=param, featureFrameOuter=feaFrameOuter, cv=cv, cvk=cvk)
+ pipelines_executed = pipelines_executed + (n_i * r_i)
+ [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations,
r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
+ evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param,
cv=cv, cvk=cvk, ref=ref, enablePruning=enablePruning)
+ totalPruneCount = totalPruneCount + pruneCount
# sort the pipelines by order of accuracy decreasing
a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -123,7 +116,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# maintain the brackets results
end = end + rowIndex
- bracket_pipel[start:end, ] = a[1:rowIndex,]
+ bracket_pipel[start:end, 1:ncol(a)] = a[1:rowIndex,]
bracket_hp[start:end, 1:ncol(b)] = b[1:rowIndex,]
start = end + 1
@@ -133,20 +126,19 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
configurations = frameSort(cbind(avergae_perf, configurations),
cbind(as.matrix(0), sortMask), TRUE)
configurations = configurations[, 2:ncol(configurations)]
}
+ if(n < nrow(lp))
+ lp = lp[n+1:nrow(lp),]
bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows")
bracket_hp = removeEmpty(target=bracket_hp, margin="rows")
# keep the best k results for each bracket
[bracket_bestPipeline, bracket_bestHyperparams] =
extractBracketWinners(bracket_pipel, bracket_hp, k, lookup)
# optimize by the features
- startOut = endOut + 1
- endOut = endOut + nrow(bracket_bestPipeline)
+ startOut = as.scalar(startIdx[s+1])
+ endOut = min(as.scalar(endIdx[s+1]), (startOut +
nrow(bracket_bestPipeline) - 1))
pipeline[startOut:endOut, ] = bracket_bestPipeline
-
hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] =
bracket_bestHyperparams
}
-
[bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam,
baseLineScore, k)
-
bestAccuracy = as.matrix(bestPipeline[,1])
bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)]
bestPipeline = bestPipeline[, 2:ncol(bestPipeline)]
@@ -166,11 +158,13 @@ m_bandit = function(Matrix[Double] X_train,
Matrix[Double] Y_train, Matrix[Doubl
print("topk scores: \n"+toString(bestAccuracy))
print("evalHp: \n"+toString(evalFunHp))
print("performance improvement "+ imp)
+ print("total physical pipelines to be executed: "+pipelines_executed)
+ print("prune count: "+totalPruneCount)
+ print("actual executed pipelines: "+(pipelines_executed - totalPruneCount))
}
write(bestPipeline, output+"/pip.csv", format="csv")
write(bestHyperparams, output+"/hp.csv", format="csv")
write(bestAccuracy, output+"/bestAcc.csv", format="csv")
- write(feaFrameOuter, output+"/featureFrame.csv", format="csv")
write(baseLineScore, output+"/dirtyScore.csv", format="csv")
write(evalFunHp, output+"/evalHp.csv", format="csv")
write(applyFunc, output+"/applyFunc.csv", format="csv")
@@ -179,7 +173,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double]
Y_train, Matrix[Doubl
# this method will extract the physical pipelines for a given logical pipelines
get_physical_configurations = function(Frame[String] logical, Scalar[int]
numConfigs = 10,
Frame[Unknown] primitives)
- return(Frame[String] physical, Double min)
+ return(Frame[String] physical)
{
# load the primitives
physical = as.frame("NaN")
@@ -193,7 +187,7 @@ get_physical_configurations = function(Frame[String]
logical, Scalar[int] numCon
dim = primitives[, 8]
operator = frame(0, rows=nrow(primitives), cols=ncol(logical)) # combine
all logical primitives
- parfor(j in 1:ncol(logical))
+ parfor(j in 1:ncol(logical), check = 0)
{
# extract the physical primitives
if(as.scalar(logical[1,j]) == "ED")
@@ -208,60 +202,23 @@ get_physical_configurations = function(Frame[String]
logical, Scalar[int] numCon
operator[, j] = ci;
else if(as.scalar(logical[1,j]) == "DIM")
operator[, j] = dim;
- else if(as.scalar(logical[1,j]) == "DUMMY")
- operator[, j] = dummy;
+ else if(as.scalar(logical[1,j]) == "DUMMY")
+ operator[, j] = dummy;
else if(as.scalar(logical[1,j]) == "SCALE")
operator[, j] = scale;
else print("invalid operation "+as.scalar(logical[1,j]))
}
-
- idx = matrix(1, rows=1, cols=ncol(logical))
- # get the indexes of columns for recode transformation
- index = vectorToCsv(idx)
- # recode logical pipelines for easy handling
- jspecR = "{ids:true, recode:["+index+"]}";
- [X, M] = transformencode(target=operator, spec=jspecR);
- X = replace(target= X, pattern = NaN, replacement = 0)
-
- paramLens = matrix(0, ncol(logical), 1);
- parfor( j in 1:ncol(logical)) {
- vect = removeEmpty(target = X[,j], margin = "rows");
- paramLens[j,1] = nrow(vect);
- }
- min = prod(paramLens)
- numConfigs = ifelse(numConfigs == 0, min, numConfigs)
- sample = ifelse(min > numConfigs, TRUE, FALSE)
- paramVals = matrix(0, ncol(logical), max(paramLens));
- parfor( j in 1:ncol(logical) ) {
- vector = removeEmpty(target = X[,j], margin = "rows");
- paramVals[j,1:nrow(vector)] = t(vector);
- }
- cumLens = rev(cumprod(rev(paramLens))/rev(paramLens));
- XI = table(seq(1,nrow(cumLens)), sample(nrow(cumLens),nrow(cumLens)))
- cumLens = XI %*% cumLens
- # materialize hyper-parameter combinations
- HP = matrix(0, min(numConfigs, min), ncol(logical));
- pip = seq(1,nrow(HP))
- if(sample)
- pip = sample(nrow(HP),numConfigs)
- XI = table(seq(1,nrow(pip)), sample(nrow(pip),nrow(pip)))
- pip = XI %*% pip
-
- for( i in 1:nrow(HP)) {
- for( j in 1:ncol(logical) ) {
- HP[i,j] =
paramVals[j,as.scalar((as.scalar(pip[i,1])/cumLens[j,1])%%paramLens[j,1]+1)];
- }
- }
-
- physical = transformdecode(target=HP, spec=jspecR, meta=M);
+ physical = operator
}
# this method will call the execute pipelines with their hyper-parameters
run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i,
Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String
evaluationFunc, Matrix[Double] evalFunHp,
- Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean cv, Integer
cvk = 2, Boolean default = FALSE)
- return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam,
Frame[Unknown] featureFrameOuter)
+ Frame[Unknown] param, Boolean cv, Integer cvk = 2, Double ref = 0, Boolean
enablePruning = FALSE, Boolean default = FALSE)
+ return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam,
Integer pruneCount, Matrix[Double] changesByPipMatrix)
{
+ changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1)
+ pruneCount = 0
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)-1) * 5 * 3)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
@@ -274,15 +231,13 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
id = as.matrix(ph_pip[, 1])
ph_pip = ph_pip[, 2:ncol(ph_pip)]
evalFunOutput = as.matrix(0)
- feaVec = gatherStats(X, Y, as.matrix(metaList['mask']))
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
op = removeEmpty(target=ph_pip[i], margin="cols")
- [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op,
param, r_i, default)
- if(ncol(featureFrameOuter) > 1)
- feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
+ print("PIPELINE EXECUTION START ... "+toString(op))
+ [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op,
param, r_i, default, enablePruning)
pip_toString = pipToString(op)
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
@@ -297,25 +252,28 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
indexes = table(indexes, 1, 1, nrow(hp), 1)
hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
# # check if the pruning could be applied to avoid unnecessary executions
- executionSingnal = pruningSignal(op, hp_matrix, hpForPruning,
changesByOp)
-
+ pruneSignal = pruningSignal(op, hp_matrix, hpForPruning, changesByOp)
+ executionSingnal = ifelse(enablePruning, pruneSignal, TRUE)
+ ref = ifelse(enablePruning, ref, 0)
if(executionSingnal)
{
t1 = time()
-
if(cv)
{
pipList = list(ph = op, hp = hp_matrix, flags = no_of_flag_vars)
- [accuracy, evalHp, hpForPruning, changesByOp] = crossV(X=X, y=Y,
cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList,
hpForPruning=hpForPruning,
- changesByOp=changesByOp, evalFunc=evaluationFunc)
+ [accuracy, evalHp, hpForPruning, changesByOp, changesByPip] =
crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp,
+ pipList=pipList, metaList=metaList, hpForPruning=hpForPruning,
+ changesByOp=changesByOp, evalFunc=evaluationFunc, ref=ref)
}
else
{
- [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] =
executePipeline(pipeline=op,
+ [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp,
changesByPip] = executePipeline(pipeline=op,
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
hyperParameters=hp_matrix, hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE,
verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
+ else if(changesByPip < ref)
+ print("prunning alert 2: no training the model due to minimum
changes")
else
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain,
Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
accuracy = as.scalar(evalFunOutput[1, 1])
@@ -332,31 +290,26 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i, Matrix[Double
Y = clone_Y
Xtest = clone_Xtest
Ytest = clone_Ytest
- if(ncol(featureFrameOuter) > 1) {
- feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
- feaFrame[r, (ncol(feaVec)+1)] = pip_toString
- feaFrame[r, (ncol(feaVec)+2)] = accuracy
- feaFrame[r, (ncol(feaVec)+3)] = accT #Tr
- feaFrame[r, (ncol(feaVec)+4)] = accT
- }
}
- else print("prunningAlert: not executing instance : "+r)
+ else
+ {
+ pruneCount = pruneCount + 1
+ print("prunningAlert: not executing instance : "+r+"
pruneCount"+pruneCount)
+ }
+ changesByPipMatrix[index] = changesByPip
index = index + 1
}
-
X = clone_X
Y = clone_Y
Xtest = clone_Xtest
Ytest = clone_Ytest
- if(ncol(featureFrameOuter) > 1)
- featureFrameOuter = rbind(featureFrameOuter, feaFrame)
}
output_hyperparam = removeEmpty(target=cbind(output_accuracy, output_hp),
margin="rows")
output_operator = removeEmpty(target=cbind(output_accuracy,
output_pipelines), margin="rows")
}
# extract the hyper-parameters for pipelines
-getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res, Boolean default)
+getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res, Boolean default, Boolean enablePruning)
return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer
no_of_res, Integer NUM_META_FLAGS)
{
@@ -384,6 +337,7 @@ getHyperparam = function(Frame[Unknown] pipeline,
Frame[Unknown] hpList, Intege
paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res,
cols=max(paramCount)+NUM_META_FLAGS+1)
for(i in 1:ncol(pipeline)) {
+ op = as.scalar(pipeline[1, i])
index = as.scalar(indexes[i])
no_of_param = as.integer(as.scalar(paramCount[i]))
# extract hasY and verbose flags
@@ -434,6 +388,8 @@ getHyperparam = function(Frame[Unknown] pipeline,
Frame[Unknown] hpList, Intege
typeIdx = typeIdx + 1
}
}
+ if((op == "outlierBySd" | op == "outlierByIQR" | op == "imputeByFd") &
no_of_res > 1 & enablePruning)
+ OpParam = order(target=OpParam, by = 1, decreasing = FALSE,
index.return = FALSE)
# hyper-parameter vector contains no. of hp, values of hp, and flag
values
OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam,
attachMask,
attachFD, attachY, isVerbose, dataFlag)
@@ -454,50 +410,8 @@ extractTopK = function(Frame[Unknown] pipeline,
Matrix[Double] hyperparam,
Double baseLineScore, Integer k)
return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
{
- # # # take out the accuracy from pipelines
- pipeline = pipeline[, 2:ncol(pipeline)]
- idx = vectorToCsv(seq(1, ncol(pipeline)))
- jspecDC = "{ids:true, recode:["+idx+"]}";
- # OHE of categorical features
- [dpipeline, dM] = transformencode(target=pipeline, spec=jspecDC);
- # bind the pipelines and hyper-parameters into one matrix
- forDedup = cbind(dpipeline, hyperparam)
- # perform the similarity based deduplication
- dup = mdedup(cbind(pipeline, as.frame(hyperparam)), matrix(seq(2,
ncol(forDedup)), 1,
- ncol(forDedup)-1), matrix(1,1,ncol(forDedup)-1), as.matrix(1),
as.matrix(1), FALSE)
-
- if(sum(dup) > 0)
- {
- # take out the unique tuples
- uniqueTuples = removeEmpty(target=forDedup, margin="rows", select=(dup==0))
- # remove the zero rows, identifiers of unique records
- dup = removeEmpty(target=dup, margin="rows")
- # get the counts of duplicate tuples with their tuple id
- countDist = table(dup, 1) > 0
- countDist = countDist * seq(1, nrow(countDist))
- countsVal = removeEmpty(target=countDist, margin="rows")
- indexes = table(seq(1, nrow(countsVal)),countsVal,1,nrow(countsVal),
cols=nrow(forDedup))
-
- # for each duplicate record just take the one reocrd and strip the others
- deduplicates = indexes %*% forDedup
-
- # combine the deduplicated tuples and unique tuples again
- forDedup = rbind(uniqueTuples, deduplicates)
- }
-
- # decode the pipelines
- decoded = transformdecode(target=forDedup[, 1:ncol(pipeline)], meta=dM,
spec=jspecDC)
- # separate the pipelines and hyper-parameters
- pipeline = decoded[, 1:ncol(pipeline)]
- hyperparam = forDedup[, ncol(pipeline)+1:ncol(forDedup)]
-
- # sort results
- # # add accuracy back
- pipeline = cbind(as.frame(forDedup[, ncol(pipeline)+1]), pipeline)
hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE,
index.return=FALSE)
pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1,
cols=ncol(pipeline) - 1)), TRUE)
-
-
# remove the row with accuracy less than test accuracy
mask = (hyperparam[, 1] < baseLineScore) == 0
hyperparam = removeEmpty(target = hyperparam, margin = "rows", select = mask)
@@ -522,9 +436,8 @@ extractBracketWinners = function(Matrix[Double] pipeline,
Matrix[Double] hyperpa
bestPipeline = frame(data="|", rows=nrow(pipeline), cols=ncol(conf))
parfor(i in 1: nrow(pipeline)) {
index = as.scalar(pipeline[i, 3])
- out = conf[index, 2:ncol(conf)]
+ bestPipeline[i] = conf[index]
bestPipeline[i, 1] = as.frame(pipeline[i, 1])
- bestPipeline[i, 2:ncol(bestPipeline)] = out
}
}
@@ -539,90 +452,6 @@ return (Frame[Unknown] maxperconf)
maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
}
-
-###############################################################################################
-# The function will collect the features like statistics and pipelines and
accuracy
-# so that they could be used for training a model and predicting pipelines
without enumeration
-###############################################################################################
-gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask)
-return (Matrix[Double] features)
-{
-
- features = matrix(0, rows = 1, cols= 14)
- features[1, 1]= sum(is.na(X)) # number of missing values
- X = replace(target= X, pattern = NaN, replacement = 0)
- num = removeEmpty(target=X, margin="cols", select=(mask == 0))
- # get the stats
- features[1, 2] = min(num) # minimum value
- features[1, 3] = max(num)
- features[1, 4] = mean(colMins(num)) # average minimum value
- features[1, 5] = mean(colMaxs(num)) # average maximum value
- features[1, 6] = sum(mask) # number of categorical features
- features[1, 7] = sum(mask == 0) # number of numerical features
- features[1, 8] = mean(num) # mean value
- colSd = colSds(num)
- count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
- count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
- outliers = count3sdplus + count3sdminus
- features[1, 9] = outliers
- # OHE features
- OHE = sum(colMaxs(X) * mask)
- features[1, 10] = OHE
-
- if(nrow(Y) > 1 & min(Y) >= 1)
- {
- ctab = table(Y, 1)
- features[1, 11] = nrow(ctab) # number of classes
- minCat = min(ctab) / nrow(ctab)
- maxCat = max(ctab) / nrow(ctab)
- # class imabalance 1=YES, 0=NO
- features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
- }
- else
- {
- features[1, 11] = 0
- features[1, 12] = 0
- }
- features[1, 13] = nrow(X)
- features[1, 14] = ncol(X)
-
-}
-
-
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters
identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-compareValue = function(Matrix[double] dirtyX, Matrix[double] fixedX,
Matrix[Double] cleanX, Matrix[Double] mask)
-return (Double precision, Double T)
-{
- t1 = time()
- DEFAULT = 404
- mv = is.na(dirtyX)
- correctionsRequired = 0
- mv = is.na(fixedX)
- dirtyX = replace(target= dirtyX, pattern=NaN, replacement=DEFAULT)
- cleanX = replace(target= cleanX, pattern=NaN, replacement=DEFAULT)
- fixedX = replace(target= fixedX, pattern=NaN, replacement=DEFAULT)
- diffCleanDirty = sum((abs(cleanX - dirtyX) < 0.001) < 1) #sum(cleanX ==
dirtyX) #
- print("dirty != clean: "+diffCleanDirty)
- correctionsRequired = (abs(cleanX - dirtyX) < 0.001) < 1#dirtyX != cleanX
- print("corrections required: "+sum(correctionsRequired))
- correctionsMade = sum(dirtyX != fixedX)
- print("corrections made: "+correctionsMade)
- dim = nrow(dirtyX) * ncol(dirtyX)
- match = (abs(cleanX - fixedX) < 0.001) * correctionsRequired
- print("total matches "+sum(match))
- # print("total matches \n"+toString(match))
- precision = max(0.001, sum(match) / max(1, correctionsMade))
- T = floor((time() - t1) / 1e+6)
- print("Precision: "+toString(precision) + " in "+T+" ms")
-}
-
pipToString = function(Frame[String] F)
return (String s)
{
@@ -632,16 +461,20 @@ return (String s)
}
-
crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk,
Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList,
- Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp =
as.matrix(0), String evalFunc)
-return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double]
hpForPruning, Matrix[Double] changesByOp)
+ Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp =
as.matrix(0), String evalFunc, Double ref = 0)
+return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double]
hpForPruning, Matrix[Double] changesByOp, Double allChanges)
{
+
+ # # in the condition below we compute the hyper-parameters via the cv method on the train dataset
if(is.na(as.scalar(evalFunHp[1,1]))) {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
evalFunHp = forEvalHp[1, 2:ncol(forEvalHp)]
- }
+ }
+ changesByPip = 0
+ cvChanges = matrix(0, rows=cvk, cols=ncol(changesByOp))
accuracyMatrix = matrix(0, cvk, 1)
+ allChanges = matrix(0, cvk, 1)
#create empty lists
dataset_X = list(); #empty list
dataset_y = list();
@@ -655,28 +488,33 @@ return (Double accuracy, Matrix[Double] evalFunHp,
Matrix[Double] hpForPruning,
beta_list = list();
#keep one fold for testing in each iteration
- for (i in seq(1, cvk), check=0) {
+ for (i in seq(1, cvk)) {
[tmpX, testX] = remove(dataset_X, i);
[tmpy, testy] = remove(dataset_y, i);
trainX = rbind(tmpX);
trainy = rbind(tmpy);
testX = as.matrix(testX)
testy = as.matrix(testy)
- if(as.scalar(pipList['flags']) != 0)
+ if(as.scalar(pipList['flags']) != 0) # this flag is zero when CV is
called from the dirtyScore function, i.e., only accuracy calculation and no
pipeline execution
{
- [trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp] =
executePipeline(pipeline=as.frame(pipList['ph']),
+ [trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp,
changesByPip] = executePipeline(pipeline=as.frame(pipList['ph']),
Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy,
metaList=metaList, hyperParameters=as.matrix(pipList['hp']),
hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']),
test=TRUE, verbose=FALSE)
+ cvChanges[cvk] = changesByOp
+ allChanges[i] = changesByPip
+ }
+ if(changesByPip < ref)
+ print("prunning alert 2: no training the model due to minimum changes")
+ else {
+ res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
+ accuracyMatrix[i] = res[1, 1]
}
- res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy,
Xorig=as.matrix(0), evalFunHp=evalFunHp))
- accuracyMatrix[i] = res[1, 1]
+
}
-
- print("----- cv mean accuracy ---")
- print(toString(accuracyMatrix))
+ allChanges = min(allChanges)
+ changesByOp = colMaxs(cvChanges)
accuracy = mean(accuracyMatrix)
- print("mean: "+toString(accuracy))
- # output = cbind(accuracy, evalFunHp)
+ print("cv accuracy: "+toString(accuracy))
}
pruningSignal = function(Frame[Unknown] ph_pip, Matrix[Double] hp_matrix,
Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -705,6 +543,8 @@ return(Boolean execute)
getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double]
paramCount)
{
+ # print("pipeline in meta "+toString(pipeline))
+ # while(FALSE){}
indexes = matrix(0, rows= ncol(pipeline), cols=1)
paramCount = matrix(0, rows= ncol(pipeline), cols=1)
applyList = hpList[, 1]
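As context for the bracket changes above: the bandit now splits the logical pipelines evenly across brackets via s_max = floor(log(R, eta)) - 1 instead of deriving the hyperband budget B. A self-contained sketch of the resulting successive-halving schedule, where nLogical is an assumed count of logical pipelines:

  eta = 2
  R = 50
  s_max = floor(log(R, eta)) - 1                    # brackets s_max .. 0
  nLogical = 12                                     # assumed no. of logical pipelines
  n = ifelse(s_max >= nLogical, nLogical, ceil(nLogical / (s_max + 1)))
  for(s in s_max:0) {
    r = R * eta^(-s)                                # initial resources in bracket s
    for(i in 0:s) {
      n_i = max(as.integer(floor(n * eta^(-i))), 1) # surviving configurations
      r_i = as.integer(floor(r * eta^i))            # resources per configuration
      print("bracket "+s+", round "+i+": n_i="+n_i+", r_i="+r_i)
    }
  }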
diff --git a/scripts/builtin/executePipeline.dml
b/scripts/builtin/executePipeline.dml
index 05761a7..641823b 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,17 +57,17 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList,
Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test
= FALSE, Boolean verbose)
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest,
Matrix[Double] Ytest,
- Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
+ Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double
changesAll)
{
mask=as.matrix(metaList['mask'])
FD = as.matrix(metaList['fd'])
applyFunc = as.frame(metaList['applyFunc'])
-
+ changesAll = 0.0
+ d = ncol(Xtrain)
testRow = nrow(Xtest)
- Xout = Xtrain
+ Xorig = Xtest
t1 = time()
- print("PIPELINE EXECUTION START ... "+toString(pipeline))
if(verbose) {
print("checks rows in X = "+nrow(Xtrain)+" rows in Y = "+nrow(Ytrain)+"
cols in X = "+ncol(Xtrain)+" col in Y = "+ncol(Ytrain))
print("pipeline in execution "+toString(pipeline))
@@ -84,14 +84,7 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
L = evalList(op, hp)
[L, O] = remove(L, 1);
Xtrain = as.matrix(O)
- if(nrow(as.matrix(hp[1])) == nrow(Xtrain) & ncol(as.matrix(hp[1])) ==
ncol(Xtrain)) {
- changes = sum(abs(replace(target=Xtrain, pattern=NaN, replacement=0) -
replace(target=as.matrix(hp[1]), pattern=NaN, replacement=0)) > 0.001)
- print("# of changes values: "+toString(changes))
- }
- Xout = Xtrain
if(applyOp != "NA") {
- print("op: "+op)
- # print("dataFlag: "+dataFlag)
[Xtest, executeFlag] = applyDataFlag(Xtest, mask, dataFlag)
L = append(L, list(X=Xtest));
Xtest = eval(applyOp, L);
@@ -109,11 +102,13 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
else {
print("not applying "+op+" executeFlag = 0")
}
+ if(ncol(Xtest) == d) {
+ changesSingle = sum(abs(replace(target=Xtest, pattern=NaN,
replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) >
0.001 )
+ changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0)
- replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 )
-
- if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1, i])
== "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
- changes = sum(abs(replace(target=Xout, pattern=NaN, replacement=0) -
replace(target=as.matrix(hp[1]), pattern=NaN, replacement=0)) > 0.001 )
- [hpForPruning, changesByOp] = storeDataForPrunning(pipeline,
hyperParameters, hpForPruning, changesByOp, changes, i)
+ if(as.scalar(pipeline[1, i]) == "outlierBySd" | as.scalar(pipeline[1,
i]) == "outlierByIQR" | as.scalar(pipeline[1, i]) == "imputeByFd") {
+ [hpForPruning, changesByOp] = storeDataForPrunning(pipeline,
hyperParameters, hpForPruning, changesByOp, changesSingle, i)
+ }
}
}
@@ -121,8 +116,6 @@ s_executePipeline = function(Frame[String] pipeline,
Matrix[Double] Xtrain, Mat
if(nrow(Xtest) != testRow)
stop("executePipeline: test rows altered")
t2 = floor((time() - t1) / 1e+6)
-
- print("PIPELINE EXECUTION ENDED: "+t2+" ms")
}
# This function will convert the matrix row-vector into list
@@ -228,7 +221,6 @@ return (Matrix[Double] X)
# put nan back
nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
X = X + nanMask
- # print("X less than equal to zero "+sum(cat <= 0))
}
}
@@ -245,7 +237,6 @@ return (Matrix[Double] X)
Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
nanMask = is.na(Xcat)
Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111)
- # print("unchanged data \n"+toString(originalX, rows=10))
# reconstruct the original matrix
p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
@@ -276,7 +267,6 @@ return (Matrix[Double] X)
}
else X = nX
- # print("recreated data \n"+toString(X, rows = 20))
}
@@ -390,9 +380,8 @@ return (Matrix[Double] X, Matrix[Double] Y)
minClass = min(classes)
maxClass = max(classes)
diff = (maxClass - minClass)/sum(classes)
- if(diff > 0.3)
+ if(diff > 0.2)
{
- #print("initiating oversampling")
XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
synthesized = matrix(0,0,0) # initialize variable
start_class = 1
@@ -403,7 +392,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
outSet = matrix(0, 0, ncol(XY))
remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100
- (remainingRatio%%100)),
remainingRatio-(remainingRatio%%100))
- #print("remaining ratio: "+remainingRatio)
for(i in 1: nrow(k), check=0) {
end_class = end_class + as.scalar(classes[i])
class_t = XY[start_class:end_class, ]
@@ -419,9 +407,8 @@ return (Matrix[Double] X, Matrix[Double] Y)
Y = XY[, 1]
X = XY[, 2:ncol(XY)]
}
- else {
- print("smote not applicable")
-
+ else {
+ str = "smote not applicable"
}
}
}
@@ -477,7 +464,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
{
Xcor = removeEmpty(target = X, margin = "rows", select = (inc==0))
Ycor = removeEmpty(target = Y, margin = "rows", select = (inc==0))
- # print("inc vector "+toString(inc))
Xinc = removeEmpty(target = X, margin = "rows", select = inc)
Yinc = removeEmpty(target = Y, margin = "rows", select = inc)
yhat = removeEmpty(target = yhat, margin = "rows", select = inc)
@@ -497,8 +483,6 @@ return (Matrix[Double] X, Matrix[Double] Y)
}
}
classes = table(Y, 1)
- print("class distribution after flipLabels")
- print(toString(classes))
}
# # # # wrapper for normalize
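The executePipeline changes above make it return changesAll, a count of test-set cells the operator chain actually modified; the caller compares this against the ref threshold to prune low-impact pipelines. A minimal standalone sketch of that cell-change count on two illustrative matrices:

  Xorig = matrix("1 NaN 3 4", rows=2, cols=2)      # data before the pipeline
  Xout  = matrix("1 2 3 9", rows=2, cols=2)        # data after the pipeline
  changesAll = sum(abs(replace(target=Xout, pattern=NaN, replacement=0)
    - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001)
  print("changed cells: "+changesAll)              # 2 cells differ by > 0.001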
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index aaa41c7..acef187 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -49,7 +49,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
Double threshold = 0.8, Boolean verbose = FALSE)
return(Matrix[Double] output, Matrix[Double] meta, Double threshold,
Frame[String] dM, List[Unknown] betaList)
{
-
if(ncol(X) < 2)
stop("MICE can not be applied on single vectors.
expected number of columns > 1 found: "+ncol(X))
@@ -76,7 +75,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
X1 = X + (Mask1 * imputationVec)
d = ncol(X1)
n = nrow(X1)
-
# compute index of categorical features
index = vectorToCsv(cMask)
# specifications for one-hot encoding of categorical features
@@ -147,10 +145,10 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3,
prob = matrix(1, nrow(test_Y), 1)
}
else {
- beta = multiLogReg(X=train_X, Y=train_Y, icpt = 2, tol = 0.0001, reg
= 0.00001,
+ beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg
= 0.00001,
maxi = 100, maxii=50, verbose=FALSE)
# predicting missing values
- [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
+ [prob, pred, acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
prob = rowMaxs(prob)
}
validThreshold = prob > threshold
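For reference on the icpt switch above: in multiLogReg, icpt=0 fits no intercept, icpt=1 adds an intercept, and icpt=2 additionally shifts and rescales the features. A small self-contained sketch of the updated call on synthetic data (sizes and seeds are arbitrary):

  X = rand(rows=100, cols=4, seed=7)
  Y = round(rand(rows=100, cols=1, min=0.51, max=2.49, seed=7))  # labels in {1,2}
  beta = multiLogReg(X=X, Y=Y, icpt=1, tol=0.0001, reg=0.00001,
    maxi=100, maxii=50, verbose=FALSE)
  [prob, pred, acc] = multiLogRegPredict(X=X, B=beta, Y=Y)
  print("train accuracy: "+acc)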
diff --git a/scripts/builtin/topk_cleaning.dml
b/scripts/builtin/topk_cleaning.dml
index 37cc1dc..028f6c6 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -58,7 +58,7 @@ source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest =
as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown]
primitives,
Frame[Unknown] parameters, String evaluationFunc, Matrix[Double] evalFunHp,
Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double
sample = 1.0,
- Double expectedIncrease=1.0, Boolean cv=TRUE, Integer cvk = 2, Boolean
isLastLabel = TRUE, Boolean correctTypos=FALSE, String output)
+ Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk
= 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean
enablePruning = FALSE, String output)
return(Boolean perf)
# return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams,
Matrix[Double] topKScores, Frame[Unknown] bestLogical,
# Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp)
@@ -95,16 +95,33 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
print("-- Cleaning - Get Dirty Score: ");
[dirtyScore, evalFunHp] = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest,
Ytest=eYtest, evaluationFunc=evaluationFunc,
metaList=metaList, sample=sample, cv=cv, cvk=cvk, evalFunHp=evalFunHp,
ctx=ctx)
- t4 = time(); print("---- finalized in: "+(t4-t3)/1e9+"s");
-
+ t4 = time(); print("---- finalized in: "+(t4-t3)/1e9+"s");
+
# # do the string processing
print("-- Cleaning - Data Preparation (strings, transform, sample): ");
[Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, cv,
correctTypos, ctx)
-
# # if mask has 1s then there are categorical features
print("---- feature transformations to numeric matrix");
[eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
-
+ # # # do the early dropping of columns:
+ # # # 1. if 70% of the column is empty
+ # # # 2. if the column has only a single value
+ # # # 3. if the column has all unique values
+ Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
+ nullMask = is.na(eXtrain)
+ singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) |
(colMaxs(Xtmp) == colMins(Xtmp))
+ allmostEmpty = colSums(nullMask)
+ allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.7)
+ allSum = singleValuesCol | allmostEmptyRatio
+ if(sum(allSum) > 0) {
+ eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum ==
0))
+ if(!cv)
+ eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum ==
0))
+ mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
+ fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0))
+ schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0))
+ metaList = list(mask=mask, schema=schema, fd=fdMask,
applyFunc=as.frame("null"), distY=0)
+ }
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
@@ -137,25 +154,19 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
}
metaList['distY'] = dist
- if(sum(mask) > 0)
- {
- dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1)
- logical = cbind(logical, dummyEncode)
- }
-
print("-- Cleaning - Enum Logical Pipelines: ");
- [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain,
Xtest=eXtest, ytest=eYtest,
- seed=logical, max_iter=max_iter, metaList = metaList,
+ [bestLogical, con, refChanges] = lg::enumerateLogical(X=eXtrain, y=eYtrain,
Xtest=eXtest, ytest=eYtest,
+ initial_population=logical, seed = seed, max_iter=max_iter, metaList =
metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives,
param=parameters,
dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE,
ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
- # bestLogical = frame(["MVI", "OTLR", "DUMMY"], rows=1, cols=3)
+
topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0);
topKScores = matrix(0,0,0); features = as.frame("NULL")
# # [topKPipelines, topKHyperParams, topKScores, features] =
perf = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest,
Y_test=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical,
primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, output=output, verbose=TRUE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, enablePruning =
enablePruning, output=output, verbose=TRUE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines:
"+(t7-t6)/1e9+"s");
}
@@ -164,7 +175,6 @@ return(Frame[String] schema, Matrix[Double] mask,
Matrix[Double] fdMask, Integer
{
if(as.scalar(metaData[1, 1]) == "NULL")
{
- print("creating meta data")
r1 = detectSchema(data)
r2 = matrix(0, rows=1, cols=ncol(data))
for(i in 1 : ncol(r1))
@@ -204,7 +214,7 @@ runStringPipeline = function(Frame[Unknown] Xtrain,
Frame[Unknown] Xtest, Frame[
return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
{
if(cv)
- Xtrain = utils::stringProcessing(train=Xtrain, test=Xtrain, mask=mask,
schema=schema, CorrectTypos=correctTypos, ctx=ctx)
+ Xtrain = utils::stringProcessing(train=Xtrain, test=matrix(0,0,0),
mask=mask, schema=schema, CorrectTypos=correctTypos, ctx=ctx)
else
{
# # # binding train and test to use same dictionary for both
@@ -231,10 +241,9 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
eXtest = replace(target=eXtest, pattern=NaN, replacement = 1)
print(prefix+" sample from train data and dummy code");
[eXtrain, Ytrain] = utils::doSample(eXtrain, Y, sample, TRUE)
- sliceX = eXtrain
[eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask,
cv, "dummycode")
pipList = list(lp = as.frame("NULL"), ph = as.frame("NULL"), hp =
as.matrix(0), flags = 0)
- print(prefix+" hyper-parameter tuning");
+ print(prefix+" hyper-parameter tuning and dirtyscore computation");
if(cv) {
[dirtyScore, evalFunHp] = bandit::crossV(X=eXtrain, y=Ytrain, cvk=cvk,
evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
@@ -246,7 +255,6 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
evalFunHp = res[1, 2:ncol(res)]
print("Dirty Accuracy holdout: "+dirtyScore)
}
-
}
recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest,
Matrix[Double] mask, Boolean cv, String code)
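The early dropping added above removes columns that are constant (or 0/1-valued) and columns with at least 70% missing values before enumeration starts. A condensed standalone sketch of the same masks on a small illustrative matrix:

  X = matrix("1 0 2 1 0 7 1 0 NaN 1 0 5", rows=4, cols=3)
  Xtmp = replace(target=X, pattern=NaN, replacement=0)
  nullMask = is.na(X)
  singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp))
  almostEmptyRatio = colSums(nullMask) >= (nrow(X) * 0.7)
  allSum = singleValuesCol | almostEmptyRatio
  if(sum(allSum) > 0)
    X = removeEmpty(target=X, margin="cols", select=(allSum == 0))  # keeps only column 3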
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml
b/scripts/pipelines/scripts/enumerateLogical.dml
index 235f8ae..2cacd65 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -52,10 +52,10 @@
source("scripts/builtin/bandit.dml") as bandit;
enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double]
Xtest, Matrix[Double] ytest,
- Frame[Unknown] seed, Integer max_iter=10, List[Unknown] metaList, String
evaluationFunc, Matrix[Double] evalFunHp,
+ Frame[Unknown] initial_population, Integer seed = -1, Integer max_iter=10,
List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79,
Boolean cv=FALSE, Boolean cvk=3,
Boolean verbose, List[Unknown] ctx=list(prefix="----"))
-return (Frame[Unknown] output, boolean converged)
+return (Frame[Unknown] output, boolean converged, Double refChanges)
{
finalOutput = list()
@@ -65,43 +65,43 @@ return (Frame[Unknown] output, boolean converged)
iter = 1
populationLength = 0
converged = FALSE
- # get the physical instances from logical ones
- # unrolled by physical pipelines
- pipelines = frame(0, rows=nrow(primitives)^ncol(primitives), cols=ncol(seed))
start = 1;
end = 0;
- allOps = param[, 2]
- dist = as.scalar(metaList['distY'])
- if(nrow(y) > 0 & min(y) >= 1 & dist <= 15)
- allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\"))?x:\"0\"")
- else
- allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"mice\") & !x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\") &
- !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
- # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
- allOps = removeEmpty(target=allOps, margin="rows")
- for(i in 1:nrow(seed)) {
- pconf = bandit::get_physical_configurations(seed[i], 0, primitives)
+ [allOps, ref] = getOps(param[, 2], as.scalar(metaList['distY']), nrow(y),
min(y))
+
+ # unrolled by physical pipelines
+ pipelines = frame(0, rows=nrow(primitives)^ncol(primitives),
cols=max(ncol(initial_population), ncol(ref)))
+ for(i in 1:nrow(initial_population)) {
+ pconf = bandit::get_physical_configurations(initial_population[i], 0,
primitives)
end = end + nrow(pconf)
pipelines[start:end, 1:ncol(pconf)] = pconf
start = end + 1
}
+
pipelines = removeEmpty(target = pipelines, margin="rows")
+ if(sum(mask) > 0)
+ {
+ dummyEncode = frame("dummycoding", rows=nrow(pipelines), cols=1)
+ pipelines[, 2] = dummyEncode
+ }
+ pipelines = rbind(ref, pipelines)
population = pipelines
populationSize = nrow(pipelines)
-
+ randomOps = sample(3, (populationSize * max_iter), TRUE, seed)
+ transitions = sample(nrow(allOps), (populationSize * max_iter), TRUE, seed)
+ refChangesInternal = 0
while(!converged & iter <= max_iter)
{
populationLength = max(populationLength, ncol(population))
id = seq(1, nrow(population))
print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" );
# # # execute the physical pipelines
- [outPip, outHp, feaFrameOuter] =
bandit::run_with_hyperparam(cbind(as.frame(id), population),
- num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp,
param, as.frame(""), cv, cvk, TRUE)
+ [outPip, outHp, p, refChanges] =
bandit::run_with_hyperparam(cbind(as.frame(id), population),
+ num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp,
param, cv, cvk, 0, FALSE, TRUE)
# # sort the configurations score-wise
- actPip = cbind(as.frame(outPip[, 1]), population)
- sort_mask = cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(population)))
+ actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges))
+ actPip = cbind(actPip, population)
+ sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1,
cols=ncol(population)))
sortedPipelines = frameSort(actPip, sort_mask, TRUE)
converged = as.double(as.scalar(sortedPipelines[1, 1])) > dirtyScore
if(converged)
@@ -111,21 +111,22 @@ return (Frame[Unknown] output, boolean converged)
sortedPipelines = sortedPipelines[1:diR]
finalOutput = append(finalOutput, sortedPipelines)
# # # if converged then stop otherwise generate new population
- sortedPipelines = sortedPipelines[, 2:ncol(sortedPipelines)]
- children = frame(0, rows=populationSize, cols=ncol(sortedPipelines) + 1)
+ children = frame(0, rows=populationSize, cols=ncol(sortedPipelines))
+ sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)]
# # randomly pick the pipelines for transitions
pipRand = sample(nrow(sortedPipelines), populationSize, TRUE)
if(!converged) {
- parfor(i in 1:nrow(children), check=0) {
+ for(i in 1:nrow(children), check=0) {
+ idxR = (nrow(children) * (iter - 1)) + i
idx = as.scalar(pipRand[i])
top = removeEmpty(target=sortedPipelines[idx], margin="cols")
tail = top[, ncol(top)]
if(sum(mask) > 0)
top = top[, 1:ncol(top) - 1]
- random = ifelse(ncol(top) <=2, 1, as.scalar(sample(3, 1)))
+ random = ifelse(ncol(top) <=2, 1, as.scalar(randomOps[idxR]))
if(random == 1)
- c1 = addition(top, allOps)
+ c1 = addition(top, allOps[as.scalar(transitions[idxR])])
else if(random == 2)
c1 = mutation(top)
else if(random == 3)
@@ -143,8 +144,7 @@ return (Frame[Unknown] output, boolean converged)
print(prefix+" EnumLP did not converge after "+(iter - 1)+" / "+max_iter+"
iterations")
}
# # # prepare the final frame output
- output = frame(0, rows=round((populationSize/2)) * length(finalOutput) ,
cols=populationLength + 1)
- print("rows in output: "+nrow(output))
+ output = frame(0, rows=round((populationSize/2)) * length(finalOutput) ,
cols=populationLength + 2)
start = 1;
end = 0;
for(i in 1:length(finalOutput))
@@ -154,31 +154,19 @@ return (Frame[Unknown] output, boolean converged)
output[start:end, 1:ncol(pipFrame)] = pipFrame
start = end + 1
}
- sort_mask = cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(output) - 1))
+ sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1,
cols=ncol(output) - 2))
output = removeEmpty(target=output, margin="rows")
- output = frameSort(output, sort_mask, TRUE)
- print("final Pipelines")
- print(toString(output, rows=150))
- output = output[, 2:ncol(output)]
+ output = frameSort(output, sort_mask, FALSE)
+ refChanges = as.double(as.scalar(output[nrow(output), 2]))
+ output = output[, 3:ncol(output)]
}
-
addition = function(Frame[Unknown] top, Frame[Unknown] allOps)
return (Frame [Unknown] child)
{
- c = as.scalar(sample(nrow(allOps), 1))
- # place_to_add = as.scalar(sample(ncol(top), 1))
- # if(place_to_add == 1)
- child = cbind(allOps[c, 1], top)
- # else
- # {
- # start = top[, 1:place_to_add-1]
- # end = top[, place_to_add:ncol(top)]
- # child = cbind(cbind(start, allOps[c, 1]), end)
- # }
+ child = cbind(allOps, top)
}
-
mutation = function(Frame[Unknown] child)
return (Frame [Unknown] mChild)
{
@@ -198,14 +186,32 @@ removal = function(Frame[Unknown] child)
return (Frame[Unknown] child)
{
random = as.scalar(rand(rows=1, cols=1))
- print("before removal")
- print(toString(child))
if(ncol(child) >= 2)
{
idx = as.scalar(sample(ncol(child), 1))
child[1, idx] = as.frame(0)
child = removeEmpty(target=child, margin="cols")
}
- print("after removal")
- print(toString(child))
}
+
+getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer
minValue)
+ return (Frame[String] allOps, Frame[String] ref) {
+
+ # # # TODO fix the following hard-coded condition by taking a file input
+ # # allOps are the operations which are randomly added to a population; for
now I am reusing the param file
+ # # so the map condition removes the operations which should not be added
twice in a pipeline, i.e., dummycoding
+ # # for regression, class imbalance operators are also removed
+ if(n > 0 & minValue >= 1 & dist <= 15) {
+ allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\"))?x:\"0\"")
+ ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1,
cols=4)
+ }
+ else {
+ allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\") &
+ !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
+ # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
+ ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
+ }
+ allOps = removeEmpty(target=allOps, margin="rows")
+}
\ No newline at end of file
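The heart of the seeding change in enumerateLogical is that all per-child random draws (which action to apply, and which operator an addition inserts) are pre-sampled once from the user-provided seed, then addressed by a fixed index per iteration and child. A minimal standalone sketch of that indexing scheme with illustrative sizes:

  seed = 42
  populationSize = 10
  max_iter = 3
  nOps = 7                                          # assumed no. of candidate operators
  randomOps = sample(3, populationSize * max_iter, TRUE, seed)      # 1=addition, 2=mutation, 3=removal
  transitions = sample(nOps, populationSize * max_iter, TRUE, seed) # operator drawn for additions
  for(it in 1:max_iter) {
    for(i in 1:populationSize) {
      idxR = (populationSize * (it - 1)) + i        # fixed position -> reproducible draw
      print("iter "+it+", child "+i+": action="+as.scalar(randomOps[idxR])+", opIdx="+as.scalar(transitions[idxR]))
    }
  }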
diff --git a/scripts/pipelines/scripts/utils.dml
b/scripts/pipelines/scripts/utils.dml
index b0e55bb..b4dd7df 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -157,7 +157,6 @@ return(Frame[Unknown] train, Frame[Unknown] test,
Matrix[Double] M)
# step 1 do the case transformations
print(prefix+" convert strings to lower case");
train = map(train, "x -> x.toLowerCase()")
-
# step 2 fix invalid lengths
# q0 = 0.05
# q1 = 0.95
@@ -201,7 +200,21 @@ return(Frame[Unknown] train, Frame[Unknown] test,
Matrix[Double] M)
test[, i] = correctTyposApply(test[, i], ft, dt, dm, fr);
}
}
-
+ # # step 7 convert date to decimal
+ isDate = map(train[1:10], "x -> UtilFunctions.isDateColumn(x)")
+ isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0)
+ isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1,
cols=ncol(schema)))
+ if(sum(isDate) > 0) {
+ print(prefix+" changing date to timestamp")
+ dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))),
margin="cols")
+ for(i in 1:ncol(dateColIdx))
+ {
+ idx = as.scalar(dateColIdx[i])
+ train[, idx] = map(train[, idx], "x -> UtilFunctions.getTimestamp(x)",
margin=2)
+ if(length(test) > 0)
+ test[, idx] = map(test[, idx], "x -> UtilFunctions.getTimestamp(x)",
margin=2)
+ }
+ }
# TODO add deduplication
print(prefix+" deduplication via entity resolution");
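The new date-handling step in stringProcessing probes the first rows of each column with UtilFunctions.isDateColumn and, for matching STRING columns, rewrites the values to timestamps via UtilFunctions.getTimestamp. A small sketch on a one-column frame, assuming the illustrative values match one of the registered DATE_FORMATS patterns:

  F = frame(["01/21/2020", "03/11/2021", "12/01/2019"], rows=3, cols=1)
  isDate = map(F[1:3], "x -> UtilFunctions.isDateColumn(x)")
  isDate = replace(target=as.matrix(isDate), pattern=NaN, replacement=0)
  if(sum(isDate) > 0)
    F = map(F, "x -> UtilFunctions.getTimestamp(x)", margin=2)
  print(toString(F))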
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index 17db8f6..cb7d19b 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
@@ -493,6 +493,8 @@ public class UtilFunctions {
}
public static double objectToDoubleSafe(ValueType vt, Object in) {
+ if(vt == ValueType.STRING && in == null)
+ return 0.0;
if(vt == ValueType.STRING && !NumberUtils.isCreatable((String)
in)) {
return 1.0;
} else return objectToDouble(vt, in);
@@ -909,11 +911,13 @@ public class UtilFunctions {
if (maxMatches <= 0 || dateCol < 0){
//ERROR - no date column found
- throw new DMLRuntimeException("No date column found.");
+ System.out.println("No date column in the dataset");
}
return dateCol;
}
-
+ public static String isDateColumn (String values) {
+ return DATE_FORMATS.keySet().parallelStream().anyMatch(e ->
values.toLowerCase().matches(e))?"1":"0";
+ }
public static String[] getDominantDateFormat (String[] values) {
String[] output = new String[values.length];
Map<String, String> date_formats = DATE_FORMATS;
diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
index 9f8a681..320eb12 100644
--- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
+++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml
@@ -95,7 +95,7 @@ return(Matrix[Double] output, Matrix[Double] error)
}
else {
beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]),
reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
- maxi=as.scalar(evalFunHp[1,4]), maxii=0, verbose=FALSE);
+ maxi=1000, maxii=0, verbose=FALSE);
[prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
error = yhat != Ytest
a = getAccuracy(Ytest, yhat, TRUE)
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index fd464fe..5a57be4 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-NA,dummycodingApply
-NA,dummycodingApply
-NA,dummycodingApply
+NA,dummycodingApply,0,0
+NA,dummycodingApply,0,0
+imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index ae312ae..274dcca 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-73.73188405797102
-69.7463768115942
-69.02173913043478
+70.83333333333334
+69.38405797101449
+68.65942028985508
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
index dcb46fe..ec20472 100644
---
a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv
@@ -1 +1 @@
-2.0,10.0,0.001,1000.0
+2.0,10.0,0.001
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index ef64dd0..231e789 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-14.0,1.0,0.2750943835009122,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-14.0,1.0,0.4614295314769764,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-14.0,1.0,0.49358019629519945,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.44724177618347905,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.3017247635995244,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.016068274841623598,0.9737026111609255,0,0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git
a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index bdfc48a..d6f15ed 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-underSampling,dummycoding
-underSampling,dummycoding
-underSampling,dummycoding
+underSampling,dummycoding,0,0
+underSampling,dummycoding,0,0
+imputeByMean,winsorize,scale,dummycoding
diff --git
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
index d7b7ef0..46d28d3 100644
---
a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
+++
b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
@@ -1,5 +1,5 @@
-imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0
-imputeByMeanApply,imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0
-imputeByFdApply,outlierBySdApply,dummycodingApply,dummycodingApply,0,0,0,0,0
-imputeByFdApply,outlierBySdApply,dummycodingApply,dummycodingApply,0,0,0,0,0
-imputeByMeanApply,imputeByFdApply,outlierBySdApply,dummycodingApply,0,0,0,0,0
+outlierBySdApply,fillDefaultApply,outlierByIQRApply,scaleApply,0,0,0
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
+miceApply,normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
+normalizeApply,outlierByIQRApply,winsorizeApply,forward_fill,imputeByMeanApply,scaleApply,0
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index 02c1429..890dd9b 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -92,7 +92,7 @@ testY = eY[split+1:nrow(eX),]
[bestLogical, converged] = lg::enumerateLogical(X=trainX, y=trainY,
Xtest=testX, ytest=testY,
- seed=logical, max_iter=max_iter, metaList = metaList,
evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
+ initial_population=logical, seed = 42, max_iter=max_iter, metaList =
metaList, evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
param=param,
cv=FALSE, verbose=TRUE)
diff --git
a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 74bea3d..56a82c8 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -59,7 +59,7 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)]
# [topKPipelines, topKHyperParams, topKScores, bestLogical, features,
dirtyScore, evalHp] =
result = topk_cleaning(dataTrain=trainData, dataTest=testData,
metaData=metaInfo, primitives=primitives, parameters=param,
evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK,
resource_val=resources,
- expectedIncrease=expectedIncrease, max_iter=max_iter, cv=testCV, cvk=cvk,
sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
+ expectedIncrease=expectedIncrease, seed = 42, max_iter=max_iter, cv=testCV,
cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
write(result, $O)