This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 9f2aa4e103 [MINOR] Inclusion of error-based sub-sampling in cleaning pipelines
9f2aa4e103 is described below
commit 9f2aa4e1035c6f80ca09db27b2b740313b18b880
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Nov 30 12:46:44 2022 +0100
[MINOR] Inclusion of error-based sub-sampling in cleaning pipelines
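
Error-based sub-sampling replaces plain class-stratified sampling in
topk_cleaning when the training data is mostly categorical (sum(mask) >
ncol(mask)/2), has at least 10000 rows, and no explicit sampling fraction
is requested (sample == 1.0). In essence, the new utils::doErrorSample
fits a multinomial logistic regression on the encoded data and keeps only
the rows whose prediction confidence falls into the lower or upper
quantile tail (new defaults lq=0.1, uq=0.7), i.e., the error-prone and
the most confident examples:

  beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
  [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
  # keep only the low-confidence and high-confidence tails
  filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
  sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
  sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)

Related adjustments: abstain and tomeklink are now applied only to binary
targets (max(Y) <= 2), mice uses fewer iterations (20 instead of 50), and
fit_pipeline takes a new cvk parameter and reports a cross-validated train
score via bandit::crossV.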
---
scripts/builtin/abstain.dml | 2 +-
scripts/builtin/fit_pipeline.dml | 13 +-
scripts/builtin/mice.dml | 2 +-
scripts/builtin/tomeklink.dml | 12 +-
scripts/builtin/topk_cleaning.dml | 19 +-
scripts/pipelines/scripts/enumerateLogical.dml | 5 +-
scripts/pipelines/scripts/utils.dml | 194 +++++++++++++++++++--
.../functions/pipelines/fit_pipelineTest.dml | 2 +-
.../functions/pipelines/topkLogicalTest.dml | 6 +-
9 files changed, 217 insertions(+), 38 deletions(-)
diff --git a/scripts/builtin/abstain.dml b/scripts/builtin/abstain.dml
index 5a4c354327..6e0cb51634 100644
--- a/scripts/builtin/abstain.dml
+++ b/scripts/builtin/abstain.dml
@@ -41,7 +41,7 @@ return (Matrix[Double] Xout, Matrix[Double] Yout)
{
Xout = X
Yout = Y
- if(min(Y) != max(Y))
+ if(min(Y) != max(Y) & max(Y) <= 2)
{
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, verbose=verbose)
[prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)
diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml
index 96023f7b49..e31bf65676 100644
--- a/scripts/builtin/fit_pipeline.dml
+++ b/scripts/builtin/fit_pipeline.dml
@@ -47,7 +47,7 @@ source("scripts/builtin/topk_cleaning.dml") as topk;
source("scripts/builtin/bandit.dml") as bandit;
s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData,
Frame[Unknown] metaData = as.frame("NULL"),
- Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp,
+ Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
@@ -92,6 +92,11 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
+ [trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
+ pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
+ print("train score cv: "+toString(trainScore))
+
+
# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
@@ -99,15 +104,15 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")
- score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
- trainAccuracy = as.scalar(score[1, 1])
+ # score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
+ # trainAccuracy = as.scalar(score[1, 1])
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])
scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
- scores[1, 2] = trainAccuracy
+ # scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index ca2c4592e6..8d7b1af69e 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -142,7 +142,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
}
else {
beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
- maxi = 50, maxii=50, verbose=FALSE)
+ maxi = 20, maxii=20, verbose=FALSE)
# predicting missing values
[prob, pred, acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
prob = rowMaxs(prob)
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 9bb72f007d..413b823e9f 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -82,9 +82,11 @@ return (Matrix[Double] nn) {
get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
return (Matrix[Double] tomek_links) {
tomek_links = matrix(-1, 1, 1)
- nn = get_nn(X)
- perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
- nn_labels = perm %*% y
- links = (y != majority_label) & (nn_labels == majority_label)
- tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
+ if(max(y) <= 2 ) {
+ nn = get_nn(X)
+ perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
+ nn_labels = perm %*% y
+ links = (y != majority_label) & (nn_labels == majority_label)
+ tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
+ }
}
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index f2353e9919..d72f300b92 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -27,8 +27,10 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
- Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10,
- Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
+ Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20,
+ Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2,
+ Boolean isLastLabel = TRUE,
+ Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores,
Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc)
{
@@ -71,13 +73,16 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
[Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, cv, correctTypos, ctx)
# # if mask has 1s then there are categorical features
print("---- feature transformations to numeric matrix");
- [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
+ [eXtrain, eXtest, metaR] = recodeData(Xtrain, Xtest, mask, cv, "recode")
# # # do the early dropping
# [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
- [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, TRUE)
+ if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0)
+ [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq)
+ else
+ [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
# # # create logical pipeline seeds
@@ -110,14 +115,14 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
[bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
- dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
+ dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=FALSE, ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
- topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
+ topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); applyFunc = as.frame("NULL")
# write(acc, output+"/acc.csv", format="csv")
# stop("end of enumlp")
[topKPipelines, topKHyperParams, topKScores, applyFunc] = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=TRUE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=FALSE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s");
}
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 19de3a6fe4..cb933787b4 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -275,14 +275,15 @@ getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Int
# # for regression class imbalance operators are also removed
if(n > 0 & minValue >= 1 & dist <= 15) {
allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") & !x.equals(\"frequencyEncode\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\"))?x:\"0\"")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\") &
+ !x.equals(\"abstain\") & !x.equals(\"underSampling\") & !x.equals(\"flipLabels\") & !x.equals(\"mice\") & !x.equals(\"SMOTE\"))?x:\"0\"")
ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1, cols=4)
}
else {
allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") & !x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
& !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\") &
!x.equals(\"abstain\") & !x.equals(\"underSampling\") & !x.equals(\"flipLabels\") & !x.equals(\"mice\") & !x.equals(\"SMOTE\"))?x:\"0\"")
- ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
+ ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1, cols=4)
}
if(as.scalar(refSol[1,1]) == "NaN")
refSol = ref
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index e9d5488511..a4f9f6c9b7 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -50,14 +50,116 @@ return (Frame[Unknown] frameblock)
}
+# # #######################################################################
+# # # Function for group-wise/stratified sampling from all classes in labelled dataset
+# # # Inputs: The input dataset X, Y and sampling ratio between 0 and 1
+# # # Output: sample X and Y
+# # #######################################################################
+# # doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
+ # # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] filterMask)
+# # {
+ # # print("initial number of rows: " +nrow(eX))
+ # # # # # prepare feature vector for NB
+ # # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50, maxii=50, verbose=FALSE);
+ # # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+
+ # # # # if the operation is binary make a fixed confidence of 0.9, for multi-class compute kappa
+ # # # threshold = 0
+ # # # if(max(eY) == 2)
+ # # # threshold = quantile(rowMaxs(trainProbs), 0.95)
+ # # kappa = 0.0
+ # # # if(max(eY) <= 2) {
+ # # # kappa = quantile(rowMaxs(trainProbs), 0.95)
+ # # # print("for binary classification")
+ # # # }
+ # # # else {
+ # # # # compute kappa
+ # # classFreA = table(eY, 1, 1, max(eY), 1)
+ # # classFreP = table(yhat, 1, 1, max(eY), 1)
+ # # probA = classFreA/nrow(eY)
+ # # probP = classFreP/nrow(eY)
+ # # condProb = sum(probA * probP)
+ # # kappa = ((accuracy/100) - condProb) / (1 - condProb)
+ # # print("kappa for multi-class"+toString(kappa))
+ # # # }
+ # # print("threshold "+toString(kappa))
+ # # filterMask = rowMaxs(trainProbs) > kappa
+ # # # sampledX = removeEmpty(target = eX, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # # # sampledY = removeEmpty(target = eY, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # # # print("filtered number of rows: " +nrow(sampledX))
+
+ # # mask[1,1] = 0
+ # # # # # stats of wrong
+ # # maxUniques = max(colMaxs(replace(target=eX, pattern=NaN, replacement=1)) * mask)
+ # # print("maxUniques "+maxUniques)
+ # # while(FALSE){}
+ # # stats = matrix(0, rows=maxUniques, cols=ncol(mask))
+ # # metaInfo = frame(0, rows=nrow(metaR), cols = 2*ncol(metaR))
+ # # # m = 1
+ # # for(i in 1:ncol(mask))
+ # # {
+ # # print("meta: "+as.scalar(mask[1, i]))
+ # # if(as.scalar(mask[1, i]) == 1)
+ # # {
+ # # problematic_cats = removeEmpty(target=eX[, i], margin = "rows", select = (yhat != eY))
+ # # problematic_cats_sums = table(problematic_cats, 1)
+ # # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
+ # # stats_rowMax = rowMaxs(stats)
+ # # stats2 = (stats == stats_rowMax) * (stats_rowMax >= 100)
+ # # # colum = metaR[, i]
+ # # # print("printing meta recoded")
+ # # # print(toString(colum))
+ # # # while(FALSE){}
+ # # # tmpValue = map(colum, "x -> x.toLowerCase()")
+ # # # tmpIndex = map(colum, "x -> x.toLowerCase()")
+ # # # metaInfo[1:nrow(tmpIndex), m] = tmpIndex
+ # # # metaInfo[1:nrow(tmpIndex), m+1] = tmpValue
+ # # # m = m + 2
+ # # }
+ # # }
+ # # filterMask = eX[, 4] == 2 | eX[, 5] == 4 | eX[, 5] == 7 | eX[, 5] == 8
+ # # filterMask = filterMask == 0
+ # # # stats = cbind(seq(1, nrow(stats)), stats, stats_rowMax)
+ # # # stats2 = cbind(seq(1, nrow(stats)), stats2)
+ # # # print("print status: \n"+toString(stats))
+ # # # print("print status 2: \n"+toString(stats2))
+ # # # print("meta infor: \n"+toString(metaInfo, rows=10))
+ # # # # create the filter mask
+ # # print("rows taken after filtering the categories: "+sum(filterMask))
+ # # MIN_SAMPLE = 1000
+ # # sampledX = eX
+ # # sampledY = eY
+ # # ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
+ # # sampled = floor(nrow(eX) * ratio)
+
+ # # if(sampled > MIN_SAMPLE & ratio != 1.0)
+ # # {
+ # # sampleVec = sample(nrow(eX), sampled, FALSE, 23)
+ # # P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), nrow(eX))
+ # # if((nrow(eY) > 1)) # for classification
+ # # {
+ # # sampledX = P %*% eX
+ # # sampledY = P %*% eY
+ # # }
+ # # else if(nrow(eY) == 1) { # for clustering
+ # # sampledX = P %*% eX
+ # # sampledY = eY
+ # # }
+ # # print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+ # # }
+
+# # }
+
+
#######################################################################
# Function for group-wise/stratified sampling from all classes in labelled dataset
# Inputs: The input dataset X, Y and sampling ratio between 0 and 1
# Output: sample X and Y
#######################################################################
-doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean verbose = FALSE)
+doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
return (Matrix[Double] sampledX, Matrix[Double] sampledY)
{
+
MIN_SAMPLE = 1000
sampledX = eX
sampledY = eY
@@ -79,9 +181,69 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
}
print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
+}
+
+doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, Double uq)
+ return (Matrix[Double] sampledX, Matrix[Double] sampledY)
+{
+ print("initial number of rows: " +nrow(eX))
+ print("quantiles: "+lq+" "+uq)
+ # # # prepare feature vector for NB
+ beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
+ [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+
+ # kappa = 0.0
+
+ # # compute kappa
+ # classFreA = table(eY, 1, 1, max(eY), 1)
+ # classFreP = table(yhat, 1, 1, max(eY), 1)
+ # probA = classFreA/nrow(eY)
+ # probP = classFreP/nrow(eY)
+ # condProb = sum(probA * probP)
+ # kappa = ((accuracy/100) - condProb) / (1 - condProb)
+ # print("kappa for multi-class"+toString(kappa))
+ # filterMask = rowMaxs(trainProbs) < kappa
+ # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), kappa)
+ # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), kappa)
+ # print("threshold "+toString(threshold))
+
+ print("applying error filter")
+ # sampledX = removeEmpty(target = eX, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # sampledY = removeEmpty(target = eY, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
+ sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
+ sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)
+ print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+
}
+# doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY)
+ # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] filterMask)
+# {
+ # print("initial number of rows: " +nrow(eX))
+ # # # # prepare feature vector for NB
+ # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50, maxii=50, verbose=FALSE);
+ # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+ # # # # stats of wrong
+ # maxUniques = max(colMaxs(eX) * mask)
+ # stats = matrix(0, rows=nrow(maxUniques), cols=ncol(mask))
+ # for(i in 1:ncol(mask))
+ # {
+ # if(as.scalar(mask[1, i]) == 1)
+ # {
+ # problematic_cats = removeEmpty(target=eX[, i], margin = rows, select = (yhat != eY))
+ # problematic_cats_sums = table(problematic_cats, 1)
+ # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
+ # }
+
+ # }
+ # print(toString(stats))
+
+
+# }
+
+
# #######################################################################
# # Wrapper of transformencode OHE call, to call inside eval as a function
# # Inputs: The input dataset X, and mask of the columns
@@ -132,13 +294,16 @@ stringProcessing = function(Frame[Unknown] data, Matrix[Double] mask,
Frame[String] schema, Boolean CorrectTypos, List[Unknown] ctx = list(prefix="--"))
return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictionary, Matrix[Double] dateColIdx)
{
-
+ hasCategory = sum(mask) > 0
prefix = as.scalar(ctx["prefix"]);
distanceMatrix = list()
dictionary = list()
+
# step 1 do the case transformations
print(prefix+" convert strings to lower case");
- data = map(data, "x -> x.toLowerCase()")
+ if(hasCategory) {
+ data = map(data, "x -> x.toLowerCase()")
+
# step 2 fix invalid lengths
# q0 = 0.05
# q1 = 0.95
@@ -152,14 +317,15 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
# data = valueSwap(data, schema)
# step 3 drop invalid types
- print(prefix+" drop values with type mismatch");
- data = dropInvalidType(data, schema)
-
+ print(prefix+" drop values with type mismatch");
+ data = dropInvalidType(data, schema)
+
- # step 5 porter stemming on all features
- print(prefix+" porter-stemming on all features");
- data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ # step 5 porter stemming on all features
+ print(prefix+" porter-stemming on all features");
+ data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ }
# step 6 typo correction
if(CorrectTypos)
{
@@ -204,13 +370,13 @@ return(Frame[Unknown] data)
data = map(data, "x -> x.toLowerCase()")
# step 2 fix invalid lengths
- q0 = 0.05
- q1 = 0.95
+ # q0 = 0.05
+ # q1 = 0.95
- [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+ # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
- # # step 3 fix swap values
- data = valueSwap(data, schema)
+ # # # step 3 fix swap values
+ # data = valueSwap(data, schema)
# step 3 drop invalid types
data = dropInvalidType(data, schema)
diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
index 4244fd79e6..cb265dbdd0 100644
--- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
+++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
@@ -60,7 +60,7 @@ testData = F[split+1:nrow(F),]
print("pipeline: "+toString(pip[1]))
-[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE)
+[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index f1ae5f1275..3c6e70cd7b 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -92,9 +92,9 @@ testY = eY[split+1:nrow(eX),]
[bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,
- initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
- evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives, param=param,
- cv=FALSE, verbose=TRUE)
+ initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
+ dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
+ param=param, cv=FALSE, verbose=TRUE)
print("bestLogical "+toString(bestLogical))