This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 9f2aa4e103 [MINOR] Inclusion of error-based sub-sampling in cleaning pipelines
9f2aa4e103 is described below
commit 9f2aa4e1035c6f80ca09db27b2b740313b18b880
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Nov 30 12:46:44 2022 +0100
[MINOR] Inclusion of error-based sub-sampling in cleaning pipelines
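
Error-based sub-sampling replaces plain class-stratified sampling in
topk_cleaning when the training data is mostly categorical (sum(mask) >
ncol(mask)/2), has at least 10000 rows, and no explicit sampling fraction
is requested (sample == 1.0). In essence, the new utils::doErrorSample
fits a multinomial logistic regression on the encoded data and keeps only
the rows whose prediction confidence falls into the lower or upper
quantile tail (new defaults lq=0.1, uq=0.7), i.e., the error-prone and
the most confident examples:

  beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
  [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
  # keep only the low-confidence and high-confidence tails
  filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
  sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
  sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)

Related adjustments: abstain and tomeklink are now applied only to binary
targets (max(Y) <= 2), mice uses fewer iterations (20 instead of 50), and
fit_pipeline takes a new cvk parameter and reports a cross-validated train
score via bandit::crossV.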
---
scripts/builtin/abstain.dml | 2 +-
scripts/builtin/fit_pipeline.dml | 13 +-
scripts/builtin/mice.dml | 2 +-
scripts/builtin/tomeklink.dml | 12 +-
scripts/builtin/topk_cleaning.dml | 19 +-
scripts/pipelines/scripts/enumerateLogical.dml | 5 +-
scripts/pipelines/scripts/utils.dml | 194 +++++++++++++++++++--
.../functions/pipelines/fit_pipelineTest.dml | 2 +-
.../functions/pipelines/topkLogicalTest.dml | 6 +-
9 files changed, 217 insertions(+), 38 deletions(-)
diff --git a/scripts/builtin/abstain.dml b/scripts/builtin/abstain.dml
index 5a4c354327..6e0cb51634 100644
--- a/scripts/builtin/abstain.dml
+++ b/scripts/builtin/abstain.dml
@@ -41,7 +41,7 @@ return (Matrix[Double] Xout, Matrix[Double] Yout)
{
Xout = X
Yout = Y
- if(min(Y) != max(Y))
+ if(min(Y) != max(Y) & max(Y) <= 2)
{
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, verbose=verbose)
[prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)
diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml
index 96023f7b49..e31bf65676 100644
--- a/scripts/builtin/fit_pipeline.dml
+++ b/scripts/builtin/fit_pipeline.dml
@@ -47,7 +47,7 @@ source("scripts/builtin/topk_cleaning.dml") as topk;
source("scripts/builtin/bandit.dml") as bandit;
s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData,
Frame[Unknown] metaData = as.frame("NULL"),
- Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp,
+ Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
@@ -92,6 +92,11 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
+ [trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
+ pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
+ print("train score cv: "+toString(trainScore))
+
+
# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
@@ -99,15 +104,15 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")
- score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
- trainAccuracy = as.scalar(score[1, 1])
+ # score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
+ # trainAccuracy = as.scalar(score[1, 1])
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])
scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
- scores[1, 2] = trainAccuracy
+ # scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index ca2c4592e6..8d7b1af69e 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -142,7 +142,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
}
else {
beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
- maxi = 50, maxii=50, verbose=FALSE)
+ maxi = 20, maxii=20, verbose=FALSE)
# predicting missing values
[prob, pred, acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
prob = rowMaxs(prob)
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 9bb72f007d..413b823e9f 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -82,9 +82,11 @@ return (Matrix[Double] nn) {
get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
return (Matrix[Double] tomek_links) {
tomek_links = matrix(-1, 1, 1)
- nn = get_nn(X)
- perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
- nn_labels = perm %*% y
- links = (y != majority_label) & (nn_labels == majority_label)
- tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
+ if(max(y) <= 2 ) {
+ nn = get_nn(X)
+ perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
+ nn_labels = perm %*% y
+ links = (y != majority_label) & (nn_labels == majority_label)
+ tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
+ }
}
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index f2353e9919..d72f300b92 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -27,8 +27,10 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
- Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10,
- Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
+ Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20,
+ Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2,
+ Boolean isLastLabel = TRUE,
+ Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores,
Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc)
{
@@ -71,13 +73,16 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
[Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, cv, correctTypos, ctx)
# # if mask has 1s then there are categorical features
print("---- feature transformations to numeric matrix");
- [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
+ [eXtrain, eXtest, metaR] = recodeData(Xtrain, Xtest, mask, cv, "recode")
# # # do the early dropping
# [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
- [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, TRUE)
+ if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0)
+ [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq)
+ else
+ [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
# # # create logical pipeline seeds
@@ -110,14 +115,14 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
[bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
- dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
+ dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=FALSE, ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
- topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
+ topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); applyFunc = as.frame("NULL")
# write(acc, output+"/acc.csv", format="csv")
# stop("end of enumlp")
[topKPipelines, topKHyperParams, topKScores, applyFunc] = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=TRUE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=FALSE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s");
}
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 19de3a6fe4..cb933787b4 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -275,14 +275,15 @@ getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Int
# # for regression class imbalance operators are also removed
if(n > 0 & minValue >= 1 & dist <= 15) {
allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") & !x.equals(\"frequencyEncode\")
- & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\"))?x:\"0\"")
+ & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\") &
+ !x.equals(\"abstain\") & !x.equals(\"underSampling\") & !x.equals(\"flipLabels\") & !x.equals(\"mice\") & !x.equals(\"SMOTE\"))?x:\"0\"")
ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1, cols=4)
}
else {
allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") & !x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
& !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & !x.equals(\"ppca\") &
!x.equals(\"abstain\") & !x.equals(\"underSampling\") & !x.equals(\"flipLabels\") & !x.equals(\"mice\") & !x.equals(\"SMOTE\"))?x:\"0\"")
- ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
+ ref = frame(["imputeByMean", "winsorize", "scale", "dummycoding"], rows=1, cols=4)
}
if(as.scalar(refSol[1,1]) == "NaN")
refSol = ref
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index e9d5488511..a4f9f6c9b7 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -50,14 +50,116 @@ return (Frame[Unknown] frameblock)
}
+# # #######################################################################
+# # # Function for group-wise/stratified sampling from all classes in labelled dataset
+# # # Inputs: The input dataset X, Y and sampling ratio between 0 and 1
+# # # Output: sample X and Y
+# # #######################################################################
+# # doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
+ # # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] filterMask)
+# # {
+ # # print("initial number of rows: " +nrow(eX))
+ # # # # # prepare feature vector for NB
+ # # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50, maxii=50, verbose=FALSE);
+ # # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+
+ # # # # if the operation is binary make a fixed confidence of 0.9, for multi-class compute kappa
+ # # # threshold = 0
+ # # # if(max(eY) == 2)
+ # # # threshold = quantile(rowMaxs(trainProbs), 0.95)
+ # # kappa = 0.0
+ # # # if(max(eY) <= 2) {
+ # # # kappa = quantile(rowMaxs(trainProbs), 0.95)
+ # # # print("for binary classification")
+ # # # }
+ # # # else {
+ # # # # compute kappa
+ # # classFreA = table(eY, 1, 1, max(eY), 1)
+ # # classFreP = table(yhat, 1, 1, max(eY), 1)
+ # # probA = classFreA/nrow(eY)
+ # # probP = classFreP/nrow(eY)
+ # # condProb = sum(probA * probP)
+ # # kappa = ((accuracy/100) - condProb) / (1 - condProb)
+ # # print("kappa for multi-class"+toString(kappa))
+ # # # }
+ # # print("threshold "+toString(kappa))
+ # # filterMask = rowMaxs(trainProbs) > kappa
+ # # # sampledX = removeEmpty(target = eX, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # # # sampledY = removeEmpty(target = eY, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # # # print("filtered number of rows: " +nrow(sampledX))
+
+ # # mask[1,1] = 0
+ # # # # # stats of wrong
+ # # maxUniques = max(colMaxs(replace(target=eX, pattern=NaN, replacement=1)) * mask)
+ # # print("maxUniques "+maxUniques)
+ # # while(FALSE){}
+ # # stats = matrix(0, rows=maxUniques, cols=ncol(mask))
+ # # metaInfo = frame(0, rows=nrow(metaR), cols = 2*ncol(metaR))
+ # # # m = 1
+ # # for(i in 1:ncol(mask))
+ # # {
+ # # print("meta: "+as.scalar(mask[1, i]))
+ # # if(as.scalar(mask[1, i]) == 1)
+ # # {
+ # # problematic_cats = removeEmpty(target=eX[, i], margin = "rows", select = (yhat != eY))
+ # # problematic_cats_sums = table(problematic_cats, 1)
+ # # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
+ # # stats_rowMax = rowMaxs(stats)
+ # # stats2 = (stats == stats_rowMax) * (stats_rowMax >= 100)
+ # # # colum = metaR[, i]
+ # # # print("printing meta recoded")
+ # # # print(toString(colum))
+ # # # while(FALSE){}
+ # # # tmpValue = map(colum, "x -> x.toLowerCase()")
+ # # # tmpIndex = map(colum, "x -> x.toLowerCase()")
+ # # # metaInfo[1:nrow(tmpIndex), m] = tmpIndex
+ # # # metaInfo[1:nrow(tmpIndex), m+1] = tmpValue
+ # # # m = m + 2
+ # # }
+ # # }
+ # # filterMask = eX[, 4] == 2 | eX[, 5] == 4 | eX[, 5] == 7 | eX[, 5] == 8
+ # # filterMask = filterMask == 0
+ # # # stats = cbind(seq(1, nrow(stats)), stats, stats_rowMax)
+ # # # stats2 = cbind(seq(1, nrow(stats)), stats2)
+ # # # print("print status: \n"+toString(stats))
+ # # # print("print status 2: \n"+toString(stats2))
+ # # # print("meta infor: \n"+toString(metaInfo, rows=10))
+ # # # # create the filter mask
+ # # print("rows taken after filtering the categories: "+sum(filterMask))
+ # # MIN_SAMPLE = 1000
+ # # sampledX = eX
+ # # sampledY = eY
+ # # ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
+ # # sampled = floor(nrow(eX) * ratio)
+
+ # # if(sampled > MIN_SAMPLE & ratio != 1.0)
+ # # {
+ # # sampleVec = sample(nrow(eX), sampled, FALSE, 23)
+ # # P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), nrow(eX))
+ # # if((nrow(eY) > 1)) # for classification
+ # # {
+ # # sampledX = P %*% eX
+ # # sampledY = P %*% eY
+ # # }
+ # # else if(nrow(eY) == 1) { # for clustering
+ # # sampledX = P %*% eX
+ # # sampledY = eY
+ # # }
+ # # print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+ # # }
+
+# # }
+
+
#######################################################################
# Function for group-wise/stratified sampling from all classes in labelled dataset
# Inputs: The input dataset X, Y and sampling ratio between 0 and 1
# Output: sample X and Y
#######################################################################
-doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean verbose = FALSE)
+doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
return (Matrix[Double] sampledX, Matrix[Double] sampledY)
{
+
MIN_SAMPLE = 1000
sampledX = eX
sampledY = eY
@@ -79,9 +181,69 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
}
print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
+}
+
+doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, Double uq)
+ return (Matrix[Double] sampledX, Matrix[Double] sampledY)
+{
+ print("initial number of rows: " +nrow(eX))
+ print("quantiles: "+lq+" "+uq)
+ # # # prepare feature vector for NB
+ beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
+ [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+
+ # kappa = 0.0
+
+ # # compute kappa
+ # classFreA = table(eY, 1, 1, max(eY), 1)
+ # classFreP = table(yhat, 1, 1, max(eY), 1)
+ # probA = classFreA/nrow(eY)
+ # probP = classFreP/nrow(eY)
+ # condProb = sum(probA * probP)
+ # kappa = ((accuracy/100) - condProb) / (1 - condProb)
+ # print("kappa for multi-class"+toString(kappa))
+ # filterMask = rowMaxs(trainProbs) < kappa
+ # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), kappa)
+ # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), kappa)
+ # print("threshold "+toString(threshold))
+
+ print("applying error filter")
+ # sampledX = removeEmpty(target = eX, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ # sampledY = removeEmpty(target = eY, margin = "rows", select=(rowMaxs(trainProbs) < threshold))
+ filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
+ sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
+ sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)
+ print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+
}
+# doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY)
+ # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] filterMask)
+# {
+ # print("initial number of rows: " +nrow(eX))
+ # # # # prepare feature vector for NB
+ # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50, maxii=50, verbose=FALSE);
+ # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
+ # # # # stats of wrong
+ # maxUniques = max(colMaxs(eX) * mask)
+ # stats = matrix(0, rows=nrow(maxUniques), cols=ncol(mask))
+ # for(i in 1:ncol(mask))
+ # {
+ # if(as.scalar(mask[1, i]) == 1)
+ # {
+ # problematic_cats = removeEmpty(target=eX[, i], margin = rows, select = (yhat != eY))
+ # problematic_cats_sums = table(problematic_cats, 1)
+ # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
+ # }
+
+ # }
+ # print(toString(stats))
+
+
+# }
+
+
# #######################################################################
# # Wrapper of transformencode OHE call, to call inside eval as a function
# # Inputs: The input dataset X, and mask of the columns
@@ -132,13 +294,16 @@ stringProcessing = function(Frame[Unknown] data, Matrix[Double] mask,
Frame[String] schema, Boolean CorrectTypos, List[Unknown] ctx = list(prefix="--"))
return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictionary, Matrix[Double] dateColIdx)
{
-
+ hasCategory = sum(mask) > 0
prefix = as.scalar(ctx["prefix"]);
distanceMatrix = list()
dictionary = list()
+
# step 1 do the case transformations
print(prefix+" convert strings to lower case");
- data = map(data, "x -> x.toLowerCase()")
+ if(hasCategory) {
+ data = map(data, "x -> x.toLowerCase()")
+
# step 2 fix invalid lengths
# q0 = 0.05
# q1 = 0.95
@@ -152,14 +317,15 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
# data = valueSwap(data, schema)
# step 3 drop invalid types
- print(prefix+" drop values with type mismatch");
- data = dropInvalidType(data, schema)
-
+ print(prefix+" drop values with type mismatch");
+ data = dropInvalidType(data, schema)
+
- # step 5 porter stemming on all features
- print(prefix+" porter-stemming on all features");
- data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ # step 5 porter stemming on all features
+ print(prefix+" porter-stemming on all features");
+ data = map(data, "x -> PorterStemmer.stem(x)", 0)
+ }
# step 6 typo correction
if(CorrectTypos)
{
@@ -204,13 +370,13 @@ return(Frame[Unknown] data)
data = map(data, "x -> x.toLowerCase()")
# step 2 fix invalid lengths
- q0 = 0.05
- q1 = 0.95
+ # q0 = 0.05
+ # q1 = 0.95
- [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+ # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
- # # step 3 fix swap values
- data = valueSwap(data, schema)
+ # # # step 3 fix swap values
+ # data = valueSwap(data, schema)
# step 3 drop invalid types
data = dropInvalidType(data, schema)
diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
index 4244fd79e6..cb265dbdd0 100644
--- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
+++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
@@ -60,7 +60,7 @@ testData = F[split+1:nrow(F),]
print("pipeline: "+toString(pip[1]))
-[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE)
+[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)
diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
index f1ae5f1275..3c6e70cd7b 100644
--- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml
+++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml
@@ -92,9 +92,9 @@ testY = eY[split+1:nrow(eX),]
[bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,
- initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML", dirtyScore = dirtyScore + expectedIncrease,
- evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives, param=param,
- cv=FALSE, verbose=TRUE)
+ initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
+ dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
+ param=param, cv=FALSE, verbose=TRUE)
print("bestLogical "+toString(bestLogical))