This is an automated email from the ASF dual-hosted git repository. ssiddiqi pushed a commit to branch Dec2 in repository https://gitbox.apache.org/repos/asf/systemds.git
commit a44002e42081900d21350a60631ddffafc426e72 Author: Shafaq Siddiqi <[email protected]> AuthorDate: Thu Jan 5 09:25:45 2023 +0100 [MINOR] Cleanups in cleaning pipelines --- scripts/builtin/topk_cleaning.dml | 8 +-- scripts/pipelines/scripts/utils.dml | 65 +++++++++++----------- .../intermediates/classification/applyFunc.csv | 6 +- .../intermediates/classification/bestAcc.csv | 6 +- .../intermediates/classification/dirtyScore.csv | 2 +- .../pipelines/intermediates/classification/hp.csv | 6 +- .../pipelines/intermediates/classification/pip.csv | 6 +- 7 files changed, 51 insertions(+), 48 deletions(-) diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index 8fe5b48b1b..ed5a00572e 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -29,7 +29,7 @@ source("scripts/builtin/bandit.dml") as bandit; s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives, Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, - Boolean isLastLabel = TRUE, + Boolean isLastLabel = TRUE, Integer rowCount = 3700, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE) return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc) @@ -43,7 +43,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # prepare meta data # # keeping the meta list format if we decide to add more stuff in metadata [schema, mask, fdMask, maskY] = prepareMeta(dataTrain, metaData) - metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0) + metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0, minFold=0) t2 = time(); print("-- Cleaning - Prepare Metadata: "+(t2-t1)/1e9+"s"); # separate the label @@ -79,8 +79,8 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # apply sampling on training data for pipeline enumeration # TODO why recoding/sampling twice (within getDirtyScore) print("---- class-stratified sampling of feature matrix w/ f="+sample); - if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0) - [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, 3500) + if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # & + [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount) else [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE) t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s"); diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml index b1b1e2086a..45688db883 100644 --- a/scripts/pipelines/scripts/utils.dml +++ b/scripts/pipelines/scripts/utils.dml @@ -86,21 +86,24 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, Double uq, Integer rowCount = 3500) return (Matrix[Double] sampledX, Matrix[Double] sampledY) { - print("initial number of rows: " +nrow(eX)) - print("quantiles: "+lq+" "+uq) - # # # prepare feature vector for NB - beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE); - [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE) + print("Error filtering") + if(nrow(eY) < rowCount) + filterMask = matrix(1, rows=nrow(eY), cols=1) + else { + # # # prepare feature vector for NB + beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE); + [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE) - print("applying error filter") - filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq) - delta = 0.001 - while(sum(filterMask) < rowCount & nrow(eY) > rowCount) - { - lq = lq + delta - uq = uq - delta + print("applying error filter") filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq) + delta = 0.001 + while(sum(filterMask) < rowCount & nrow(eY) > rowCount) + { + lq = lq + delta + uq = uq - delta + filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq) + } } sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask) sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask) @@ -205,18 +208,18 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona } # # step 7 convert date to decimal dateColIdx = as.matrix(0) - isDate = map(data[1:10], "x -> UtilFunctions.isDateColumn(x)") - isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0) - isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1, cols=ncol(schema))) - if(sum(isDate) > 0) { - print(prefix+" changing date to timestamp") - dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))), margin="cols") - for(i in 1:ncol(dateColIdx)) - { - idx = as.scalar(dateColIdx[i]) - data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) - } - } + # isDate = map(data[1:10], "x -> UtilFunctions.isDateColumn(x)") + # isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0) + # isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1, cols=ncol(schema))) + # if(sum(isDate) > 0) { + # print(prefix+" changing date to timestamp") + # dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))), margin="cols") + # for(i in 1:ncol(dateColIdx)) + # { + # idx = as.scalar(dateColIdx[i]) + # data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) + # } + # } # TODO add deduplication print(prefix+" deduplication via entity resolution"); @@ -261,11 +264,11 @@ return(Frame[Unknown] data) } } # # step 7 convert date to decimal - if(sum(dateColIdx) > 0) { - for(i in 1:ncol(dateColIdx)) - { - idx = as.scalar(dateColIdx[i]) - data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) - } - } + # if(sum(dateColIdx) > 0) { + # for(i in 1:ncol(dateColIdx)) + # { + # idx = as.scalar(dateColIdx[i]) + # data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) + # } + # } } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv index af64dc371a..5db997f9a8 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv @@ -1,3 +1,3 @@ -forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0 -forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0 -forward_fill,winsorizeApply,imputeByMedianApply,NA,winsorizeApply,forward_fill,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0 +outlierBySdApply,winsorizeApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +normalizeApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +imputeByMeanApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index e8747b356c..39fcddd2e1 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -73.731884057971 -73.731884057971 -73.731884057971 +74.87179487179488 +74.87179487179488 +74.87179487179488 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index 4e5b1a5042..fae86940b1 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -61.050724637681164 \ No newline at end of file +74.87179487179488 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 750229f523..49cc34605f 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 [...] -40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 [...] -64.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,1.0,0,0,0,0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 [...] +27.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,2.0,0.05,0.95,0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 [...] +21.0,0,0,0,0,0,0,0,1.0,0.75,0,0,1.0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, [...] +27.0,0,0,0,0,1.0,0,0,0,2.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 [...] diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 228496cbef..3fa4259115 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0 -forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0 -forward_fill,winsorize,imputeByMedian,tomeklink,winsorize,forward_fill,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0 +outlierBySd,winsorize,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +normalize,abstain,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +imputeByMean,outlierBySd,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
