This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 3d1ba3c3b53f48d971c358a99f7c8b3e0b609c2b Author: Matthias Boehm <[email protected]> AuthorDate: Fri Sep 3 00:15:05 2021 +0200 [SYSTEMDS-3115,3120] Implements cleaning pipeline enumeration scripts * Parfor parallelization logical pipeline enumeration * Various vectorization in the correctTypos builtin --- scripts/builtin/correctTypos.dml | 51 +++++--------------------- scripts/pipelines/scripts/enumerateLogical.dml | 25 ++++++------- 2 files changed, 21 insertions(+), 55 deletions(-) diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml index 9f95a4d..45d3861 100644 --- a/scripts/builtin/correctTypos.dml +++ b/scripts/builtin/correctTypos.dml @@ -123,11 +123,7 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double A = ascii_matrix[1:as.scalar(lengths[i,1]), i]; B = ascii_matrix[1:as.scalar(lengths[j,1]), j]; d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE); - if (d == -1) { - distance_matrix[i, j] = 42000; - } else { - distance_matrix[i, j] = d; - } + distance_matrix[i, j] = ifelse(d == -1, 42000, d); } } } @@ -178,36 +174,26 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double } } - replaceStrings = function(String replacement, String to_replace, Frame[String] strings) return(Frame[String] strings) { - for (i in 1:nrow(strings)) { - if (as.scalar(strings[i,]) == to_replace) { - strings[i,] = replacement; - } - } + strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s"); } - insertOrIncrement = function(String str, Frame[Unknown] dict) return(Frame[Unknown] dict) { i = 1; - ret = FALSE; break = FALSE; while (i <= nrow(dict) & !break) { if (as.scalar(dict[i, 1]) == str) { - value = as.integer(as.scalar(dict[i, 2])) + 1; - dict[i, 2] = value; - contains = TRUE; + dict[i, 2] = as.frame(as.integer(as.scalar(dict[i, 2])) + 1); break = TRUE; } i = i + 1; } - if (!break) { + if (!break) dict = rbind(dict, cbind(as.frame(str), as.frame(1))); - } } @@ -216,19 +202,11 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1); dl_matrix[length(A) + 1, length(B) + 1] = -1; - - for (j in 2:length(B) + 1) { - dl_matrix[1, j] = j - 1; - } - + dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1); dl_matrix[2, 1] = 1; for (j in 2:length(B) + 1) { - if (as.scalar(A[1]) == as.scalar(B[j - 1])) { - cost = 0; - } else { - cost = 1; - } + cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1])) dl_matrix[2, j] = min(min( dl_matrix[2, j - 1] + 1, dl_matrix[1, j] + 1), @@ -241,23 +219,14 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d i += 1; dl_matrix[i, 1] = i - 1; - - if (as.scalar(A[i - 1]) == as.scalar(B[1])) { - cost = 0; - } else { - cost = 1; - } + cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1])) dl_matrix[i, 2] = min(min( dl_matrix[i - 1, 2] + 1, dl_matrix[i, 1] + 1), dl_matrix[i - 1, 1] + cost); for (j in 3:length(B) + 1) { - if (as.scalar(A[i - 1]) == as.scalar(B[j - 1])) { - cost = 0; - } else { - cost = 1; - } + cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1])) if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) { dl_matrix[i, j] = min(min( dl_matrix[i, j - 1] + 1, @@ -272,9 +241,7 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d } } - if( min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound) { - break_condition = TRUE; - } + break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound; } if (is_verbose){ diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml index f894c4e..1133eb7 100644 --- a/scripts/pipelines/scripts/enumerateLogical.dml +++ b/scripts/pipelines/scripts/enumerateLogical.dml @@ -67,9 +67,9 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T) { physicalPipList = list() logicalPipList = list() + # # # get the physical instances from logical ones - for(i in 1:nrow(population)) - { + for(i in 1:nrow(population)) { lv = as.integer(as.scalar(population[i, 1])) + 1 lp = population[i, 2:lv] physicalConf = bandit::get_physical_configurations(lp, num_inst, primitives) @@ -77,23 +77,22 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T) logicalPipList = append(logicalPipList, lp) } - scores = matrix(0, rows=length(physicalPipList), cols=1) - # # # execute the physical pipelines - for(i in 1:length(physicalPipList)) - { - physicalConf = as.frame(physicalPipList[i]) - lp = as.frame(logicalPipList[i]) + scores = matrix(0, length(physicalPipList), 1) + # TODO better parfor-dep handling of multi-assignments to avoid check=0 + parfor(i in 1:length(physicalPipList), check=0) { + lp2 = as.frame(logicalPipList[i,1]) + pp2 = as.frame(physicalPipList[i,1]) # # append configuration keys for extracting the pipeline later on - id = seq(1, nrow(physicalConf)) - physicalConf = cbind(as.frame(id), physicalConf) + id = seq(1, nrow(pp2)) + idpp = cbind(as.frame(id), pp2) # # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times - [outPip,outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp, physicalConf, num_exec, X, y, Xtest, ytest, metaList, + [outPip, outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp2, idpp, num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, verbose) # # sort the configurations groupwise - max_perf = bandit::getMaxPerConf(outPip, nrow(physicalConf)) - scores[i] = as.matrix(max_perf[1, 1]) + max_perf = bandit::getMaxPerConf(outPip, nrow(pp2)) + scores[i,1] = as.matrix(max_perf[1,1]) } # # select parents and best score
