This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 9a7c3c7227403c993f7c7b56c920b05b8ea44228
Author: anuunchin <[email protected]>
AuthorDate: Wed Sep 24 15:01:06 2025 +0200

    [SYSTEMDS-3330] Fixes in generator.py, builtin function fixes
    
    This commit resolves inconsistent behavior in generator.py when generating
    the Python builtin-function wrapper files from the DML scripts.
    
    Closes #2291
---
 scripts/builtin/ampute.dml                         |  1 -
 scripts/builtin/confusionMatrix.dml                |  2 +-
 scripts/builtin/cooccurrenceMatrix.dml             | 13 ++--
 scripts/builtin/decisionTree.dml                   |  4 +-
 scripts/builtin/differenceStatistics.dml           |  5 ++
 scripts/builtin/glove.dml                          | 87 +++++++++++-----------
 scripts/builtin/imputeByKNN.dml                    | 25 +++----
 scripts/builtin/quantizeByCluster.dml              |  5 +-
 scripts/builtin/randomForest.dml                   |  7 +-
 scripts/builtin/shapExplainer.dml                  |  1 +
 scripts/builtin/topk_cleaning.dml                  | 38 +++++++++-
 src/main/python/docs/README.md                     |  2 +-
 src/main/python/docs/requires-docs.txt             |  3 +-
 src/main/python/generator/dml_parser.py            | 16 ++--
 .../python/systemds/operator/algorithm/__init__.py | 36 +++++++++
 .../systemds/operator/algorithm/builtin/ampute.py  | 10 +++
 .../operator/algorithm/builtin/apply_pipeline.py   |  1 -
 .../operator/algorithm/builtin/confusionMatrix.py  |  2 +-
 .../{confusionMatrix.py => cooccurrenceMatrix.py}  | 37 ++++-----
 .../operator/algorithm/builtin/correctTypos.py     |  1 -
 .../algorithm/builtin/correctTyposApply.py         |  1 -
 .../operator/algorithm/builtin/decisionTree.py     |  4 +-
 .../algorithm/builtin/denialConstraints.py         |  1 -
 .../algorithm/builtin/differenceStatistics.py      |  5 ++
 .../systemds/operator/algorithm/builtin/dmv.py     |  1 -
 .../systemds/operator/algorithm/builtin/ema.py     |  1 -
 .../operator/algorithm/builtin/executePipeline.py  | 38 ++++++++--
 .../operator/algorithm/builtin/fit_pipeline.py     |  1 -
 .../algorithm/builtin/fixInvalidLengths.py         |  1 -
 .../algorithm/builtin/fixInvalidLengthsApply.py    |  1 -
 .../operator/algorithm/builtin/frameSort.py        |  1 -
 .../systemds/operator/algorithm/builtin/glove.py   | 67 +++++++++++++++++
 .../operator/algorithm/builtin/imputeByKNN.py      | 19 ++++-
 .../systemds/operator/algorithm/builtin/mdedup.py  |  1 -
 .../algorithm/builtin/quantizeByCluster.py         | 83 +++++++++++++++++++++
 .../operator/algorithm/builtin/randomForest.py     |  7 +-
 .../operator/algorithm/builtin/shapExplainer.py    | 78 +++++++++++++++++++
 .../operator/algorithm/builtin/topk_cleaning.py    | 34 ++++++++-
 .../builtin/{fixInvalidLengthsApply.py => wer.py}  | 28 +++----
 39 files changed, 516 insertions(+), 152 deletions(-)

diff --git a/scripts/builtin/ampute.dml b/scripts/builtin/ampute.dml
index 0da6af20fd..ffc645b5b3 100644
--- a/scripts/builtin/ampute.dml
+++ b/scripts/builtin/ampute.dml
@@ -30,7 +30,6 @@
 # mech         a string [either "MAR", "MNAR", or "MCAR"] specifying the 
missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a 
non-default weight matrix is specified
 # weights      a weight matrix [shape: k-by-m], containing weights that will 
be used to calculate the weighted sum scores. Will be overridden if mech == 
"MCAR"
 # seed         a manually defined seed for reproducible RNG
-
 # 
-------------------------------------------------------------------------------------
 #
 # OUTPUT:
diff --git a/scripts/builtin/confusionMatrix.dml 
b/scripts/builtin/confusionMatrix.dml
index 3ac70fb3f8..b21088f2cf 100644
--- a/scripts/builtin/confusionMatrix.dml
+++ b/scripts/builtin/confusionMatrix.dml
@@ -23,7 +23,7 @@
 # and actual labels. We return both the counts and relative frequency
 # (normalized by sum of true labels)
 #
-# .. code-block::
+# .. code-block:: text
 #
 #                   True Labels
 #                     1    2
diff --git a/scripts/builtin/cooccurrenceMatrix.dml 
b/scripts/builtin/cooccurrenceMatrix.dml
index 86b8b9ca16..590f4ba1e0 100644
--- a/scripts/builtin/cooccurrenceMatrix.dml
+++ b/scripts/builtin/cooccurrenceMatrix.dml
@@ -18,22 +18,21 @@
 # under the License.
 #
 #-------------------------------------------------------------
-#
-# The implementation is based on
+
+# Cleans and processes text data by removing punctuation, converting it to 
lowercase, and reformatting.
+# Adds an index column to the result. The implementation is based on
 # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
 #
-#-------------------------------------------------------------
-
-## Cleans and processes text data by removing punctuation, converting it to 
lowercase, and reformatting.
-## Adds an index column to the result.
 # INPUT:
 # 
------------------------------------------------------------------------------
 # S     (Frame[Unknown]): 1D input data frame containing text data.
 # 
------------------------------------------------------------------------------
+#
 # OUTPUT:
 # 
------------------------------------------------------------------------------
 # result    (Frame[Unknown]): Processed text data with an index column.
 # 
------------------------------------------------------------------------------
+
 processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
     print("processText");
     tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
@@ -172,4 +171,4 @@ f_cooccurrenceMatrix = function(
     [wordPosition, docID] = getWordPosition(processedResult, maxTokens);
     [recodedWordPosition, tableSize, column] = getRecodedMatrix(wordPosition);
     coocMatrix = createCoocMatrix(cbind(docID, recodedWordPosition), 
tableSize, distanceWeighting, symmetric, windowSize);
-}
+}
\ No newline at end of file
diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
index 69bf12af90..94c292d855 100644
--- a/scripts/builtin/decisionTree.dml
+++ b/scripts/builtin/decisionTree.dml
@@ -30,9 +30,9 @@
 #   and the following trees, M would look as follows:
 #
 #   (L1)               |d<5|
-#                     /     \
+#                     /     \\
 #   (L2)           P1:2    |a<7|
-#                          /   \
+#                          /   \\
 #   (L3)                 P2:2 P3:1
 #
 #   --> M :=
diff --git a/scripts/builtin/differenceStatistics.dml 
b/scripts/builtin/differenceStatistics.dml
index 0e9019f096..30f207091e 100644
--- a/scripts/builtin/differenceStatistics.dml
+++ b/scripts/builtin/differenceStatistics.dml
@@ -28,6 +28,11 @@
 # X        First Matrix to compare
 # Y        Second Matrix to compare
 # 
--------------------------------------------------------------------------------
+#
+# OUTPUT:
+# 
-------------------------------------------------------------------------------------
+# stats.   Difference statistics
+# 
-------------------------------------------------------------------------------------
 
 m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y)  {
 
diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml
index fc5ee9bafb..9acf52975c 100644
--- a/scripts/builtin/glove.dml
+++ b/scripts/builtin/glove.dml
@@ -18,6 +18,51 @@
 # under the License.
 #-------------------------------------------------------------
 
+
+# Computes the vector embeddings for words in a large text corpus. 
+#
+# INPUT:
+# 
--------------------------------------------------------------------------------
 
+# input                 1DInput corpus in CSV format.
+# seed                  Random seed for reproducibility.
+# vector_size           Dimensionality of word vectors, V.
+# eta                   Learning rate for optimization, recommended value: 
0.05.
+# alpha                 Weighting function parameter, recommended value: 0.75.
+# x_max                 Maximum co-occurrence value as per the GloVe paper: 
100.
+# tol                   Tolerance value to avoid overfitting, recommended 
value: 1e-4.
+# iterations            Total number of training iterations.
+# print_loss_it         Interval (in iterations) for printing the loss.
+# maxTokens             Maximum number of tokens per text entry.
+# windowSize            Context window size.
+# distanceWeighting     Whether to apply distance-based weighting.
+# symmetric             Determines if the matrix is symmetric (TRUE) or 
asymmetric (FALSE).
+# 
------------------------------------------------------------------------------
+#
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# G                     The word indices and their word vectors, of shape (N, 
V). Each represented as a vector, of shape (1,V)
+# 
------------------------------------------------------------------------------
+
+
+f_glove = function(
+    Frame[Unknown] input,
+    int seed, int vector_size,
+    double alpha, double eta,
+    double x_max,
+    double tol,
+    int iterations,
+    int print_loss_it,
+    Int maxTokens,
+    Int windowSize,
+    Boolean distanceWeighting,
+    Boolean symmetric)
+    return (frame[Unknown] G){
+
+        [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, 
windowSize, distanceWeighting, symmetric);
+        G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, 
alpha, eta, x_max, tol, iterations, print_loss_it);
+}
+
+
 init = function(matrix[double] cooc_matrix, double x_max, double alpha)
   return(matrix[double] weights, matrix[double] log_cooc_matrix){
   E = 2.718281828;
@@ -118,45 +163,3 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, 
frame[Unknown] cooc_i
     print("Given " + iterations + " iterations, " + "stopped (or converged) at 
the " + final_iter + " iteration / error: " + error);
     G = cbind(cooc_index[,2], as.frame(G));
 }
-
-glove = function(
-    Frame[Unknown] input,
-    int seed, int vector_size,
-    double alpha, double eta,
-    double x_max,
-    double tol,
-    int iterations,
-    int print_loss_it,
-    Int maxTokens,
-    Int windowSize,
-    Boolean distanceWeighting,
-    Boolean symmetric)
-    return (frame[Unknown] G){
-
-        /*
-        * Main function to Computes the vector embeddings for words in a large 
text corpus.
-        * INPUT:
-        * 
------------------------------------------------------------------------------
-        * - input (Frame[Unknown]): 1DInput corpus in CSV format.
-        * - seed: Random seed for reproducibility.
-        * - vector_size: Dimensionality of word vectors, V.
-        * - eta: Learning rate for optimization, recommended value: 0.05.
-        * - alpha: Weighting function parameter, recommended value: 0.75.
-        * - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
-        * - tol: Tolerance value to avoid overfitting, recommended value: 1e-4.
-        * - iterations: Total number of training iterations.
-        * - print_loss_it: Interval (in iterations) for printing the loss.
-        * - maxTokens (Int): Maximum number of tokens per text entry.
-        * - windowSize (Int): Context window size.
-        * - distanceWeighting (Boolean): Whether to apply distance-based 
weighting.
-        * - symmetric (Boolean): Determines if the matrix is symmetric (TRUE) 
or asymmetric (FALSE).
-        * 
------------------------------------------------------------------------------
-        * OUTPUT:
-        * 
------------------------------------------------------------------------------
-        * G (Frame[Unknown]): The word indices and their word vectors, of 
shape (N, V). Each represented as a vector, of shape (1,V)
-        * 
------------------------------------------------------------------------------
-        */
-
-        [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, 
windowSize, distanceWeighting, symmetric);
-        G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, 
alpha, eta, x_max, tol, iterations, print_loss_it);
-}
diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml
index 13136ff2c9..edd8e7727d 100644
--- a/scripts/builtin/imputeByKNN.dml
+++ b/scripts/builtin/imputeByKNN.dml
@@ -25,23 +25,16 @@
 # the missing values by column means. Currently, only the column with the most
 # missing values is actually imputed.
 #
-# 
------------------------------------------------------------------------------
 # INPUT:
 # 
------------------------------------------------------------------------------
-# X           Matrix with missing values, which are represented as NaNs
-# method      Method used for imputing missing values with different 
performance
-#             and accuracy tradeoffs:
-#             'dist' (default): Compute all-pairs distances and impute the
-#                               missing values by closest. O(N^2 * #features)
-#             'dist_missing':   Compute distances between data and records with
-#                               missing values. O(N*M * #features), assuming
-#                               that the number of records with MV is M<<N.
-#             'dist_sample':    Compute distances between sample of data and
-#                               records with missing values. O(S*M * #features)
-#                               with M<<N and S<<N, but suboptimal imputation.
-# seed        Root seed value for random/sample calls for deterministic 
behavior
-#             -1 for true randomization
-# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
+# X             Matrix with missing values, which are represented as NaNs
+# method        Method used for imputing missing values with different 
performance and accuracy tradeoffs:\n
+#               - 'dist' (default): Compute all-pairs distances and impute the 
missing values by closest. O(N^2 * #features)
+#               - 'dist_missing': Compute distances between data and records 
with missing values. O(N*M * #features), assuming that the number of records 
with MV is M<<N.
+#               - 'dist_sample': Compute distances between sample of data and 
records with missing values. O(S*M * #features) with M<<N and S<<N, but 
suboptimal imputation.
+#
+# seed          Root seed value for random/sample calls for deterministic 
behavior. -1 for true randomization
+# sample_frac   Sample fraction for 'dist_sample' (value between 0 and 1)
 # 
------------------------------------------------------------------------------
 #
 # OUTPUT:
@@ -136,4 +129,4 @@ compute_missing_values = function (Matrix[Double] X, 
Matrix[Double] filled_matri
     #Get the subset records that need to be imputed
     imputedValue = t(reshaped) %*% aligned
     imputedValue = t(imputedValue)
-}
+}
\ No newline at end of file
diff --git a/scripts/builtin/quantizeByCluster.dml 
b/scripts/builtin/quantizeByCluster.dml
index 824ac35053..20b0b0c89d 100644
--- a/scripts/builtin/quantizeByCluster.dml
+++ b/scripts/builtin/quantizeByCluster.dml
@@ -58,7 +58,7 @@
 #           the product quantization. Only relevant when space_decomp = TRUE.
 # 
------------------------------------------------------------------------------------------
 
-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, 
Integer runs = 10,
+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 
10, Integer runs = 10,
     Integer max_iter = 1000, Double eps = 1e-6, Integer 
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean 
space_decomp=FALSE, Integer seed = -1)
   return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
 {
@@ -118,5 +118,4 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 
4, Integer k = 10, I
       codes[,i] = tmp_c + offset
     }
   }
-}
-
+}
\ No newline at end of file
diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
index 8daeb5bc7f..53529afb9a 100644
--- a/scripts/builtin/randomForest.dml
+++ b/scripts/builtin/randomForest.dml
@@ -26,16 +26,17 @@
 # and optionally subset of features (columns). During tree construction, split
 # candidates are additionally chosen on a sample of remaining features.
 #
-# .. code-block::
+# .. code-block:: text
 #
 #   For example, given a feature matrix with features [a,b,c,d]
 #   and the following two trees, M (the output) would look as follows:
 #
 #   (L1)          |a<7|                   |d<5|
-#                /     \                 /     \
+#                /     \\                 /     \\
 #   (L2)     |c<3|     |b<4|         |a<7|     P3:2
-#            /   \     /   \         /   \
+#            /   \\     /   \\         /  \\
 #   (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+#
 #   --> M :=
 #   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
 #    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
diff --git a/scripts/builtin/shapExplainer.dml 
b/scripts/builtin/shapExplainer.dml
index b78a5dbcef..39d365bf01 100644
--- a/scripts/builtin/shapExplainer.dml
+++ b/scripts/builtin/shapExplainer.dml
@@ -51,6 +51,7 @@
 # S              Matrix holding the shapley values along the cols, one row per 
instance.
 # expected       Double holding the average prediction of all instances.
 # -----------------------------------------------------------------------------
+
 s_shapExplainer = function(String model_function, list[unknown] model_args, 
Matrix[Double] x_instances,
     Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, 
Integer remove_non_var=0,
     Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer 
verbose = 0)
diff --git a/scripts/builtin/topk_cleaning.dml 
b/scripts/builtin/topk_cleaning.dml
index 6f946c7729..c998732092 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -19,8 +19,44 @@
 #
 #-------------------------------------------------------------
 
-# This function cleans top-K item (where K is given as input)for a given list 
of users.
+# This function cleans top-K item (where K is given as input) for a given list 
of users.
 # metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, 
metaData[3] stores FD mask
+#
+# INPUT:
+# 
------------------------------------------------------------------------------
+# dataTrain           Training set
+# dataTest            Test set ignored when cv is set to True
+# metaData            3×n frame with schema, categorical mask, and FD mask for 
dataTrain
+# primitives          Library of primitive cleaning operators
+# parameters          Hyperparameter search space that matches the primitives
+# refSol              Reference solution
+# evaluationFunc      Name of a SystemDS DML function that scores a pipeline
+# evalFunHp           Hyperparameter matrix for the above evaluation function
+# topK                Number of best pipelines to return
+# resource_val        Maximum resource R for the Bandit search
+# max_iter            Maximum iterations while enumerating logical pipelines
+# lq                  Lower quantile used by utils::doErrorSample when 
triggered
+# uq                  Upper quantile used by utils::doErrorSample when 
triggered
+# sample              Fraction of rows to subsample from dataTrain
+# expectedIncrease    Minimum improvement over dirtyScore that a candidate 
must deliver
+# seed                Seed number
+# cv                  TRUE means k-fold CV, FALSE means hold-out split
+# cvk                 Number of folds if cv = TRUE
+# isLastLabel         TRUE if the last column is the label
+# rowCount            Row-count threshold above which doErrorSample may 
replace uniform sampling
+# correctTypos        Run spelling correction in the string preprocessing step
+# enablePruning       Enable pruning inside the Bandit phase
+# 
------------------------------------------------------------------------------
+#
+# OUTPUT:
+#-------------------------------------------------------------------------------
+# topKPipelines       K cleaned-data pipelines
+# topKHyperParams     Hyperparameter matrix with rows aligning with 
topKPipelines
+# topKScores          Evaluation scores with rows aligning with topKPipelines
+# dirtyScore          Baseline score on the unclean data
+# evalFunHp           Updated evaluation function hyperparameters
+# applyFunc           Frame of “apply” functions for deploying each of the 
top-K pipelines
+#-------------------------------------------------------------------------------
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
 source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
diff --git a/src/main/python/docs/README.md b/src/main/python/docs/README.md
index 61bdd24a3e..e5bc5c5958 100644
--- a/src/main/python/docs/README.md
+++ b/src/main/python/docs/README.md
@@ -39,4 +39,4 @@ and then run `make html`:
 make html
 ```
 
-The docs will then be created at: `/src/main/python/build`in HTML will be 
placed in the `./_build` directory.
+The docs will then be created at: `/src/main/python/docs/build/html/`.
\ No newline at end of file
diff --git a/src/main/python/docs/requires-docs.txt 
b/src/main/python/docs/requires-docs.txt
index 9305d9320f..1022b65240 100644
--- a/src/main/python/docs/requires-docs.txt
+++ b/src/main/python/docs/requires-docs.txt
@@ -24,4 +24,5 @@ sphinx_rtd_theme
 numpy
 py4j
 scipy
-requests
\ No newline at end of file
+requests
+pandas
\ No newline at end of file
diff --git a/src/main/python/generator/dml_parser.py 
b/src/main/python/generator/dml_parser.py
index 2abffb021f..8e835e96a1 100644
--- a/src/main/python/generator/dml_parser.py
+++ b/src/main/python/generator/dml_parser.py
@@ -28,7 +28,7 @@ import re
 class FunctionParser(object):
     header_input_pattern = r"^[ \t\n]*[#]+[ \t\n]*input[ 
\t\n\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\n\-]*[\s#\-]*$"
     header_output_pattern = r"[\s#\-]*[#]+[ \t]*(return|output)[ 
\t\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\-]*[\s#\-]*$"
-    function_pattern = r"^[ms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*"
+    function_pattern = r"^[fms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*"
     # parameter_pattern = 
r"^m_[\w]+[\s]+=[\s]+function[\s]*\([\s]*(?=return)[\s]*\)[\s]*return[\s]*\([\s]*([\w\[\]\s,\d=.\-_]*)[\s]*\)[\s]*"
     header_parameter_pattern = r"[\s#\-]*[#]+[ 
\t]*([\w|-]+)[\s]+([\w]+)[\s]+([\w,\d.\"\-]+)[\s]+([\w|\W]+)"
     divider_pattern = r"[\s#\-]*"
@@ -57,15 +57,13 @@ class FunctionParser(object):
         """
         file_name = os.path.basename(path)
         function_name, extension = os.path.splitext(file_name)
-        # try:
-        function_definition = self.find_function_definition(path)
-        # pattern = re.compile(
-        #     self.__class__.parameter_pattern, flags=re.I | re.M)
-        # match = pattern.match(function_definition)
-
-        # if match:
+        try:
+            function_definition = self.find_function_definition(path)
+        except AttributeError:
+            print(f"[INFO] Skipping '{function_name}': does not match function 
name pattern. It is likely an internal function.")
+            return
 
-        func_split = function_definition.split("function")[1].split("return")
+        func_split = function_definition.split("function", 
1)[1].split("return")
        
         param_str = self.extract_param_str(func_split[0])
         retval_str = None
diff --git a/src/main/python/systemds/operator/algorithm/__init__.py 
b/src/main/python/systemds/operator/algorithm/__init__.py
index bd611ee6cc..c198a7138e 100644
--- a/src/main/python/systemds/operator/algorithm/__init__.py
+++ b/src/main/python/systemds/operator/algorithm/__init__.py
@@ -31,6 +31,7 @@ from .builtin.alsDS import alsDS
 from .builtin.alsPredict import alsPredict 
 from .builtin.alsTopkPredict import alsTopkPredict 
 from .builtin.ampute import ampute 
+from .builtin.apply_pipeline import apply_pipeline 
 from .builtin.arima import arima 
 from .builtin.auc import auc 
 from .builtin.autoencoder_2layer import autoencoder_2layer 
@@ -38,7 +39,10 @@ from .builtin.bandit import bandit
 from .builtin.bivar import bivar 
 from .builtin.components import components 
 from .builtin.confusionMatrix import confusionMatrix 
+from .builtin.cooccurrenceMatrix import cooccurrenceMatrix 
 from .builtin.cor import cor 
+from .builtin.correctTypos import correctTypos 
+from .builtin.correctTyposApply import correctTyposApply 
 from .builtin.cov import cov 
 from .builtin.cox import cox 
 from .builtin.cspline import cspline 
@@ -50,15 +54,22 @@ from .builtin.dbscanApply import dbscanApply
 from .builtin.decisionTree import decisionTree 
 from .builtin.decisionTreePredict import decisionTreePredict 
 from .builtin.deepWalk import deepWalk 
+from .builtin.denialConstraints import denialConstraints 
 from .builtin.differenceStatistics import differenceStatistics 
 from .builtin.discoverFD import discoverFD 
 from .builtin.dist import dist 
+from .builtin.dmv import dmv 
+from .builtin.ema import ema 
 from .builtin.executePipeline import executePipeline 
 from .builtin.f1Score import f1Score 
 from .builtin.fdr import fdr 
 from .builtin.ffPredict import ffPredict 
 from .builtin.ffTrain import ffTrain 
+from .builtin.fit_pipeline import fit_pipeline 
+from .builtin.fixInvalidLengths import fixInvalidLengths 
+from .builtin.fixInvalidLengthsApply import fixInvalidLengthsApply 
 from .builtin.flattenQuantile import flattenQuantile 
+from .builtin.frameSort import frameSort 
 from .builtin.frequencyEncode import frequencyEncode 
 from .builtin.frequencyEncodeApply import frequencyEncodeApply 
 from .builtin.garch import garch 
@@ -66,6 +77,7 @@ from .builtin.gaussianClassifier import gaussianClassifier
 from .builtin.getAccuracy import getAccuracy 
 from .builtin.glm import glm 
 from .builtin.glmPredict import glmPredict 
+from .builtin.glove import glove 
 from .builtin.gmm import gmm 
 from .builtin.gmmPredict import gmmPredict 
 from .builtin.gnmf import gnmf 
@@ -97,6 +109,7 @@ from .builtin.img_translate_linearized import 
img_translate_linearized
 from .builtin.impurityMeasures import impurityMeasures 
 from .builtin.imputeByFD import imputeByFD 
 from .builtin.imputeByFDApply import imputeByFDApply 
+from .builtin.imputeByKNN import imputeByKNN 
 from .builtin.imputeByMean import imputeByMean 
 from .builtin.imputeByMeanApply import imputeByMeanApply 
 from .builtin.imputeByMedian import imputeByMedian 
@@ -126,6 +139,7 @@ from .builtin.mae import mae
 from .builtin.mape import mape 
 from .builtin.matrixProfile import matrixProfile 
 from .builtin.mcc import mcc 
+from .builtin.mdedup import mdedup 
 from .builtin.mice import mice 
 from .builtin.miceApply import miceApply 
 from .builtin.mse import mse 
@@ -153,6 +167,7 @@ from .builtin.pcaTransform import pcaTransform
 from .builtin.pnmf import pnmf 
 from .builtin.ppca import ppca 
 from .builtin.psnr import psnr 
+from .builtin.quantizeByCluster import quantizeByCluster 
 from .builtin.raGroupby import raGroupby 
 from .builtin.raJoin import raJoin 
 from .builtin.raSelection import raSelection 
@@ -165,6 +180,7 @@ from .builtin.scaleMinMax import scaleMinMax
 from .builtin.selectByVarThresh import selectByVarThresh 
 from .builtin.ses import ses 
 from .builtin.setdiff import setdiff 
+from .builtin.shapExplainer import shapExplainer 
 from .builtin.sherlock import sherlock 
 from .builtin.sherlockPredict import sherlockPredict 
 from .builtin.shortestPath import shortestPath 
@@ -189,10 +205,12 @@ from .builtin.symmetricDifference import 
symmetricDifference
 from .builtin.tSNE import tSNE 
 from .builtin.toOneHot import toOneHot 
 from .builtin.tomeklink import tomeklink 
+from .builtin.topk_cleaning import topk_cleaning 
 from .builtin.underSampling import underSampling 
 from .builtin.union import union 
 from .builtin.univar import univar 
 from .builtin.vectorToCsv import vectorToCsv 
+from .builtin.wer import wer 
 from .builtin.winsorize import winsorize 
 from .builtin.winsorizeApply import winsorizeApply 
 from .builtin.xdummy1 import xdummy1 
@@ -211,6 +229,7 @@ __all__ = ['WoE',
  'alsPredict',
  'alsTopkPredict',
  'ampute',
+ 'apply_pipeline',
  'arima',
  'auc',
  'autoencoder_2layer',
@@ -218,7 +237,10 @@ __all__ = ['WoE',
  'bivar',
  'components',
  'confusionMatrix',
+ 'cooccurrenceMatrix',
  'cor',
+ 'correctTypos',
+ 'correctTyposApply',
  'cov',
  'cox',
  'cspline',
@@ -230,15 +252,22 @@ __all__ = ['WoE',
  'decisionTree',
  'decisionTreePredict',
  'deepWalk',
+ 'denialConstraints',
  'differenceStatistics',
  'discoverFD',
  'dist',
+ 'dmv',
+ 'ema',
  'executePipeline',
  'f1Score',
  'fdr',
  'ffPredict',
  'ffTrain',
+ 'fit_pipeline',
+ 'fixInvalidLengths',
+ 'fixInvalidLengthsApply',
  'flattenQuantile',
+ 'frameSort',
  'frequencyEncode',
  'frequencyEncodeApply',
  'garch',
@@ -246,6 +275,7 @@ __all__ = ['WoE',
  'getAccuracy',
  'glm',
  'glmPredict',
+ 'glove',
  'gmm',
  'gmmPredict',
  'gnmf',
@@ -277,6 +307,7 @@ __all__ = ['WoE',
  'impurityMeasures',
  'imputeByFD',
  'imputeByFDApply',
+ 'imputeByKNN',
  'imputeByMean',
  'imputeByMeanApply',
  'imputeByMedian',
@@ -306,6 +337,7 @@ __all__ = ['WoE',
  'mape',
  'matrixProfile',
  'mcc',
+ 'mdedup',
  'mice',
  'miceApply',
  'mse',
@@ -333,6 +365,7 @@ __all__ = ['WoE',
  'pnmf',
  'ppca',
  'psnr',
+ 'quantizeByCluster',
  'raGroupby',
  'raJoin',
  'raSelection',
@@ -345,6 +378,7 @@ __all__ = ['WoE',
  'selectByVarThresh',
  'ses',
  'setdiff',
+ 'shapExplainer',
  'sherlock',
  'sherlockPredict',
  'shortestPath',
@@ -369,10 +403,12 @@ __all__ = ['WoE',
  'tSNE',
  'toOneHot',
  'tomeklink',
+ 'topk_cleaning',
  'underSampling',
  'union',
  'univar',
  'vectorToCsv',
+ 'wer',
  'winsorize',
  'winsorizeApply',
  'xdummy1',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/ampute.py 
b/src/main/python/systemds/operator/algorithm/builtin/ampute.py
index d323000710..fb3a82a380 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/ampute.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/ampute.py
@@ -33,6 +33,16 @@ def ampute(X: Matrix,
     """
      This function injects missing values into a multivariate a given dataset, 
similarly to the ampute() method in R's MICE package.
     
+    
+    
+    :param X: a multivariate numeric dataset [shape: n-by-m]
+    :param prop: a number in the (0, 1] range specifying the proportion of 
amputed rows across the entire dataset
+    :param patterns: a pattern matrix of 0's and 1's [shape: k-by-m] where 
each row corresponds to a pattern. 0 indicates that a variable should have 
missing values and 1 indicating that a variable should remain complete
+    :param freq: a vector [length: k] containing the relative frequency with 
which each pattern in the patterns matrix should occur
+    :param mech: a string [either "MAR", "MNAR", or "MCAR"] specifying the 
missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a 
non-default weight matrix is specified
+    :param weights: a weight matrix [shape: k-by-m], containing weights that 
will be used to calculate the weighted sum scores. Will be overridden if mech 
== "MCAR"
+    :param seed: a manually defined seed for reproducible RNG
+    :return: amputed output dataset
     """
 
     params_dict = {'X': X}
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py 
b/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py
index be1100b412..63ffc3f66b 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py 
b/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py
index 81c549b598..66a01780b0 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py
@@ -35,7 +35,7 @@ def confusionMatrix(P: Matrix,
      and actual labels. We return both the counts and relative frequency
      (normalized by sum of true labels)
     
-     .. code-block::
+     .. code-block:: text
     
                        True Labels
                          1    2
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py 
b/src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py
similarity index 56%
copy from src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py
copy to 
src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py
index 81c549b598..6df77d3e7d 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 
 # Autogenerated By   : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/confusionMatrix.dml
+# Autogenerated From : scripts/builtin/cooccurrenceMatrix.dml
 
 from typing import Dict, Iterable
 
@@ -28,36 +28,29 @@ from systemds.operator import OperationNode, Matrix, Frame, 
List, MultiReturn, S
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def confusionMatrix(P: Matrix,
-                    Y: Matrix):
+def cooccurrenceMatrix(input: Frame,
+                       maxTokens: int,
+                       windowSize: int,
+                       distanceWeighting: bool,
+                       symmetric: bool):
     """
-     Computes the confusion matrix for input vectors of predictions
-     and actual labels. We return both the counts and relative frequency
-     (normalized by sum of true labels)
+     Cleans and processes text data by removing punctuation, converting it to 
lowercase, and reformatting.
+     Adds an index column to the result. The implementation is based on
+     https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
     
-     .. code-block::
     
-                       True Labels
-                         1    2
-                     1   TP | FP
-       Predictions      ----+----
-                     2   FN | TN
     
-    
-    
-    :param P: vector of predictions (1-based, recoded)
-    :param Y: vector of actual labels (1-based, recoded)
-    :return: the confusion matrix as absolute counts
-    :return: the confusion matrix as relative frequencies
+    :param input: (Frame[Unknown]): 1D input data frame containing text data.
+    :return: (Frame[Unknown]): Processed text data with an index column.
     """
 
-    params_dict = {'P': P, 'Y': Y}
+    params_dict = {'input': input, 'maxTokens': maxTokens, 'windowSize': 
windowSize, 'distanceWeighting': distanceWeighting, 'symmetric': symmetric}
     
-    vX_0 = Matrix(P.sds_context, '')
-    vX_1 = Matrix(P.sds_context, '')
+    vX_0 = Matrix(input.sds_context, '')
+    vX_1 = Frame(input.sds_context, '')
     output_nodes = [vX_0, vX_1, ]
 
-    op = MultiReturn(P.sds_context, 'confusionMatrix', output_nodes, 
named_input_nodes=params_dict)
+    op = MultiReturn(input.sds_context, 'cooccurrenceMatrix', output_nodes, 
named_input_nodes=params_dict)
 
     vX_0._unnamed_input_nodes = [op]
     vX_1._unnamed_input_nodes = [op]
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py 
b/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py
index 321a1949f5..64354a9bc3 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py 
b/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py
index 0a2c61a6f4..5da8769509 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py 
b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
index a1a751d0aa..3fe565b8c7 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
@@ -44,9 +44,9 @@ def decisionTree(X: Matrix,
        and the following trees, M would look as follows:
     
        (L1)               |d<5|
-                         /     \
+                         /     \\
        (L2)           P1:2    |a<7|
-                              /   \
+                              /   \\
        (L3)                 P2:2 P3:1
     
        --> M :=
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py 
b/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py
index 347502b848..5cdec21296 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py 
b/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py
index dfe2218a42..b6597bb6e4 100644
--- 
a/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py
+++ 
b/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py
@@ -35,6 +35,11 @@ def differenceStatistics(X: Matrix,
      they are different. This can be used for instance in comparison of lossy
      compression techniques, that reduce the fidelity of the data. 
     
+    
+    
+    :param X: First Matrix to compare
+    :param Y: Second Matrix to compare
+    :return: Difference statistics
     """
 
     params_dict = {'X': X, 'Y': Y}
diff --git a/src/main/python/systemds/operator/algorithm/builtin/dmv.py 
b/src/main/python/systemds/operator/algorithm/builtin/dmv.py
index deaf3ea8a6..2955e505e1 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/dmv.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/dmv.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git a/src/main/python/systemds/operator/algorithm/builtin/ema.py 
b/src/main/python/systemds/operator/algorithm/builtin/ema.py
index 4e0ccca6bb..90f9a852d7 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/ema.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/ema.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py 
b/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py
index 1fffb46f10..66750fc071 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py
@@ -28,7 +28,18 @@ from systemds.operator import OperationNode, Matrix, Frame, 
List, MultiReturn, S
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def executePipeline(X: Matrix):
+def executePipeline(pipeline: Frame,
+                    Xtrain: Matrix,
+                    Ytrain: Matrix,
+                    Xtest: Matrix,
+                    Ytest: Matrix,
+                    metaList: List,
+                    hyperParameters: Matrix,
+                    flagsCount: int,
+                    verbose: bool,
+                    startInd: int,
+                    endInd: int,
+                    **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
      This function execute pipeline.
     
@@ -56,17 +67,30 @@ def executePipeline(X: Matrix):
     :return: ---
     """
 
-    params_dict = {'X': X}
+    params_dict = {'pipeline': pipeline, 'Xtrain': Xtrain, 'Ytrain': Ytrain, 
'Xtest': Xtest, 'Ytest': Ytest, 'metaList': metaList, 'hyperParameters': 
hyperParameters, 'flagsCount': flagsCount, 'verbose': verbose, 'startInd': 
startInd, 'endInd': endInd}
+    params_dict.update(kwargs)
     
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, ]
+    vX_0 = Matrix(pipeline.sds_context, '')
+    vX_1 = Matrix(pipeline.sds_context, '')
+    vX_2 = Matrix(pipeline.sds_context, '')
+    vX_3 = Matrix(pipeline.sds_context, '')
+    vX_4 = Scalar(pipeline.sds_context, '')
+    vX_5 = Matrix(pipeline.sds_context, '')
+    vX_6 = Matrix(pipeline.sds_context, '')
+    vX_7 = Scalar(pipeline.sds_context, '')
+    vX_8 = List(pipeline.sds_context, '')
+    output_nodes = [vX_0, vX_1, vX_2, vX_3, vX_4, vX_5, vX_6, vX_7, vX_8, ]
 
-    op = MultiReturn(X.sds_context, 'executePipeline', output_nodes, 
named_input_nodes=params_dict)
+    op = MultiReturn(pipeline.sds_context, 'executePipeline', output_nodes, 
named_input_nodes=params_dict)
 
     vX_0._unnamed_input_nodes = [op]
     vX_1._unnamed_input_nodes = [op]
     vX_2._unnamed_input_nodes = [op]
+    vX_3._unnamed_input_nodes = [op]
+    vX_4._unnamed_input_nodes = [op]
+    vX_5._unnamed_input_nodes = [op]
+    vX_6._unnamed_input_nodes = [op]
+    vX_7._unnamed_input_nodes = [op]
+    vX_8._unnamed_input_nodes = [op]
 
     return op
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py 
b/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py
index 5de40c745f..48363035d8 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py 
b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py
index b635f31b29..cc0e83a51e 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py 
b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py
index cc8fe68aac..ed2572368d 100644
--- 
a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py
+++ 
b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git a/src/main/python/systemds/operator/algorithm/builtin/frameSort.py 
b/src/main/python/systemds/operator/algorithm/builtin/frameSort.py
index 0bfc7f3afe..2575baefe4 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/frameSort.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/frameSort.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git a/src/main/python/systemds/operator/algorithm/builtin/glove.py 
b/src/main/python/systemds/operator/algorithm/builtin/glove.py
new file mode 100644
index 0000000000..cbf9c421c4
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/glove.py
@@ -0,0 +1,67 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/glove.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def glove(input: Frame,
+          seed: int,
+          vector_size: int,
+          alpha: float,
+          eta: float,
+          x_max: float,
+          tol: float,
+          iterations: int,
+          print_loss_it: int,
+          maxTokens: int,
+          windowSize: int,
+          distanceWeighting: bool,
+          symmetric: bool):
+    """
+    
+    
+    
+    :param input: 1D input corpus in CSV format.
+    :param seed: Random seed for reproducibility.
+    :param vector_size: Dimensionality of word vectors, V.
+    :param eta: Learning rate for optimization, recommended value: 0.05.
+    :param alpha: Weighting function parameter, recommended value: 0.75.
+    :param x_max: Maximum co-occurrence value as per the GloVe paper: 100.
+    :param tol: Tolerance value to avoid overfitting, recommended value: 1e-4.
+    :param iterations: Total number of training iterations.
+    :param print_loss_it: Interval (in iterations) for printing the loss.
+    :param maxTokens: Maximum number of tokens per text entry.
+    :param windowSize: Context window size.
+    :param distanceWeighting: Whether to apply distance-based weighting.
+    :param symmetric: Determines if the matrix is symmetric (TRUE) or 
asymmetric (FALSE).
+    :return: The word indices and their word vectors, of shape (N, V). Each 
represented as a vector, of shape (1,V)
+    """
+
+    params_dict = {'input': input, 'seed': seed, 'vector_size': vector_size, 
'alpha': alpha, 'eta': eta, 'x_max': x_max, 'tol': tol, 'iterations': 
iterations, 'print_loss_it': print_loss_it, 'maxTokens': maxTokens, 
'windowSize': windowSize, 'distanceWeighting': distanceWeighting, 'symmetric': 
symmetric}
+    return Matrix(input.sds_context,
+        'glove',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py 
b/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py
index fcc096180b..f04aa09851 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py
@@ -25,13 +25,30 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
 def imputeByKNN(X: Matrix,
                 **kwargs: Dict[str, VALID_INPUT_TYPES]):
+    """
+     Imputes missing values, indicated by NaNs, using KNN-based methods
+     (k-nearest neighbors by euclidean distance). In order to avoid NaNs in
+     distance computation and meaningful nearest neighbor search, we initialize
+     the missing values by column means. Currently, only the column with the 
most
+     missing values is actually imputed.
     
+    
+    
+    :param X: Matrix with missing values, which are represented as NaNs
+    :param method: Method used for imputing missing values with different 
performance and accuracy tradeoffs:\n
+        - 'dist' (default): Compute all-pairs distances and impute the missing 
values by closest. O(N^2 * #features)
+        - 'dist_missing': Compute distances between data and records with 
missing values. O(N*M * #features), assuming that the number of records with MV 
is M<<N.
+        - 'dist_sample': Compute distances between sample of data and records 
with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal 
imputation.
+    :param seed: Root seed value for random/sample calls for deterministic 
behavior. -1 for true randomization
+    :param sample_frac: Sample fraction for 'dist_sample' (value between 0 and 
1)
+    :return: Imputed dataset
+    """
+
     params_dict = {'X': X}
     params_dict.update(kwargs)
     return Matrix(X.sds_context,
diff --git a/src/main/python/systemds/operator/algorithm/builtin/mdedup.py 
b/src/main/python/systemds/operator/algorithm/builtin/mdedup.py
index 85d93d5c2c..cbcc15d43b 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/mdedup.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/mdedup.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/quantizeByCluster.py 
b/src/main/python/systemds/operator/algorithm/builtin/quantizeByCluster.py
new file mode 100644
index 0000000000..5afb96412b
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/quantizeByCluster.py
@@ -0,0 +1,83 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/quantizeByCluster.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def quantizeByCluster(X: Matrix,
+                      **kwargs: Dict[str, VALID_INPUT_TYPES]):
+    """
+     The quantizeByCluster-function implements product quantization. 
Initially, it
+     divides the original vector space into M subspaces. The resulting lower 
dimensional
+     subvectors are then quantized. If the column count is not divisible by 
the number of
+     subspaces M, the data is padded with zeros. Optimal space decomposition 
can be
+     computed, when the data follows a Gaussian distribution. The function 
uses kmeans for
+     quantizing and svd to compute the space decomposition.
+    
+    
+    
+    :param X: The input matrix to perform product quantization on
+    :param M: Number of subspaces
+    :param k: Number of vectors in the subcodebooks
+    :param runs: Number of runs (with different initial centroids)
+    :param max_iter: Maximum number of iterations per run
+    :param eps: Tolerance (epsilon) for WCSS change ratio
+    :param avg_sample_size_per_centroid: Average number of records per 
centroid in data samples
+    :param separate: Cluster subspaces separately. If value is set to true,
+        kmeans is run M times, once for each subspace. Otherwise
+        kmeans is run only once.
+    :param space_decomp: Decompose the vector space by multiplying the input
+        matrix X with an orthogonal matrix R. Assumes the data
+        follows a parametric Gaussian distribution.
+        Time complexity in O(nrow(X)^2 * min(nrow(X), ncol(X))).
+    :param seed: The seed used for initial sampling. If set to -1 random
+        seeds are selected.
+    :return: The matrix containing the centroids. If clustered separately, the 
ith
+        subcodebook is the ith chunk of size k. The codebook matrix has the 
dimensions
+        [k*M x ncol(X)/M].
+    :return: The mapping of vectors to centroids. Each vector of the input 
matrix X is mapped
+        onto a vector of codes. The entries in the codes matrix are the 
indices of
+        the vectors in the codebook. The codes matrix has the dimensions 
[nrow(X) x M].
+    :return: The orthogonal matrix R which is applied to the input matrix X 
before performing
+        the product quantization. Only relevant when space_decomp = TRUE.
+    """
+
+    params_dict = {'X': X}
+    params_dict.update(kwargs)
+    
+    vX_0 = Matrix(X.sds_context, '')
+    vX_1 = Matrix(X.sds_context, '')
+    vX_2 = Matrix(X.sds_context, '')
+    output_nodes = [vX_0, vX_1, vX_2, ]
+
+    op = MultiReturn(X.sds_context, 'quantizeByCluster', output_nodes, 
named_input_nodes=params_dict)
+
+    vX_0._unnamed_input_nodes = [op]
+    vX_1._unnamed_input_nodes = [op]
+    vX_2._unnamed_input_nodes = [op]
+
+    return op
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py 
b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
index 88b1c9145b..177ebd3fd3 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
@@ -40,16 +40,17 @@ def randomForest(X: Matrix,
      and optionally subset of features (columns). During tree construction, 
split
      candidates are additionally chosen on a sample of remaining features.
     
-     .. code-block::
+     .. code-block:: text
     
        For example, given a feature matrix with features [a,b,c,d]
        and the following two trees, M (the output) would look as follows:
     
        (L1)          |a<7|                   |d<5|
-                    /     \                 /     \
+                    /     \\                 /     \\
        (L2)     |c<3|     |b<4|         |a<7|     P3:2
-                /   \     /   \         /   \
+                /   \\     /   \\         /   \\
        (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+    
        --> M :=
        [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
         [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py 
b/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py
new file mode 100644
index 0000000000..42a0afb6e6
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py
@@ -0,0 +1,78 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/shapExplainer.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def shapExplainer(model_function: str,
+                  model_args: List,
+                  x_instances: Matrix,
+                  X_bg: Matrix,
+                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
+    """
+     Computes shapley values for multiple instances in parallel using 
antithetic permutation sampling.
+     The resulting matrix phis holds the shapley values for each feature in 
the column given by the index of the feature in the sample.
+    
+     This method first creates two large matrices for masks and masked 
background data for all permutations and
+     then runs in parallel on all instances in x.
+     While the prepared matrices can become very large (2 * #features * 
#permutations * #n_samples * #features),
+     the preparation of a row for the model call breaks down to a single 
element-wise multiplication of this mask with the row and
+     an addition to the masked background data, since masks can be reused for 
each instance.
+    
+    
+    
+    :param model_function: The function of the model to be evaluated as a 
String. This function has to take a matrix of samples
+        and return a vector of predictions.
+        It might be useful to wrap the model into a function that takes and 
returns the desired shapes and
+        use this wrapper here.
+    :param model_args: Arguments in order for the model, if desired. This will 
be prepended by the created instances-matrix.
+    :param x_instances: Multiple instances as rows for which to compute the 
shapley values.
+    :param X_bg: The background dataset from which to pull the random samples 
to perform Monte Carlo integration.
+    :param n_permutations: The number of permutations. Defaults to 10. 
Theoretically, 1 should already be enough for models with up
+        to second order interaction effects.
+    :param n_samples: Number of samples from X_bg used for marginalization.
+    :param remove_non_var: EXPERIMENTAL: If set, for every instance the 
variance of each feature is checked against this feature in the
+        background data. If it does not change, we do not run any model calls 
for it.
+    :param seed: A seed, in case the sampling has to be deterministic.
+    :param verbose: A boolean to enable logging of each step of the function.
+    :return: Matrix holding the shapley values along the cols, one row per 
instance.
+    :return: Double holding the average prediction of all instances.
+    """
+
+    params_dict = {'model_function': model_function, 'model_args': model_args, 
'x_instances': x_instances, 'X_bg': X_bg}
+    params_dict.update(kwargs)
+    
+    vX_0 = Matrix(model_function.sds_context, '')
+    vX_1 = Scalar(model_function.sds_context, '')
+    output_nodes = [vX_0, vX_1, ]
+
+    op = MultiReturn(model_function.sds_context, 'shapExplainer', 
output_nodes, named_input_nodes=params_dict)
+
+    vX_0._unnamed_input_nodes = [op]
+    vX_1._unnamed_input_nodes = [op]
+
+    return op
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py 
b/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py
index 16a20d20e0..270a6d7b16 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py
@@ -25,7 +25,6 @@
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
@@ -36,8 +35,39 @@ def topk_cleaning(dataTrain: Frame,
                   evalFunHp: Matrix,
                   **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This function cleans top-K item (where K is given as input)for a given 
list of users.
+     This function cleans top-K item (where K is given as input) for a given 
list of users.
      metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores 
schema, metaData[3] stores FD mask
+    
+    
+    
+    :param dataTrain: Training set
+    :param dataTest: Test set ignored when cv is set to True
+    :param metaData: 3×n frame with schema, categorical mask, and FD mask for 
dataTrain
+    :param primitives: Library of primitive cleaning operators
+    :param parameters: Hyperparameter search space that matches the primitives
+    :param refSol: Reference solution
+    :param evaluationFunc: Name of a SystemDS DML function that scores a 
pipeline
+    :param evalFunHp: Hyperparameter matrix for the above evaluation function
+    :param topK: Number of best pipelines to return
+    :param resource_val: Maximum resource R for the Bandit search
+    :param max_iter: Maximum iterations while enumerating logical pipelines
+    :param lq: Lower quantile used by utils::doErrorSample when triggered
+    :param uq: Upper quantile used by utils::doErrorSample when triggered
+    :param sample: Fraction of rows to subsample from dataTrain
+    :param expectedIncrease: Minimum improvement over dirtyScore that a 
candidate must deliver
+    :param seed: Seed number
+    :param cv: TRUE means k-fold CV, FALSE means hold-out split
+    :param cvk: Number of folds if cv = TRUE
+    :param isLastLabel: TRUE if the last column is the label
+    :param rowCount: Row-count threshold above which doErrorSample may replace 
uniform sampling
+    :param correctTypos: Run spelling correction in the string preprocessing 
step
+    :param enablePruning: Enable pruning inside the Bandit phase
+    :return: K cleaned-data pipelines
+    :return: Hyperparameter matrix with rows aligning with topKPipelines
+    :return: Evaluation scores with rows aligning with topKPipelines
+    :return: Baseline score on the unclean data
+    :return: Updated evaluation function hyperparameters
+    :return: Frame of "apply" functions for deploying each of the top-K 
pipelines
     """
 
     params_dict = {'dataTrain': dataTrain, 'primitives': primitives, 
'parameters': parameters, 'evaluationFunc': evaluationFunc, 'evalFunHp': 
evalFunHp}
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py 
b/src/main/python/systemds/operator/algorithm/builtin/wer.py
similarity index 69%
copy from 
src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py
copy to src/main/python/systemds/operator/algorithm/builtin/wer.py
index cc8fe68aac..99d278461c 100644
--- 
a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/wer.py
@@ -20,33 +20,29 @@
 # -------------------------------------------------------------
 
 # Autogenerated By   : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/fixInvalidLengthsApply.dml
+# Autogenerated From : scripts/builtin/wer.dml
 
 from typing import Dict, Iterable
 
 from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
-from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def fixInvalidLengthsApply(X: Frame,
-                           mask: Matrix,
-                           qLow: Matrix,
-                           qUp: Matrix):
+def wer(R: Frame,
+        H: Frame):
     """
-     Fix invalid lengths
+     This built-in function computes the word error rate (WER)
+     defined as wer = (numSubst + numDel + numIns) / length(r)
     
     
     
-    :param X: ---
-    :param mask: ---
-    :param ql: ---
-    :param qu: ---
-    :return: ---
-    :return: ---
+    :param R: Input frame of reference strings, shape: [N x 1]
+    :param H: Input frame of hypothesis strings, shape: [N x 1]
+    :return: Output matrix of word error rate per pair of strings,
+        shape: [N x 1], where W[i,1] = wer(R[i,1], H[i,1])
     """
 
-    params_dict = {'X': X, 'mask': mask, 'qLow': qLow, 'qUp': qUp}
-    return Matrix(X.sds_context,
-        'fixInvalidLengthsApply',
+    params_dict = {'R': R, 'H': H}
+    return Matrix(R.sds_context,
+        'wer',
         named_input_nodes=params_dict)

Reply via email to