This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 2e19d94208 [SYSTEMDS-3182] Builtin ampute() for introducing missing
values in data
2e19d94208 is described below
commit 2e19d94208a5bc7e4bfab4e2459f788363336254
Author: mmoesm <[email protected]>
AuthorDate: Tue Apr 15 17:26:03 2025 +0200
[SYSTEMDS-3182] Builtin ampute() for introducing missing values in data
Closes #2250.
---
scripts/builtin/ampute.dml | 311 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../functions/builtin/part1/BuiltinAmputeTest.java | 150 ++++++++++
src/test/scripts/functions/builtin/ampute.R | 63 +++++
src/test/scripts/functions/builtin/ampute.dml | 55 ++++
5 files changed, 580 insertions(+)
diff --git a/scripts/builtin/ampute.dml b/scripts/builtin/ampute.dml
new file mode 100644
index 0000000000..7d96136b7c
--- /dev/null
+++ b/scripts/builtin/ampute.dml
@@ -0,0 +1,311 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This function injects missing values into a multivariate a given dataset,
similarly to the ampute() method in R's MICE package.
+#
+# INPUT:
+#
-------------------------------------------------------------------------------------
+# X a multivariate numeric dataset [shape: n-by-m]
+# prop a number in the (0, 1] range specifying the proportion of
amputed rows across the entire dataset
+# patterns a pattern matrix of 0's and 1's [shape: k-by-m] where each row
corresponds to a pattern. 0 indicates that a variable should have missing
values and 1 indicating that a variable should remain complete
+# freq a vector [length: k] containing the relative frequency with
which each pattern in the patterns matrix should occur
+# mech a string [either "MAR", "MNAR", or "MCAR"] specifying the
missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a
non-default weight matrix is specified
+# weights a weight matrix [shape: k-by-m], containing weights that will
be used to calculate the weighted sum scores. Will be overridden if mech ==
"MCAR"
+# seed a manually defined seed for reproducible RNG
+
+#
-------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
-------------------------------------------------------------------------------------
+# amputedX amputed output dataset
+#
-------------------------------------------------------------------------------------
+
+m_ampute = function(Matrix[Double] X,
+ Double prop = 0.5,
+ Matrix[Double] patterns = matrix(0, 0, 0),
+ Matrix[Double] freq = matrix(0, 0, 0),
+ String mech = "MAR",
+ Matrix[Double] weights = matrix(0, 0, 0),
+ Integer seed = -1) return(Matrix[Double] amputedX) {
+ # 1. Validate inputs, and set defaults for any empty freq, patterns, or
weights matrices:
+ [freq, patterns, weights] = u_validateInputs(X, prop, freq, patterns, mech,
weights) # FIX ME
+ # freq = nfreq
+ # patterns = npatterns
+ # weights = nweights
+
+ numSamples = nrow(X)
+ numFeatures = ncol(X)
+ numPatterns = nrow(patterns)
+ [groupAssignments, numPerGroup] = u_randomChoice(numSamples, freq, seed) #
Assign samples to groups based on freq vector.
+ amputedX = matrix(0, rows=numSamples, cols=numFeatures + 1) # Create array
to hold output.
+
+ parfor (patternNum in 1:numPatterns, check=0) {
+ groupSize = as.scalar(numPerGroup[patternNum])
+ if (groupSize == 0) {
+ print("ampute warning: Zero rows assigned to pattern " + patternNum + ".
Consider increasing input data size or pattern frequency?")
+ }
+ else {
+ # 2. Collect group examples and mapping to original indices:
+ [groupSamples, backMapping] = u_getGroupSamples(X, groupAssignments,
numSamples, groupSize, numFeatures, patternNum)
+
+ # 3. Get amputation probabilities:
+ sumScores = groupSamples %*% t(weights[patternNum])
+ probs = u_getProbs(sumScores, groupSize, prop)
+
+ # 4. Use probabilities to ampute pattern candidates:
+ random = rand(rows=groupSize, cols=1, min=0, max=1, pdf="uniform",
seed=seed)
+ amputeds = (random <= probs) * (1 - patterns[patternNum]) # Obtains
matrix with 1's at indices to ampute.
+ while (FALSE) {} # FIX ME
+ groupSamples = groupSamples + replace(target=amputeds, pattern=1,
replacement=NaN)
+
+ # 5. Update output matrix:
+ [start, end] = u_getBounds(numPerGroup, groupSize, patternNum)
+ amputedX[start:end, ] = cbind(groupSamples, backMapping)
+ }
+ }
+
+ # 6. Return amputed data in original order:
+ amputedX = order(target=amputedX, by=numFeatures + 1) # Sort by original
indices.
+ amputedX = amputedX[, 1:numFeatures] # Remove index column.
+}
+
+u_validateInputs = function(Matrix[Double] X, Double prop, Matrix[Double]
freq, Matrix[Double] patterns, String mech, Matrix[Double] weights)
+return (Matrix[Double] freq, Matrix[Double] patterns, Matrix[Double] weights) {
+
+ errors = list()
+ freqProvided = !u_isEmpty(freq) # FIX ME
+ patternsProvided = !u_isEmpty(patterns) # FIX ME
+ weightsProvided = !u_isEmpty(weights) # FIX ME
+
+ # About the input dataset:
+ if (max(is.na(X)) == 1) {
+ errors = append(errors, "Input dataset cannot contain any NaN values.")
+ }
+ if (ncol(X) < 2) {
+ errors = append(errors, "Input dataset must contain at least two columns.
Only contained " + ncol(X) + ". Missingness patterns require multiple variables
to be properly generated.")
+ }
+
+ # About mech:
+ if (mech != "MAR" & mech != "MCAR" & mech != "MNAR") {
+ errors = append(errors, "Invalid option provided for mech: " + mech + ".")
+ }
+ else if (weightsProvided & mech == "MCAR") {
+ print("ampute warning: User-provided weights will be ignored when
mechanism MCAR is chosen.")
+ }
+
+ # About prop:
+ if (!(0 < prop & prop <= 1)) {
+ errors = append(errors, "Value of prop must be within the range of (0, 1].
Was " + prop + ".")
+ }
+
+ # Set defaults for empty freq, patterns and weights matrices:
+ numFeatures = ncol(X)
+ [freq, patterns, weights] = u_handleDefaults(freq, patterns, weights, mech,
numFeatures)
+
+ # About freq:
+ if (nrow(freq) > 1 & ncol(freq) > 1) {
+ errors = append(errors, "freq provided as matrix with dimensions [" +
nrow(freq) + ', ' + ncol(freq) + "], but must be a vector.")
+ }
+ else if (ncol(freq) > 1) {
+ freq = t(freq) # Transposes row to column vector for convenience.
+ }
+ if (length(freq) != nrow(patterns)) {
+ errors = append(errors, "Length of freq must be equal to the number of
rows in the patterns matrix. freq has length "
+ + length(freq) + " while patterns contains " + nrow(patterns) + " rows.")
+ }
+ if (length(freq) != nrow(weights)) {
+ errors = append(errors, "Length of freq must be equal to the number of
rows in the weights matrix. freq has length "
+ + length(freq) + " while weights contains " + nrow(weights) + " rows.")
+ }
+ if (abs(sum(freq) - 1) > 1e-7) {
+ errors = append(errors, "Values in freq vector must approximately sum to
1. Sum was " + sum(freq) + ".")
+ }
+
+ # About patterns
+ if (ncol(X) != ncol(patterns)) {
+ errors = append(errors, "Input dataset must contain the same number of
columns as the patterns matrix. Dataset contains "
+ + ncol(X) + " columns while patterns contains " + ncol(patterns) + ".")
+ }
+ if (ncol(patterns) != ncol(weights)) {
+ errors = append(errors, "The patterns matrix must contain the same number
of columns as the weights matrix. The patterns matrix contains "
+ + ncol(patterns) + " columns while weights contains " + ncol(weights) +
".")
+ }
+ if (max(patterns != 0 & patterns != 1) > 0) {
+ errorPatterns = rowMaxs(patterns > 1 | patterns < 0)
+ errorPatterns = removeEmpty(target=seq(1, nrow(patterns)), margin="rows",
select=errorPatterns)
+ errorString = u_getErrorIndices(errorPatterns)
+ errors = append(errors, "The patterns matrix must contain only values of 0
or 1. The following rows in patterns break this rule: " + errorString + ".")
+ }
+ if (sum(rowMins(patterns)) > 0) {
+ errorPatterns = removeEmpty(target=seq(1, nrow(patterns)), margin="rows",
select=rowMins(patterns) == 1)
+ errorString = u_getErrorIndices(errorPatterns)
+ errors = append(errors, "Each row in the patterns matrix must contain at
least one value of 0. The following rows in patterns break this rule: " +
errorString + ".")
+ }
+
+ # About weights:
+ if (mech != "MCAR" & sum(rowMaxs(weights)) < nrow(weights)) {
+ errorWeights = removeEmpty(target=seq(1, nrow(weights)), margin="rows",
select=rowMaxs(weights) == 1)
+ errorString = u_getErrorIndices(errorWeights)
+ errors = append(errors, "Indicated weights of all 0's for some patterns
when mechanism isn't MCAR. The following rows in weights break this rule: " +
errorString + ".")
+ }
+ if (ncol(X) != ncol(weights)) {
+ errors = append(errors, "Input dataset must contain the same number of
columns as the weights matrix. Dataset contains "
+ + ncol(X) + " columns while weights contains " + ncol(weights) + ".")
+ }
+
+ # Collect errors, if any:
+ if (length(errors) > 0) {
+ errorStrings = ""
+ for (i in 1:length(errors)) {
+ errorStrings = errorStrings + "\nampute: " + as.scalar(errors[i])
+ }
+ stop(errorStrings)
+ }
+}
+
+u_handleDefaults = function(Matrix[Double] freq, Matrix[Double] patterns,
Matrix[Double] weights, String mech, Integer numFeatures)
+return (Matrix[Double] freq, Matrix[Double] patterns, Matrix[Double] weights) {
+ # Patterns: Default is a quadratic matrix wherein pattern i amputes feature
i.
+ empty = u_isEmpty(patterns)
+ if (empty) { # FIX ME
+ patterns = matrix(1, rows=numFeatures, cols=numFeatures) - diag(matrix(1,
rows=numFeatures, cols=1))
+ }
+
+ # Weights: Various defaults based on chosen missingness mechanism:
+ numPatterns = nrow(patterns)
+ empty = u_isEmpty(weights) # FIX ME
+ if (mech == "MCAR") {
+ weights = matrix(0, rows=numPatterns, cols=numFeatures) # MCAR: All 0's
(weights don't matter). Overrides any provided weights.
+ }
+ else if (empty) { # FIX ME
+ if (mech == "MAR") {
+ weights = patterns # MAR: Missing features weighted with 0.
+ }
+ else {
+ weights = 1 - patterns # MNAR case: Observed features weighted with 0.
+ }
+ }
+
+ # Frequencies: Uniform by default.
+ empty = u_isEmpty(freq) # FIX ME
+ if (empty) {
+ freq = matrix(1 / numPatterns, rows=numPatterns, cols=1)
+ }
+}
+
+u_getErrorIndices = function(Matrix[Double] errorPatterns) return (String
errorString) {
+ errorString = ""
+ for (i in 1:length(errorPatterns)) {
+ errorString = errorString + as.integer(as.scalar(errorPatterns[i]))
+ if (i < length(errorPatterns)) {
+ errorString = errorString + ", "
+ }
+ }
+}
+
+u_isEmpty = function(Matrix[Double] X) return (Boolean emptiness) {
+ emptiness = length(X) == 0
+}
+
+# Assigns numSamples to a number of catagories based on the frequencies
provided in freq.
+u_randomChoice = function(Integer numSamples, Matrix[Double] freq, Double seed
= -1)
+return (Matrix[Double] groupAssignments, Matrix[Double] groupCounts) {
+ numGroups = length(freq)
+ if (numGroups == 1) { # Assigns all samples to the same group.
+ groupCounts = matrix(numSamples, rows=1, cols=1)
+ groupAssignments = matrix(1, rows=numSamples, cols=1)
+ }
+ else { # Assigns based on cumulative probability thresholds:
+ cumSum = rbind(matrix(0, rows=1, cols=1), cumsum(freq)) # For, e.g., freq
== [0.1, 0.4, 0.5], we get cumSum = [0.0, 0.1, 0.5, 1.0].
+ random = rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform",
seed=seed)
+ groupCounts = matrix(0, rows=numGroups, cols=1)
+ groupAssignments = matrix(0, rows=numSamples, cols=1)
+
+ for (i in 1:numGroups) {
+ assigned = (random >= cumSum[i]) & (random < cumSum[i + 1])
+ while (FALSE) {} # FIX ME
+ groupCounts[i] = sum(assigned)
+ groupAssignments = groupAssignments + i * assigned
+ }
+ }
+}
+
+u_getGroupSamples = function(Matrix[Double] X, Matrix[Double]
groupAssignments, Integer numSamples, Integer groupSize, Integer numFeatures,
Integer patternNum)
+return (Matrix[Double] groupSamples, Matrix[Double] backMapping) {
+ mask = groupAssignments == patternNum
+ groupSamples = removeEmpty(target=X, margin="rows", select=mask)
+ backMapping = removeEmpty(target=seq(1, numSamples), margin="rows",
select=mask)
+}
+
+# Assigns amputation probabilities to each sample:
+u_getProbs = function(Matrix[Double] sumScores, Integer groupSize, Double prop)
+return(Matrix[Double] probs) {
+ if (length(unique(sumScores)) == 0) { # Checks if weights are all the same
value (including the zero-case), as is the case with, e.g., MCAR chosen.
+ probs = matrix(prop, rows=groupSize, cols=1)
+ }
+ else {
+ zScores = scale(X=sumScores)
+ rounded = round(prop * 100) / 100 # Rounds to two decimals for numeric
stability.
+ probs = u_binaryShiftSearch(zScores=zScores, prop=rounded)
+ }
+}
+
+# Performs a binary search for the optimum shift transformation to the
weighted sum scores in order to obtain the desired missingness proportion.
+u_binaryShiftSearch = function(Matrix[Double] zScores, Double prop)
+return (Matrix[Double] probsArray) {
+ shift = 0
+ counter = 0
+ probsArray = zScores
+ currentProb = NaN
+ lowerRange = -3
+ upperRange = 3
+ epsilon = 0.001
+ maxIter = 100
+
+ while (counter < maxIter & (is.na(currentProb) | abs(currentProb - prop) >=
epsilon)) {
+ counter += 1
+ shift = lowerRange + (upperRange - lowerRange) / 2
+ probsArray = u_sigmoid(zScores + shift) # Calculates Right-Sigmoid
probability (R implementation's default).
+ currentProb = mean(probsArray)
+ if (currentProb - prop > 0) {
+ upperRange = shift
+ }
+ else {
+ lowerRange = shift
+ }
+ }
+}
+
+u_sigmoid = function(Matrix[Double] X)
+return (Matrix[Double] sigmoided) {
+ sigmoided = 1 / (1 + exp(-X))
+}
+
+u_getBounds = function(Matrix[Double] numPerGroup, Integer groupSize, Integer
patternNum)
+return(Integer start, Integer end) {
+ if (patternNum == 1) {
+ start = 1
+ }
+ else {
+ start = sum(numPerGroup[1:(patternNum - 1), ]) + 1
+ }
+ end = start + groupSize - 1
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 4ff5654de0..08e06101b8 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -47,6 +47,7 @@ public enum Builtins {
ALS_DS("alsDS", true),
ALS_PREDICT("alsPredict", true),
ALS_TOPK_PREDICT("alsTopkPredict", true),
+ AMPUTE("ampute", true),
APPLY_PIPELINE("apply_pipeline", true),
APPLY_SCHEMA("applySchema", false),
ARIMA("arima", true),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinAmputeTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinAmputeTest.java
new file mode 100644
index 0000000000..a7491a8817
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinAmputeTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part1;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.apache.sysds.utils.Statistics;
+import org.junit.Assert;
+import org.junit.Test;
+import java.util.HashMap;
+
+public class BuiltinAmputeTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "builtinAmputeTest";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR +
BuiltinAmputeTest.class.getSimpleName() + "/";
+ private final static String OUTPUT_NAME = "R";
+
+ private final static String WINE_DATA = DATASET_DIR +
"wine/winequality-red-white.csv";
+ private final static String DIABETES_DATA = DATASET_DIR +
"diabetes/diabetes.csv";
+ private final static String MNIST_DATA = DATASET_DIR +
"MNIST/mnist_test.csv";
+ private final static double EPSILON = 0.05;
+ private final static double SMALL_SAMPLE_EPSILON = 0.1; // More leeway
given to smaller proportions of amputed rows.
+ private final static int SEED = 42;
+
+ @Override
+ public void setUp() {
+ for(int i = 1; i <= 4; i++) {
+ addTestConfiguration(TEST_NAME + i, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {OUTPUT_NAME}));
+ }
+ }
+
+ @Test
+ public void testAmputeWine_prop25() {
+ runAmpute(1, WINE_DATA, ExecType.CP, false, 0.25,
SMALL_SAMPLE_EPSILON);
+ }
+
+ @Test
+ public void testAmputeDiabetes_prop25() {
+ runAmpute(2, DIABETES_DATA, ExecType.CP, true, 0.25,
SMALL_SAMPLE_EPSILON);
+ }
+
+ @Test
+ public void testAmputeMNIST_prop25() {
+ runAmpute(3, MNIST_DATA, ExecType.CP, false, 0.25,
SMALL_SAMPLE_EPSILON);
+ }
+
+ @Test
+ public void testAmputeWine_prop50() {
+ runAmpute(1, WINE_DATA, ExecType.CP, false, 0.5, EPSILON);
+ }
+
+ @Test
+ public void testAmputeDiabetes_prop50() {
+ runAmpute(2, DIABETES_DATA, ExecType.CP, true, 0.5, EPSILON);
+ }
+
+ @Test
+ public void testAmputeMNIST_prop50() {
+ runAmpute(3, MNIST_DATA, ExecType.CP, false, 0.5, EPSILON);
+ }
+
+ @Test
+ public void testAmputeWine_prop75() {
+ runAmpute(1, WINE_DATA, ExecType.CP, false, 0.75, EPSILON);
+ }
+
+ @Test
+ public void testAmputeDiabetes_prop75() {
+ runAmpute(2, DIABETES_DATA, ExecType.CP, true, 0.75, EPSILON);
+ }
+
+ @Test
+ public void testAmputeMNIST_prop75() {
+ runAmpute(3, MNIST_DATA, ExecType.CP, false, 0.75, EPSILON);
+ }
+
+ @Test
+ public void testAmputeMNIST_singleRow() {
+ runSingleRowDMLAmpute(4, WINE_DATA, ExecType.CP, false, 0.5,
EPSILON);
+ }
+
+ // This function tests whether MICE ampute (R) and SystemDS ampute.dml
produce approximately the same proportion of amputed rows
+ // and pattern frequencies in their output under the same input
settings. This information is compiled into a single matrix by each script.
+ private void runAmpute(int test, String data, ExecType instType,
boolean header, double prop, double eps) {
+ Types.ExecMode platformOld = setExecMode(instType);
+
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME +
test));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+
+ fullDMLScriptName = HOME + "ampute.dml";
+ programArgs = new String[]{"-stats", "-args", data,
output(OUTPUT_NAME), String.valueOf(SEED), String.valueOf(header),
String.valueOf(prop), String.valueOf(false)};
+
+ fullRScriptName = HOME + "ampute.R";
+ String outPath = expectedDir() + OUTPUT_NAME;
+ rCmd = getRCmd( data, outPath, String.valueOf(SEED),
String.valueOf(header), String.valueOf(prop));
+
+ runRScript(true);
+ runTest(true, false, null, -1);
+
+ HashMap<CellIndex, Double> dmlfile =
readDMLMatrixFromOutputDir(OUTPUT_NAME);
+ HashMap<CellIndex, Double> rfile =
readRMatrixFromExpectedDir(OUTPUT_NAME);
+ TestUtils.compareMatrices(dmlfile, rfile, eps,
"Stat-DML", "Stat-R");
+
+ Assert.assertEquals(0,
Statistics.getNoOfExecutedSPInst());
+ } finally {
+ rtplatform = platformOld;
+ }
+ }
+
+ // This function simply tests implicitly whether an exception is thrown
when running DML ampute with a single input data row:
+ private void runSingleRowDMLAmpute(int test, String data, ExecType
instType, boolean header, double prop, double eps) {
+ Types.ExecMode platformOld = setExecMode(instType);
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME +
test));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+
+ fullDMLScriptName = HOME + "ampute.dml";
+ programArgs = new String[]{"-stats", "-args", data,
output(OUTPUT_NAME), String.valueOf(SEED), String.valueOf(header),
String.valueOf(prop), String.valueOf(true)};
+ runTest(true, false, null, -1);
+
+ //HashMap<CellIndex, Double> dmlfile =
readDMLMatrixFromOutputDir(OUTPUT_NAME);
+ Assert.assertEquals(0,
Statistics.getNoOfExecutedSPInst());
+ } finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/ampute.R
b/src/test/scripts/functions/builtin/ampute.R
new file mode 100644
index 0000000000..4d34663fc1
--- /dev/null
+++ b/src/test/scripts/functions/builtin/ampute.R
@@ -0,0 +1,63 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("mice")
+
+set.seed(args[3])
+prop <- as.numeric(args[5])
+
+col_names <- args[4] == "true"
+X <- as.matrix(read.csv(args[1]), col_names=col_names)
+
+# Create three patterns with different probabilities, each amputing a single
variable:
+numPatterns <- 3
+freq <- rep(1 / numPatterns, numPatterns)
+for (i in 1:numPatterns) {
+ freq[i] <- i / 6
+}
+patterns <- matrix(1, nrow=numPatterns, ncol=ncol(X))
+for (i in 1:numPatterns) {
+ patterns[i, i] <- 0
+}
+
+
+res <- ampute(X, freq=freq, patterns=patterns, prop=prop)$amp
+
+
+# Proportion of amputed rows:
+amputed_rows <- apply(res, 1, function(row) any(is.na(row))) # TRUE if row
has missing values
+proportion_amputed_rows <- mean(amputed_rows)
+num_amputed_rows <- sum(amputed_rows)
+
+# Pattern assigment proportions among amputed rows:
+groupProps <- colSums(is.na(res)) / num_amputed_rows
+
+# Print the result
+cat("Proportion of total rows amputed (%): ", proportion_amputed_rows, "\n")
+cat("Proportion of amputed rows by pattern (%): ", groupProps, "\n")
+
+# Collect results, and output:
+row_vector <- c(proportion_amputed_rows, groupProps)
+# cat(row_vector)
+writeMM(as(row_vector, "CsparseMatrix"), args[2])
diff --git a/src/test/scripts/functions/builtin/ampute.dml
b/src/test/scripts/functions/builtin/ampute.dml
new file mode 100644
index 0000000000..6a4018b84c
--- /dev/null
+++ b/src/test/scripts/functions/builtin/ampute.dml
@@ -0,0 +1,55 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+seed = $3
+header = $4 == "true"
+prop = $5
+oneRow = $6
+X = read($1, data_type="matrix", format="csv", header=header);
+if (oneRow == "true") {
+ X = X[1, ]
+}
+numRows = nrow(X)
+
+
+# Create three patterns with different probabilities, each amputing a single
variable:
+numPatterns = 3
+freq = matrix(1, rows=numPatterns, cols=1)
+for (i in 1:numPatterns) {
+ freq[i, ] = i / 6
+}
+patterns = matrix(1, rows=numPatterns, cols=ncol(X))
+for (i in 1:numPatterns) {
+ patterns[i, i] = 0
+}
+
+res = ampute(X=X, seed=seed, freq=freq, patterns=patterns, prop=prop)
+
+numNaNs = sum(rowMaxs(is.na(res)))
+propNaNs = as.matrix(numNaNs / numRows)
+print("Proportion of total rows amputed (%): " + toString(propNaNs))
+
+groupProps = colSums(is.na(res)) / numNaNs
+print("Proportion of amputed rows by pattern (%): " + toString(groupProps))
+
+output = rbind(propNaNs, t(groupProps))
+
+write(output, $2);
\ No newline at end of file