This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new b1cb505dd7 [SYSTEMDS-3681] Cleanup stepLM builtin function, remove
duplicate
b1cb505dd7 is described below
commit b1cb505dd7c9cf2f2703b143f3567eaaa8509ae7
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat May 4 19:41:25 2024 +0200
[SYSTEMDS-3681] Cleanup stepLM builtin function, remove duplicate
---
scripts/algorithms/StepLinearRegDS.dml | 93 ----------------------
scripts/builtin/steplm.dml | 31 ++++----
.../parttwo/AlgorithmStepwiseRegression.java | 10 ++-
.../functions/codegenalg/Algorithm_StepLM.dml | 38 +++++++++
4 files changed, 60 insertions(+), 112 deletions(-)
diff --git a/scripts/algorithms/StepLinearRegDS.dml
b/scripts/algorithms/StepLinearRegDS.dml
deleted file mode 100644
index a8740f5884..0000000000
--- a/scripts/algorithms/StepLinearRegDS.dml
+++ /dev/null
@@ -1,93 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
-# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
-#
-# INPUT PARAMETERS:
-#
--------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-#
--------------------------------------------------------------------------------------------
-# X String --- Location (on HDFS) to read the matrix X of
feature vectors
-# Y String --- Location (on HDFS) to read the 1-column matrix
Y of response values
-# B String --- Location to store estimated regression
parameters (the betas)
-# S String --- Location to write the selected features ordered
as computed by the algorithm
-# O String " " Location to write the printed statistics; by
default is standard output
-# icpt Int 0 Intercept presence, shifting and rescaling the
columns of X:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor
rescale X;
-# 2 = add intercept, shift & rescale X columns to
mean = 0, variance = 1
-# thr Double 0.01 Threshold to stop the algorithm: if the
decrease in the value of AIC falls below thr
-# no further features are being checked and the
algorithm stops
-# fmt String "text" Matrix output format for B (the betas) only,
usually "text" or "csv"
-# write_beta Boolean TRUE Should the betas be returned?
-# 0 = no
-# 1 = yes
-#
--------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of regression parameters (the betas) and its size depend on
icpt input value:
-# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM
X AND B:
-# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*%
B[1:ncol(X), 1], or just X %*% B
-# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*%
B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*%
B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# Col.2: betas for shifted/rescaled X and intercept
-#
-# In addition, in the last run of linear regression some statistics are
provided in CSV format, one comma-separated
-# name-value pair per each line, as follows:
-#
-# NAME MEANING
-#
-------------------------------------------------------------------------------------
-# AVG_TOT_Y Average of the response value Y
-# STDEV_TOT_Y Standard Deviation of the response value Y
-# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual
bias
-# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
-# DISPERSION GLM-style dispersion, i.e. residual sum of squares / #
deg. fr.
-# R2 R^2 of residual with bias included vs. total average
-# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total
average
-# R2_NOBIAS R^2 of residual with bias subtracted vs. total average
-# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs.
total average
-# R2_VS_0 * R^2 of residual with bias included vs. zero constant
-# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero
constant
-#
-------------------------------------------------------------------------------------
-# * The last two statistics are only printed if there is no intercept (icpt=0)
-# If the best AIC is achieved without any features the matrix of selected
features contains 0.
-# Moreover, in this case no further statistics will be produced
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X
Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
-# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
write_beta=TRUE
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileS = $S;
-write_beta = ifdef($write_beta, TRUE);
-fmt = ifdef ($fmt, "text");
-intercept = ifdef ($icpt, 1);
-thr = ifdef ($thr, 0.001);
-
-X_orig = read (fileX);
-y = read (fileY);
-
-[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);
-
-write(Selected, fileS, format=fmt);
-write(beta_out, fileB, format=fmt);
diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index bad1e7f35a..83c7c874b2 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -38,16 +38,16 @@
#
# INPUT:
#
------------------------------------------------------------------------------------------
-# X Location (on HDFS) to read the matrix X of feature vectors
-# Y Location (on HDFS) to read the 1-column matrix Y of response values
+# X Matrix X of feature vectors
+# Y Single-column Matrix Y of response values
# icpt Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance
= 1
-# reg learning rate
+# reg Regularization parameter, 0 for no penalty
# tol Tolerance threshold to train until achieved
-# maxi maximum iterations 0 means until tolerance is reached
-# verbose If the algorithm should be verbose
+# maxi Maximum iterations 0 means until tolerance is reached
+# verbose Indicator for verbose debug output
#
------------------------------------------------------------------------------------------
#
# OUTPUT:
@@ -67,7 +67,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
# start from one feature and iteratively add features until AIC improves
thr = 0.001;
- if(verbose)
+ if(verbose)
print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
X_orig = X;
n = nrow(X_orig);
@@ -76,10 +76,10 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
# BEGIN STEPWISE LINEAR REGRESSION
columns_fixed = matrix(0, 1, m_orig);
columns_fixed_ordered = matrix(0, 1, 1);
-
+
# X_global stores the best model found at each step
X_global = matrix(0, n, 1);
-
+
if (icpt == 1 | icpt == 2) {
beta = mean(y);
AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
@@ -88,7 +88,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
AIC_best_orig = n * log(sum(y ^ 2) / n);
}
if(verbose)
- print("Best AIC without any features: " + AIC_best_orig);
+ print("Best AIC without any features: " + AIC_best_orig);
boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
beta_out_all = matrix(0, boa_ncol, m_orig);
@@ -107,14 +107,14 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
beta_best = beta_out_all[, column_best];
if (column_best == 0) {
if(verbose)
- print("AIC of an empty model is " + AIC_best + " and adding
no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
+ print("AIC of an empty model is " + AIC_best + " and adding no feature
achieves more than " + (thr * 100) + "% decrease in AIC!");
B = matrix(0, m_orig, 1);
if (icpt != 0)
B = rbind(B, as.matrix(beta));
S = matrix(0, 1, 1);
}
else {
- if(verbose)
+ if(verbose)
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
@@ -152,7 +152,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
if (as.scalar(columns_fixed[1, column_best]) == 0) {
# new best feature found
if(verbose)
- print("Best AIC " + AIC_best + " achieved
with feature: " + column_best);
+ print("Best AIC " + AIC_best + " achieved with feature: " +
column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered = cbind(columns_fixed_ordered,
as.matrix(column_best));
if (ncol(columns_fixed_ordered) == m_orig) {
@@ -168,7 +168,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
}
# run linear regression with selected set of features
if( verbose )
- print("Running linear regression with selected features...");
+ print("Running linear regression with selected features...");
[AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi,
verbose);
S = columns_fixed_ordered;
if (icpt != 0)
@@ -178,13 +178,13 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y,
Integer icpt = 0,
}
# Computes linear regression using lm and outputs AIC.
-linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
+linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
Double reg, Double tol, Integer maxi, Boolean verbose)
return(Double AIC, Matrix[Double] beta)
{
# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi,
verbose=FALSE);
-
+
# PREPARE X for SCORING
if( icpt != 0 )
X = cbind(X, matrix(1,nrow(X),1))
@@ -224,3 +224,4 @@ reorder_matrix = function(
checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return
(Boolean R) {
R = (AIC_cur < AIC_best) & (AIC_best-AIC_cur > abs(thr * AIC_best))
}
+
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java
b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java
index dff485f76e..f2155225bc 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java
@@ -34,7 +34,8 @@ import org.junit.Test;
public class AlgorithmStepwiseRegression extends AutomatedTestBase
{
- private final static String TEST_NAME1 = "Algorithm_Stepwise";
+ private final static String TEST_NAME1 = "Algorithm_StepLM";
+ private final static String TEST_NAME2 = "Algorithm_StepGLM";
private final static String TEST_DIR = "functions/codegenalg/";
private final static String TEST_CLASS_DIR = TEST_DIR +
AlgorithmStepwiseRegression.class.getSimpleName() + "/";
@@ -58,6 +59,7 @@ public class AlgorithmStepwiseRegression extends
AutomatedTestBase
public void setUp() {
TestUtils.clearAssertionInformation();
addTestConfiguration(TEST_NAME1, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" }));
+ addTestConfiguration(TEST_NAME2, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "w" }));
}
@Test
@@ -188,18 +190,18 @@ public class AlgorithmStepwiseRegression extends
AutomatedTestBase
try
{
- String TEST_NAME = TEST_NAME1;
+ String TEST_NAME = (type==StepwiseType.LINREG_DS) ?
TEST_NAME1 : TEST_NAME2;
TestConfiguration config =
getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
if( type == StepwiseType.LINREG_DS) {
- fullDMLScriptName =
"scripts/algorithms/StepLinearRegDS.dml";
programArgs = new String[]{ "-stats", "-nvargs",
"X="+input("X"), "Y="+input("Y"),
"icpt="+String.valueOf(icpt),
"thr="+String.valueOf(thr),
"B="+output("B"), "S="+output("S")};
}
else { //GLM binomial probit
- fullDMLScriptName =
"scripts/algorithms/StepGLM.dml";
programArgs = new String[]{ "-stats", "-nvargs",
"X="+input("X"), "Y="+input("Y"),
"icpt="+String.valueOf(icpt),
"thr="+String.valueOf(thr), "link=3",
"yneg=0",
diff --git a/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml
b/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml
new file mode 100644
index 0000000000..f5e606a591
--- /dev/null
+++ b/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileS = $S;
+write_beta = ifdef($write_beta, TRUE);
+fmt = ifdef ($fmt, "text");
+intercept = ifdef ($icpt, 1);
+thr = ifdef ($thr, 0.001);
+
+X_orig = read (fileX);
+y = read (fileY);
+
+[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);
+
+write(Selected, fileS, format=fmt);
+write(beta_out, fileB, format=fmt);
+