This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new b1cb505dd7 [SYSTEMDS-3681] Cleanup stepLM builtin function, remove duplicate b1cb505dd7 is described below commit b1cb505dd7c9cf2f2703b143f3567eaaa8509ae7 Author: Matthias Boehm <mboe...@gmail.com> AuthorDate: Sat May 4 19:41:25 2024 +0200 [SYSTEMDS-3681] Cleanup stepLM builtin function, remove duplicate --- scripts/algorithms/StepLinearRegDS.dml | 93 ---------------------- scripts/builtin/steplm.dml | 31 ++++---- .../parttwo/AlgorithmStepwiseRegression.java | 10 ++- .../functions/codegenalg/Algorithm_StepLM.dml | 38 +++++++++ 4 files changed, 60 insertions(+), 112 deletions(-) diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml deleted file mode 100644 index a8740f5884..0000000000 --- a/scripts/algorithms/StepLinearRegDS.dml +++ /dev/null @@ -1,93 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC -# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# S String --- Location to write the selected features ordered as computed by the algorithm -# O String " " Location to write the printed statistics; by default is standard output -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr -# no further features are being checked and the algorithm stops -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# write_beta Boolean TRUE Should the betas be returned? 
-# 0 = no -# 1 = yes -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated -# name-value pair per each line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# R2 R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# R2_NOBIAS R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# R2_VS_0 * R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# If the best AIC is achieved without any features the matrix of selected features contains 0. 
-# Moreover, in this case no further statistics will be produced -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas -# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE - -fileX = $X; -fileY = $Y; -fileB = $B; -fileS = $S; -write_beta = ifdef($write_beta, TRUE); -fmt = ifdef ($fmt, "text"); -intercept = ifdef ($icpt, 1); -thr = ifdef ($thr, 0.001); - -X_orig = read (fileX); -y = read (fileY); - -[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE); - -write(Selected, fileS, format=fmt); -write(beta_out, fileB, format=fmt); diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml index bad1e7f35a..83c7c874b2 100644 --- a/scripts/builtin/steplm.dml +++ b/scripts/builtin/steplm.dml @@ -38,16 +38,16 @@ # # INPUT: # ------------------------------------------------------------------------------------------ -# X Location (on HDFS) to read the matrix X of feature vectors -# Y Location (on HDFS) to read the 1-column matrix Y of response values +# X Matrix X of feature vectors +# Y Single-column Matrix Y of response values # icpt Intercept presence, shifting and rescaling the columns of X: # 0 = no intercept, no shifting, no rescaling; # 1 = add intercept, but neither shift nor rescale X; # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg learning rate +# reg Regularization parameter, 0 for no penalty # tol Tolerance threshold to train until achieved -# maxi maximum iterations 0 means until tolerance is reached -# verbose If the algorithm should be verbose +# maxi Maximum iterations 0 means until tolerance is reached +# verbose Indicator for verbose debug output # ------------------------------------------------------------------------------------------ # # OUTPUT: @@ -67,7 +67,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, # start from one feature and 
iteratively add features until AIC improves thr = 0.001; - if(verbose) + if(verbose) print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT"); X_orig = X; n = nrow(X_orig); @@ -76,10 +76,10 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, # BEGIN STEPWISE LINEAR REGRESSION columns_fixed = matrix(0, 1, m_orig); columns_fixed_ordered = matrix(0, 1, 1); - + # X_global stores the best model found at each step X_global = matrix(0, n, 1); - + if (icpt == 1 | icpt == 2) { beta = mean(y); AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n); @@ -88,7 +88,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, AIC_best_orig = n * log(sum(y ^ 2) / n); } if(verbose) - print("Best AIC without any features: " + AIC_best_orig); + print("Best AIC without any features: " + AIC_best_orig); boa_ncol = ncol(X_orig) + as.integer(icpt!=0); beta_out_all = matrix(0, boa_ncol, m_orig); @@ -107,14 +107,14 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, beta_best = beta_out_all[, column_best]; if (column_best == 0) { if(verbose) - print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); + print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); B = matrix(0, m_orig, 1); if (icpt != 0) B = rbind(B, as.matrix(beta)); S = matrix(0, 1, 1); } else { - if(verbose) + if(verbose) print("Best AIC " + AIC_best + " achieved with feature: " + column_best); columns_fixed[1, column_best] = 1; @@ -152,7 +152,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, if (as.scalar(columns_fixed[1, column_best]) == 0) { # new best feature found if(verbose) - print("Best AIC " + AIC_best + " achieved with feature: " + column_best); + print("Best AIC " + AIC_best + " achieved with feature: " + column_best); columns_fixed[1, column_best] = 1; columns_fixed_ordered = 
cbind(columns_fixed_ordered, as.matrix(column_best)); if (ncol(columns_fixed_ordered) == m_orig) { @@ -168,7 +168,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, } # run linear regression with selected set of features if( verbose ) - print("Running linear regression with selected features..."); + print("Running linear regression with selected features..."); [AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose); S = columns_fixed_ordered; if (icpt != 0) @@ -178,13 +178,13 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, } # Computes linear regression using lm and outputs AIC. -linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt, +linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt, Double reg, Double tol, Integer maxi, Boolean verbose) return(Double AIC, Matrix[Double] beta) { # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL) beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE); - + # PREPARE X for SCORING if( icpt != 0 ) X = cbind(X, matrix(1,nrow(X),1)) @@ -224,3 +224,4 @@ reorder_matrix = function( checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) { R = (AIC_cur < AIC_best) & (AIC_best-AIC_cur > abs(thr * AIC_best)) } + diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java index dff485f76e..f2155225bc 100644 --- a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java +++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmStepwiseRegression.java @@ -34,7 +34,8 @@ import org.junit.Test; public class AlgorithmStepwiseRegression extends AutomatedTestBase { - private final static String TEST_NAME1 = "Algorithm_Stepwise"; + private final static String 
TEST_NAME1 = "Algorithm_StepLM"; + private final static String TEST_NAME2 = "Algorithm_StepGLM"; private final static String TEST_DIR = "functions/codegenalg/"; private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmStepwiseRegression.class.getSimpleName() + "/"; @@ -58,6 +59,7 @@ public class AlgorithmStepwiseRegression extends AutomatedTestBase public void setUp() { TestUtils.clearAssertionInformation(); addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); + addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "w" })); } @Test @@ -188,18 +190,18 @@ public class AlgorithmStepwiseRegression extends AutomatedTestBase try { - String TEST_NAME = TEST_NAME1; + String TEST_NAME = (type==StepwiseType.LINREG_DS) ? TEST_NAME1 : TEST_NAME2; TestConfiguration config = getTestConfiguration(TEST_NAME); loadTestConfiguration(config); + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; if( type == StepwiseType.LINREG_DS) { - fullDMLScriptName = "scripts/algorithms/StepLinearRegDS.dml"; programArgs = new String[]{ "-stats", "-nvargs", "X="+input("X"), "Y="+input("Y"), "icpt="+String.valueOf(icpt), "thr="+String.valueOf(thr), "B="+output("B"), "S="+output("S")}; } else { //GLM binomial probit - fullDMLScriptName = "scripts/algorithms/StepGLM.dml"; programArgs = new String[]{ "-stats", "-nvargs", "X="+input("X"), "Y="+input("Y"), "icpt="+String.valueOf(icpt), "thr="+String.valueOf(thr), "link=3", "yneg=0", diff --git a/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml b/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml new file mode 100644 index 0000000000..f5e606a591 --- /dev/null +++ b/src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml @@ -0,0 +1,38 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +fileX = $X; +fileY = $Y; +fileB = $B; +fileS = $S; +write_beta = ifdef($write_beta, TRUE); +fmt = ifdef ($fmt, "text"); +intercept = ifdef ($icpt, 1); +thr = ifdef ($thr, 0.001); + +X_orig = read (fileX); +y = read (fileY); + +[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE); + +write(Selected, fileS, format=fmt); +write(beta_out, fileB, format=fmt); +