This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new b2699ae [SYSTEMDS-113] Cleanup lmPredict (rename, parameters, eval
accuracy)
b2699ae is described below
commit b2699ae3951c05ba8d6e14888eac6acd3ac5e77c
Author: Matthias Boehm <[email protected]>
AuthorDate: Fri Jan 29 15:38:49 2021 +0100
[SYSTEMDS-113] Cleanup lmPredict (rename, parameters, eval accuracy)
---
docs/site/builtins-reference.md | 21 +++++++++-------
scripts/builtin/cvlm.dml | 2 +-
scripts/builtin/hyperband.dml | 2 +-
scripts/builtin/{lmpredict.dml => lmPredict.dml} | 29 ++++++++++++++--------
scripts/builtin/mice.dml | 2 +-
scripts/builtin/outlierByArima.dml | 2 +-
.../java/org/apache/sysds/common/Builtins.java | 2 +-
src/test/scripts/functions/builtin/lmpredict.dml | 4 +--
.../functions/federated/FederatedLmPipeline.dml | 14 +----------
.../federated/FederatedLmPipeline4Workers.dml | 14 +----------
.../FederatedLmPipeline4WorkersReference.dml | 14 +----------
.../federated/FederatedLmPipelineReference.dml | 14 +----------
.../scripts/functions/lineage/LineageReuseAlg6.dml | 2 +-
.../functions/recompile/IPAFunctionArgsFor.dml | 4 +--
.../functions/recompile/IPAFunctionArgsParfor.dml | 4 +--
15 files changed, 46 insertions(+), 84 deletions(-)
diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md
index 022b75e..8263c10 100644
--- a/docs/site/builtins-reference.md
+++ b/docs/site/builtins-reference.md
@@ -45,7 +45,7 @@ limitations under the License.
* [`lm`-Function](#lm-function)
* [`lmDS`-Function](#lmds-function)
* [`lmCG`-Function](#lmcg-function)
- * [`lmpredict`-Function](#lmpredict-function)
+ * [`lmPredict`-Function](#lmpredict-function)
* [`mice`-Function](#mice-function)
* [`multiLogReg`-Function](#multiLogReg-function)
* [`pnmf`-Function](#pnmf-function)
@@ -183,7 +183,7 @@ y = toOneHot(X, numClasses)
## `cvlm`-Function
The `cvlm`-function is used for cross-validation of the provided data model.
This function follows a non-exhaustive
-cross validation method. It uses [`lm`](#lm-function) and
[`lmpredict`](#lmpredict-function) functions to solve the linear
+cross validation method. It uses [`lm`](#lm-function) and
[`lmPredict`](#lmpredict-function) functions to solve the linear
regression and to predict the class of a feature vector with no intercept,
shifting, and rescaling.
### Usage
@@ -425,7 +425,7 @@ Through multiple parallel brackets and consecutive trials
it will return the hyp
on a validation dataset. A set of hyper parameter combinations is drawn from
uniform distributions with given ranges; Those
make up the candidates for `hyperband`.
Notes:
-* `hyperband` is hard-coded for `lmCG`, and uses `lmpredict` for validation
+* `hyperband` is hard-coded for `lmCG`, and uses `lmPredict` for validation
* `hyperband` is hard-coded to use the number of iterations as a resource
* `hyperband` can only optimize continuous hyperparameters
@@ -778,14 +778,14 @@ y = X %*% rand(rows = ncol(X), cols = 1)
lmCG(X = X, y = y, maxi = 10)
```
-## `lmpredict`-Function
+## `lmPredict`-Function
-The `lmpredict`-function predicts the class of a feature vector.
+The `lmPredict`-function predicts the class of a feature vector.
### Usage
```r
-lmpredict(X, w)
+lmPredict(X=X, B=w)
```
### Arguments
@@ -793,8 +793,11 @@ lmpredict(X, w)
| Name | Type | Default | Description |
| :------ | :------------- | -------- | :---------- |
| X | Matrix[Double] | required | Matrix of feature vector(s). |
-| w | Matrix[Double] | required | 1-column matrix of weights. |
-| icpt | Matrix[Double] | `0` | Intercept presence, shifting and
rescaling of X ([Details](#icpt-argument))|
+| B | Matrix[Double] | required | 1-column matrix of weights. |
+| ytest | Matrix[Double] | optional | Optional test labels, used only for
verbose output. |
+| icpt | Integer | 0 | Intercept presence, shifting and
rescaling of X ([Details](#icpt-argument))|
+| verbose | Boolean | FALSE | Print various statistics for
evaluating accuracy. |
+
### Returns
@@ -808,7 +811,7 @@ lmpredict(X, w)
X = rand (rows = 50, cols = 10)
y = X %*% rand(rows = ncol(X), cols = 1)
w = lm(X = X, y = y)
-yp = lmpredict(X, w)
+yp = lmPredict(X = X, B = w)
```
## `mice`-Function
diff --git a/scripts/builtin/cvlm.dml b/scripts/builtin/cvlm.dml
index 57759fa..4836d0b 100644
--- a/scripts/builtin/cvlm.dml
+++ b/scripts/builtin/cvlm.dml
@@ -42,7 +42,7 @@ m_cvlm = function(Matrix[Double] X, Matrix[Double] y, Integer
k, Integer icpt =
}
beta = lm(X=trainSet, y=trainRes, icpt=icpt, reg=reg);
- pred = lmpredict(X=testSet, w=beta, icpt=icpt);
+ pred = lmPredict(X=testSet, B=beta, icpt=icpt);
y_predict[testS:testE,] = pred;
allbeta[i,] = t(beta);
}
diff --git a/scripts/builtin/hyperband.dml b/scripts/builtin/hyperband.dml
index b348a9f..e394ccc 100644
--- a/scripts/builtin/hyperband.dml
+++ b/scripts/builtin/hyperband.dml
@@ -104,7 +104,7 @@ m_hyperband = function(Matrix[Double] X_train,
Matrix[Double] y_train,
tol=as.scalar(args[1]), reg=as.scalar(args[2]), maxi=r_i,
verbose=FALSE));
candidateWeights[curCandidate] = t(weights)
- preds = lmpredict(X=X_val, w=weights);
+ preds = lmPredict(X=X_val, B=weights);
scoreboard[curCandidate,1] = as.matrix(sum((y_val - preds)^2));
}
diff --git a/scripts/builtin/lmpredict.dml b/scripts/builtin/lmPredict.dml
similarity index 53%
rename from scripts/builtin/lmpredict.dml
rename to scripts/builtin/lmPredict.dml
index 6797646..1bdabf3 100644
--- a/scripts/builtin/lmpredict.dml
+++ b/scripts/builtin/lmPredict.dml
@@ -19,17 +19,24 @@
#
#-------------------------------------------------------------
-m_lmpredict = function(Matrix[Double] X, Matrix[Double] w, Integer icpt = 0)
return (Matrix[Double] y) {
- intercept_status = icpt;
+m_lmPredict = function(Matrix[Double] X, Matrix[Double] B,
+ Matrix[Double] ytest = matrix(0,1,1), Integer icpt = 0, Boolean verbose =
FALSE)
+ return (Matrix[Double] yhat)
+{
+ intercept = ifelse(icpt==0, matrix(0,1,ncol(B)), B[nrow(B),]);
+ yhat = X %*% B[1:ncol(X)] + matrix(1,nrow(X),1) %*% intercept;
- if (intercept_status == 0) {
- y = X %*% w
- }
- else if (intercept_status == 1) {
- ones_n = matrix (1, rows = nrow (X), cols = 1);
- X = cbind (X, ones_n);
- y = X %*% w;
- } else {
- #ToDo: icpt == 2
+ if( verbose ) {
+ y_residual = ytest - yhat;
+ avg_res = sum(y_residual) / nrow(ytest);
+ ss_res = sum(y_residual^2);
+ ss_avg_res = ss_res - nrow(ytest) * avg_res^2;
+ R2 = 1 - ss_res / (sum(ytest^2) - nrow(ytest) *
(sum(ytest)/nrow(ytest))^2);
+ print("\nAccuracy:" +
+ "\n--sum(ytest) = " + sum(ytest) +
+ "\n--sum(yhat) = " + sum(yhat) +
+ "\n--AVG_RES_Y: " + avg_res +
+ "\n--SS_AVG_RES_Y: " + ss_avg_res +
+ "\n--R2: " + R2 );
}
}
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 48c1dce..ccb8f95 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -133,7 +133,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask,
Integer iter = 3, Boole
# learn a regression line
beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol
= 1e-7);
# predicting missing values
- pred = lmpredict(X=test_X, w=beta, icpt=1)
+ pred = lmPredict(X=test_X, B=beta, icpt=1)
# imputing missing column values (assumes Mask_Filled being 0/1-matrix)
R = removeEmpty(target=Mask_Filled[, in_c] * seq(1,nrow(X1)),
margin="rows");
# TODO modify removeEmpty to return zero row and n columns
diff --git a/scripts/builtin/outlierByArima.dml
b/scripts/builtin/outlierByArima.dml
index b00c07e..114c28d 100644
--- a/scripts/builtin/outlierByArima.dml
+++ b/scripts/builtin/outlierByArima.dml
@@ -64,7 +64,7 @@ m_outlierByArima = function(Matrix[Double] X, Double k = 3,
Integer repairMethod
# TODO replace by ARIMA once fully supported, LM only emulated the AR part
model = lm(X=features, y=X_adapted)
- y_hat = lmpredict(X=features, w=model)
+ y_hat = lmPredict(X=features, B=model)
upperBound = sd(X) + k * y_hat
lowerBound = sd(X) - k * y_hat
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index a1f372c..c76e5ae 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -144,7 +144,7 @@ public enum Builtins {
LM("lm", true),
LMCG("lmCG", true),
LMDS("lmDS", true),
- LMPREDICT("lmpredict", true),
+ LMPREDICT("lmPredict", true),
LOG("log", false),
LOGSUMEXP("logSumExp", true),
LSTM("lstm", false, ReturnType.MULTI_RETURN),
diff --git a/src/test/scripts/functions/builtin/lmpredict.dml
b/src/test/scripts/functions/builtin/lmpredict.dml
index f4f2178..1b0c2ec 100644
--- a/src/test/scripts/functions/builtin/lmpredict.dml
+++ b/src/test/scripts/functions/builtin/lmpredict.dml
@@ -23,5 +23,5 @@ X = read($1) # Training data
y = read($2) # response values
p = read($3) # random data to predict
w = lmDS(X = X, y = y, icpt = 1, reg = 1e-12)
-p = lmpredict(X = X, w = w, icpt = 1)
-write(p, $4)
\ No newline at end of file
+p = lmPredict(X = X, B = w, icpt = 1)
+write(p, $4)
diff --git a/src/test/scripts/functions/federated/FederatedLmPipeline.dml
b/src/test/scripts/functions/federated/FederatedLmPipeline.dml
index a0862fa..fdad81c 100644
--- a/src/test/scripts/functions/federated/FederatedLmPipeline.dml
+++ b/src/test/scripts/functions/federated/FederatedLmPipeline.dml
@@ -47,19 +47,7 @@ X = scale(X=X, center=TRUE, scale=TRUE);
B = lm(X=Xtrain, y=ytrain, icpt=1, reg=1e-3, tol=1e-9, verbose=TRUE)
# model evaluation on test split
-yhat = lmpredict(X=Xtest, w=B, icpt=1);
-y_residual = ytest - yhat;
-
-avg_res = sum(y_residual) / nrow(ytest);
-ss_res = sum(y_residual^2);
-ss_avg_res = ss_res - nrow(ytest) * avg_res^2;
-R2 = 1 - ss_res / (sum(y^2) - nrow(ytest) * (sum(y)/nrow(ytest))^2);
-print("\nAccuracy:" +
- "\n--sum(ytest) = " + sum(ytest) +
- "\n--sum(yhat) = " + sum(yhat) +
- "\n--AVG_RES_Y: " + avg_res +
- "\n--SS_AVG_RES_Y: " + ss_avg_res +
- "\n--R2: " + R2 );
+yhat = lmPredict(X=Xtest, B=B, icpt=1, ytest=ytest, verbose=TRUE);
# write trained model and meta data
write(B, $out)
diff --git
a/src/test/scripts/functions/federated/FederatedLmPipeline4Workers.dml
b/src/test/scripts/functions/federated/FederatedLmPipeline4Workers.dml
index ebd96f7..dce7015 100644
--- a/src/test/scripts/functions/federated/FederatedLmPipeline4Workers.dml
+++ b/src/test/scripts/functions/federated/FederatedLmPipeline4Workers.dml
@@ -49,19 +49,7 @@ X = scale(X=X, center=TRUE, scale=TRUE);
B = lm(X=Xtrain, y=ytrain, icpt=1, reg=1e-3, tol=1e-9, verbose=TRUE)
# model evaluation on test split
-yhat = lmpredict(X=Xtest, w=B, icpt=1);
-y_residual = ytest - yhat;
-
-avg_res = sum(y_residual) / nrow(ytest);
-ss_res = sum(y_residual^2);
-ss_avg_res = ss_res - nrow(ytest) * avg_res^2;
-R2 = 1 - ss_res / (sum(y^2) - nrow(ytest) * (sum(y)/nrow(ytest))^2);
-print("\nAccuracy:" +
- "\n--sum(ytest) = " + sum(ytest) +
- "\n--sum(yhat) = " + sum(yhat) +
- "\n--AVG_RES_Y: " + avg_res +
- "\n--SS_AVG_RES_Y: " + ss_avg_res +
- "\n--R2: " + R2 );
+yhat = lmPredict(X=Xtest, B=B, icpt=1, ytest=ytest, verbose=TRUE);
# write trained model and meta data
write(B, $out)
diff --git
a/src/test/scripts/functions/federated/FederatedLmPipeline4WorkersReference.dml
b/src/test/scripts/functions/federated/FederatedLmPipeline4WorkersReference.dml
index 7888c0a..318f441 100644
---
a/src/test/scripts/functions/federated/FederatedLmPipeline4WorkersReference.dml
+++
b/src/test/scripts/functions/federated/FederatedLmPipeline4WorkersReference.dml
@@ -47,19 +47,7 @@ X = scale(X=X, center=TRUE, scale=TRUE);
B = lm(X=Xtrain, y=ytrain, icpt=1, reg=1e-3, tol=1e-9, verbose=TRUE)
# model evaluation on test split
-yhat = lmpredict(X=Xtest, w=B, icpt=1);
-y_residual = ytest - yhat;
-
-avg_res = sum(y_residual) / nrow(ytest);
-ss_res = sum(y_residual^2);
-ss_avg_res = ss_res - nrow(ytest) * avg_res^2;
-R2 = 1 - ss_res / (sum(y^2) - nrow(ytest) * (sum(y)/nrow(ytest))^2);
-print("\nAccuracy:" +
- "\n--sum(ytest) = " + sum(ytest) +
- "\n--sum(yhat) = " + sum(yhat) +
- "\n--AVG_RES_Y: " + avg_res +
- "\n--SS_AVG_RES_Y: " + ss_avg_res +
- "\n--R2: " + R2 );
+yhat = lmPredict(X=Xtest, B=B, icpt=1, ytest=ytest, verbose=TRUE);
# write trained model and meta data
write(B, $7)
diff --git
a/src/test/scripts/functions/federated/FederatedLmPipelineReference.dml
b/src/test/scripts/functions/federated/FederatedLmPipelineReference.dml
index ffdca07..1fe5c21 100644
--- a/src/test/scripts/functions/federated/FederatedLmPipelineReference.dml
+++ b/src/test/scripts/functions/federated/FederatedLmPipelineReference.dml
@@ -47,19 +47,7 @@ X = scale(X=X, center=TRUE, scale=TRUE);
B = lm(X=Xtrain, y=ytrain, icpt=1, reg=1e-3, tol=1e-9, verbose=TRUE)
# model evaluation on test split
-yhat = lmpredict(X=Xtest, w=B, icpt=1);
-y_residual = ytest - yhat;
-
-avg_res = sum(y_residual) / nrow(ytest);
-ss_res = sum(y_residual^2);
-ss_avg_res = ss_res - nrow(ytest) * avg_res^2;
-R2 = 1 - ss_res / (sum(y^2) - nrow(ytest) * (sum(y)/nrow(ytest))^2);
-print("\nAccuracy:" +
- "\n--sum(ytest) = " + sum(ytest) +
- "\n--sum(yhat) = " + sum(yhat) +
- "\n--AVG_RES_Y: " + avg_res +
- "\n--SS_AVG_RES_Y: " + ss_avg_res +
- "\n--R2: " + R2 );
+yhat = lmPredict(X=Xtest, B=B, icpt=1, ytest=ytest, verbose=TRUE);
# write trained model and meta data
write(B, $7)
diff --git a/src/test/scripts/functions/lineage/LineageReuseAlg6.dml
b/src/test/scripts/functions/lineage/LineageReuseAlg6.dml
index 6d0c14d..082580b 100644
--- a/src/test/scripts/functions/lineage/LineageReuseAlg6.dml
+++ b/src/test/scripts/functions/lineage/LineageReuseAlg6.dml
@@ -89,7 +89,7 @@ Kc = floor(ncol(A) * 0.8);
for (i in 1:10) {
newA1 = PCA(A=A, K=Kc+i);
beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
- y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
+ y_predict1 = lmPredict(X=newA1, B=beta1, icpt=1);
R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
R[,i] = R2_ad1;
}
diff --git a/src/test/scripts/functions/recompile/IPAFunctionArgsFor.dml
b/src/test/scripts/functions/recompile/IPAFunctionArgsFor.dml
index 96b2355..9b32d39 100644
--- a/src/test/scripts/functions/recompile/IPAFunctionArgsFor.dml
+++ b/src/test/scripts/functions/recompile/IPAFunctionArgsFor.dml
@@ -92,7 +92,7 @@ Kc = floor(ncol(A) * 0.8);
for (i in 1:10) {
newA1 = PCA(A=A, K=Kc+i);
beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
- y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
+ y_predict1 = lmPredict(X=newA1, B=beta1, icpt=1);
R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
R[,i] = R2_ad1;
}
@@ -100,7 +100,7 @@ for (i in 1:10) {
for (i in 1:10) {
newA3 = PCA(A=A, K=Kc+5);
beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE);
- y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1);
+ y_predict3 = lmPredict(X=newA3, B=beta3, icpt=1);
R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1);
R[,10+i] = R2_ad3;
}
diff --git a/src/test/scripts/functions/recompile/IPAFunctionArgsParfor.dml
b/src/test/scripts/functions/recompile/IPAFunctionArgsParfor.dml
index f7de41f..415e72e 100644
--- a/src/test/scripts/functions/recompile/IPAFunctionArgsParfor.dml
+++ b/src/test/scripts/functions/recompile/IPAFunctionArgsParfor.dml
@@ -92,7 +92,7 @@ Kc = floor(ncol(A) * 0.8);
for (i in 1:10) {
newA1 = PCA(A=A, K=Kc+i);
beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
- y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
+ y_predict1 = lmPredict(X=newA1, B=beta1, icpt=1);
R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
R[,i] = R2_ad1;
}
@@ -100,7 +100,7 @@ for (i in 1:10) {
parfor (i in 1:10) {
newA3 = PCA(A=A, K=Kc+5);
beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE);
- y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1);
+ y_predict3 = lmPredict(X=newA3, B=beta3, icpt=1);
R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1);
R[,10+i] = R2_ad3;
}