This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new a445fdf [SYSTEMDS-2983] Extended gridSearch by k-fold cross validation
a445fdf is described below
commit a445fdfc589e2862f814d04f22b38a6f88eaa1b6
Author: Matthias Boehm <[email protected]>
AuthorDate: Fri Jun 4 16:02:52 2021 +0200
[SYSTEMDS-2983] Extended gridSearch by k-fold cross validation
This patch extends the existing gridSearch for hyper-parameter
optimization by optional k-fold cross validation (each fold is held
out once). If enabled, X and y are split into cvk folds, every
hyper-parameter combination is trained and evaluated over the cvk
fold combinations, and the final model and loss are obtained via
model averaging and the average loss, respectively.
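
As a minimal usage sketch of the new interface (assuming X and y are
already loaded, and an 'l2norm' predict function as defined in the new
GridSearchLMCV.dml test below):

  params = list("icpt", "reg", "tol", "maxi");
  paramValues = list(seq(0,1,2), 10^seq(0,-4), 10^seq(-6,-12), 10^seq(1,3));
  # cv=TRUE enables cvk-fold cross validation; the returned B is the
  # fold-averaged model of the combination with minimal average loss
  [B, opt] = gridSearch(X=X, y=y, train="lm", predict="l2norm",
    numB=ncol(X)+1, params=params, paramValues=paramValues, cv=TRUE, cvk=5);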
---
scripts/builtin/gridSearch.dml | 63 ++++++++++++++++++----
.../sysds/hops/ipa/InterProceduralAnalysis.java | 3 ++
.../functions/builtin/BuiltinGridSearchTest.java | 14 ++++-
.../scripts/functions/builtin/GridSearchLMCV.dml | 48 +++++++++++++++++
4 files changed, 116 insertions(+), 12 deletions(-)
diff --git a/scripts/builtin/gridSearch.dml b/scripts/builtin/gridSearch.dml
index 5a5bdba..1c0568d 100644
--- a/scripts/builtin/gridSearch.dml
+++ b/scripts/builtin/gridSearch.dml
@@ -36,6 +36,8 @@
# predictArgs List of arguments to pass to the 'predict' function, where
# gridSearch appends the trained models at the end, if
# not provided or an empty list, list(X, y) is used instead
+# cv flag enabling k-fold cross validation; otherwise the training loss is used
+# cvk if cv=TRUE, specifies the number of folds, otherwise ignored
# verbose flag for verbose debug output
#-------------------------------------------------------------------------------
# B the trained model with minimal loss (by the 'predict' function)
@@ -45,7 +47,7 @@
m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String predict,
Integer numB=ncol(X), List[String] params, List[Unknown] paramValues,
List[Unknown] trainArgs = list(), List[Unknown] predictArgs = list(),
- Boolean verbose = TRUE)
+ Boolean cv = FALSE, Integer cvk = 5, Boolean verbose = TRUE)
return (Matrix[Double] B, Frame[Unknown] opt)
{
# Step 0) handling default arguments, which require access to passed data
@@ -53,6 +55,10 @@ m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String
trainArgs = list(X=X, y=y, icpt=0, reg=-1, tol=-1, maxi=-1, verbose=FALSE);
if( length(predictArgs) == 0 )
predictArgs = list(X, y);
+ if( cv & cvk <= 1 ) {
+ print("gridSearch: called with cv=TRUE but cvk="+cvk+", set to default cvk=5.")
+ cvk = 5;
+ }
# Step 1) preparation of parameters, lengths, and values in convenient form
numParams = length(params);
@@ -83,19 +89,54 @@ m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String
}
# Step 3) training/scoring of parameter combinations
- # TODO integrate cross validation
Rbeta = matrix(0, nrow(HP), numB);
Rloss = matrix(0, nrow(HP), 1);
- parfor( i in 1:nrow(HP) ) {
- # a) replace training arguments
- ltrainArgs = trainArgs;
- for( j in 1:numParams )
- ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
- # b) core training/scoring and write-back
- lbeta = t(eval(train, ltrainArgs))
- Rbeta[i,1:ncol(lbeta)] = lbeta;
- Rloss[i,] = eval(predict, append(predictArgs,t(lbeta)));
+ # with cross-validation
+ if( cv ) {
+ # a) create folds
+ foldsX = list(); foldsY = list();
+ fs = ceil(nrow(X)/cvk);
+ for( k in 0:(cvk-1) ) {
+ foldsX = append(foldsX, X[(k*fs+1):min((k+1)*fs,nrow(X)),]);
+ foldsY = append(foldsY, y[(k*fs+1):min((k+1)*fs,nrow(y)),]);
+ }
+ parfor( i in 1:nrow(HP) ) {
+ # a) replace training arguments
+ ltrainArgs = trainArgs;
+ lpredictArgs = predictArgs;
+ for( j in 1:numParams )
+ ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
+ # b) cross-validated training/scoring and write-back
+ cvbeta = matrix(0,1,numB);
+ cvloss = matrix(0,1,1);
+ for( k in 1:cvk ) {
+ [tmpX, testX] = remove(foldsX, k);
+ [tmpy, testy] = remove(foldsY, k);
+ ltrainArgs['X'] = rbind(tmpX);
+ ltrainArgs['y'] = rbind(tmpy);
+ lbeta = t(eval(train, ltrainArgs));
+ cvbeta[,1:ncol(lbeta)] = cvbeta[,1:ncol(lbeta)] + lbeta;
+ lpredictArgs[1] = as.matrix(testX);
+ lpredictArgs[2] = as.matrix(testy);
+ cvloss += eval(predict, append(lpredictArgs,t(lbeta)));
+ }
+ Rbeta[i,] = cvbeta / cvk; # model averaging
+ Rloss[i,] = cvloss / cvk;
+ }
+ }
+ # without cross-validation
+ else {
+ parfor( i in 1:nrow(HP) ) {
+ # a) replace training arguments
+ ltrainArgs = trainArgs;
+ for( j in 1:numParams )
+ ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
+ # b) core training/scoring and write-back
+ lbeta = t(eval(train, ltrainArgs))
+ Rbeta[i,1:ncol(lbeta)] = lbeta;
+ Rloss[i,] = eval(predict, append(predictArgs,t(lbeta)));
+ }
}
# Step 4) select best parameter combination
diff --git a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
index 9dcfd53..1518fe1 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
@@ -547,6 +547,9 @@ public class InterProceduralAnalysis
for( int i=0; i<Math.min(inputOps.size(), funArgNames.length); i++ ) {
//create mapping between input hops and vars
DataIdentifier dat = fstmt.getInputParam(funArgNames[i]);
+ if( dat == null )
+ throw new HopsException("Failed IPA: function argument '"+funArgNames[i]+"' "
+ + "does not exist in function signature of "+fop.getFunctionKey()+".");
Hop input = inputOps.get(i);
if( input.getDataType()==DataType.MATRIX )
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
index 34504c9..2623f18 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
@@ -33,10 +33,11 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
private final static String TEST_NAME1 = "GridSearchLM";
private final static String TEST_NAME2 = "GridSearchMLogreg";
private final static String TEST_NAME3 = "GridSearchLM2";
+ private final static String TEST_NAME4 = "GridSearchLMCV";
private final static String TEST_DIR = "functions/builtin/";
private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinGridSearchTest.class.getSimpleName() + "/";
- private final static int rows = 300;
+ private final static int rows = 400;
private final static int cols = 20;
@Override
@@ -44,6 +45,7 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
addTestConfiguration(TEST_NAME1,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
addTestConfiguration(TEST_NAME2,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2,new String[]{"R"}));
addTestConfiguration(TEST_NAME3,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3,new String[]{"R"}));
+ addTestConfiguration(TEST_NAME4,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4,new String[]{"R"}));
}
@Test
@@ -81,6 +83,16 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
runGridSearch(TEST_NAME3, ExecMode.HYBRID);
}
+ @Test
+ public void testGridSearchLmCvCP() {
+ runGridSearch(TEST_NAME4, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testGridSearchLmCvHybrid() {
+ runGridSearch(TEST_NAME4, ExecMode.HYBRID);
+ }
+
private void runGridSearch(String testname, ExecMode et)
{
ExecMode modeOld = setExecMode(et);
diff --git a/src/test/scripts/functions/builtin/GridSearchLMCV.dml b/src/test/scripts/functions/builtin/GridSearchLMCV.dml
new file mode 100644
index 0000000..2097818
--- /dev/null
+++ b/src/test/scripts/functions/builtin/GridSearchLMCV.dml
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+l2norm = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
+ return (Matrix[Double] loss)
+{
+ yhat = lmPredict(X=X, B=B, ytest=y)
+ loss = as.matrix(sum((y - yhat)^2));
+}
+
+X = read($1);
+y = read($2);
+
+N = 300;
+Xtrain = X[1:N,];
+ytrain = y[1:N,];
+Xtest = X[(N+1):nrow(X),];
+ytest = y[(N+1):nrow(y),];
+
+params = list("icpt","reg", "tol", "maxi");
+paramRanges = list(seq(0,1,2),10^seq(0,-4), 10^seq(-6,-12), 10^seq(1,3));
+[B1, opt] = gridSearch(X=Xtrain, y=ytrain, train="lm", predict="l2norm",
+ numB=ncol(X)+1, params=params, paramValues=paramRanges, cv=TRUE, cvk=3);
+B2 = lm(X=Xtrain, y=ytrain, verbose=FALSE);
+
+l1 = l2norm(Xtest, ytest, B1);
+l2 = l2norm(Xtest, ytest, B2);
+R = as.scalar(l1 < l2);
+
+write(R, $3)
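
For reference, a self-contained sketch of the fold handling used in
Step 3 of the patched gridSearch (hypothetical small X and y; mirrors
the fold creation and recombination code above):

  cvk = 3;
  X = rand(rows=9, cols=2);
  y = rand(rows=9, cols=1);
  # create cvk contiguous row partitions of X and y
  foldsX = list(); foldsY = list();
  fs = ceil(nrow(X)/cvk);
  for( k in 0:(cvk-1) ) {
    foldsX = append(foldsX, X[(k*fs+1):min((k+1)*fs,nrow(X)),]);
    foldsY = append(foldsY, y[(k*fs+1):min((k+1)*fs,nrow(y)),]);
  }
  # hold out each fold once, training on the rbind of the remaining folds
  for( k in 1:cvk ) {
    [tmpX, testX] = remove(foldsX, k);
    trainX = rbind(tmpX);
    print("fold "+k+": train rows="+nrow(trainX)+", test rows="+nrow(as.matrix(testX)));
  }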