This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new a445fdf  [SYSTEMDS-2983] Extended gridSearch by k-fold cross validation
a445fdf is described below

commit a445fdfc589e2862f814d04f22b38a6f88eaa1b6
Author: Matthias Boehm <[email protected]>
AuthorDate: Fri Jun 4 16:02:52 2021 +0200

    [SYSTEMDS-2983] Extended gridSearch by k-fold cross validation
    
    This patch extends the existing gridSearch for hyper-parameter
    optimization by optional k-fold cross validation, where each fold is
    left out once for evaluation. If enabled, X and y are split into cvk
    folds, every hyper-parameter combination is trained and evaluated
    over the cvk combinations of folds, and the final model and loss are
    obtained via model averaging and the average loss, respectively.
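    
    For illustration, a minimal call with cross validation enabled (a
    sketch mirroring the new GridSearchLMCV.dml test below; X, y, and the
    lm/l2norm train/predict functions are assumed to be in scope):
    
      params = list("icpt", "reg", "tol", "maxi");
      paramValues = list(seq(0,1,2), 10^seq(0,-4), 10^seq(-6,-12), 10^seq(1,3));
      [B, opt] = gridSearch(X=X, y=y, train="lm", predict="l2norm",
        numB=ncol(X)+1, params=params, paramValues=paramValues,
        cv=TRUE, cvk=5);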
---
 scripts/builtin/gridSearch.dml                     | 63 ++++++++++++++++++----
 .../sysds/hops/ipa/InterProceduralAnalysis.java    |  3 ++
 .../functions/builtin/BuiltinGridSearchTest.java   | 14 ++++-
 .../scripts/functions/builtin/GridSearchLMCV.dml   | 48 +++++++++++++++++
 4 files changed, 116 insertions(+), 12 deletions(-)

diff --git a/scripts/builtin/gridSearch.dml b/scripts/builtin/gridSearch.dml
index 5a5bdba..1c0568d 100644
--- a/scripts/builtin/gridSearch.dml
+++ b/scripts/builtin/gridSearch.dml
@@ -36,6 +36,8 @@
 # predictArgs  List of arguments to pass to the 'predict' function, where 
 #              gridSearch appends the trained models at the end, if 
 #              not provided or an empty list, list(X, y) is used instead
+# cv           flag enabling k-fold cross validation, otherwise the training loss is used
+# cvk          if cv=TRUE, specifies the number of folds, otherwise ignored
 # verbose      flag for verbose debug output 
 
 #-------------------------------------------------------------------------------
 # B            the trained model with minimal loss (by the 'predict' function) 
@@ -45,7 +47,7 @@
 m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String predict,
     Integer numB=ncol(X), List[String] params, List[Unknown] paramValues,
     List[Unknown] trainArgs = list(), List[Unknown] predictArgs = list(),
-    Boolean verbose = TRUE) 
+    Boolean cv = FALSE, Integer cvk = 5, Boolean verbose = TRUE)
   return (Matrix[Double] B, Frame[Unknown] opt)
 {
   # Step 0) handling default arguments, which require access to passed data
@@ -53,6 +55,10 @@ m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String
     trainArgs = list(X=X, y=y, icpt=0, reg=-1, tol=-1, maxi=-1, verbose=FALSE);
   if( length(predictArgs) == 0 )
     predictArgs = list(X, y);
+  if( cv & cvk <= 1 ) {
+    print("gridSearch: called with cv=TRUE but cvk="+cvk+", set to default cvk=5.")
+    cvk = 5;
+  }
 
   # Step 1) preparation of parameters, lengths, and values in convenient form
   numParams = length(params);
@@ -83,19 +89,54 @@ m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String
   }
 
   # Step 3) training/scoring of parameter combinations
-  # TODO integrate cross validation
   Rbeta = matrix(0, nrow(HP), numB);
   Rloss = matrix(0, nrow(HP), 1);
 
-  parfor( i in 1:nrow(HP) ) {
-    # a) replace training arguments
-    ltrainArgs = trainArgs;
-    for( j in 1:numParams )
-      ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
-    # b) core training/scoring and write-back
-    lbeta = t(eval(train, ltrainArgs))
-    Rbeta[i,1:ncol(lbeta)] = lbeta;
-    Rloss[i,] = eval(predict, append(predictArgs,t(lbeta)));
+  # with cross-validation
+  if( cv ) {
+    # a) create folds
+    foldsX = list(); foldsY = list();
+    fs = ceil(nrow(X)/cvk);
+    for( k in 0:(cvk-1) ) {
+      foldsX = append(foldsX, X[(k*fs+1):min((k+1)*fs,nrow(X)),]);
+      foldsY = append(foldsY, y[(k*fs+1):min((k+1)*fs,nrow(y)),]);
+    }
+    parfor( i in 1:nrow(HP) ) {
+      # a) replace training arguments
+      ltrainArgs = trainArgs; 
+      lpredictArgs = predictArgs;
+      for( j in 1:numParams )
+        ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
+      # b) cross-validated training/scoring and write-back
+      cvbeta = matrix(0,1,numB);
+      cvloss = matrix(0,1,1);
+      for( k in 1:cvk ) {
+        [tmpX, testX] = remove(foldsX, k);
+        [tmpy, testy] = remove(foldsY, k);
+        ltrainArgs['X'] = rbind(tmpX);
+        ltrainArgs['y'] = rbind(tmpy);
+        lbeta = t(eval(train, ltrainArgs));
+        cvbeta[,1:ncol(lbeta)] = cvbeta[,1:ncol(lbeta)] + lbeta;
+        lpredictArgs[1] = as.matrix(testX);
+        lpredictArgs[2] = as.matrix(testy);
+        cvloss += eval(predict, append(lpredictArgs,t(lbeta)));
+      }
+      Rbeta[i,] = cvbeta / cvk; # model averaging
+      Rloss[i,] = cvloss / cvk;
+    }
+  }
+  # without cross-validation
+  else {
+    parfor( i in 1:nrow(HP) ) {
+      # a) replace training arguments
+      ltrainArgs = trainArgs;
+      for( j in 1:numParams )
+        ltrainArgs[as.scalar(params[j])] = as.scalar(HP[i,j]);
+      # b) core training/scoring and write-back
+      lbeta = t(eval(train, ltrainArgs))
+      Rbeta[i,1:ncol(lbeta)] = lbeta;
+      Rloss[i,] = eval(predict, append(predictArgs,t(lbeta)));
+    }
   }
 
   # Step 4) select best parameter combination
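
As a worked sketch of the fold indexing above (hypothetical sizes taken
from the new test: 400 rows, cvk=3): fs = ceil(400/3) = 134, so fold 1
covers rows 1-134, fold 2 rows 135-268, and fold 3 rows 269-400, with the
min(...) guard truncating the last fold.

  n = 400; cvk = 3;
  fs = ceil(n/cvk);  # 134
  for( k in 0:(cvk-1) )
    print("fold "+(k+1)+": rows "+(k*fs+1)+"-"+min((k+1)*fs,n));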
diff --git a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
index 9dcfd53..1518fe1 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
@@ -547,6 +547,9 @@ public class InterProceduralAnalysis
                for( int i=0; i<Math.min(inputOps.size(), funArgNames.length); i++ ) {
                        //create mapping between input hops and vars
                        DataIdentifier dat = fstmt.getInputParam(funArgNames[i]);
+                       if( dat == null )
+                               throw new HopsException("Failed IPA: function argument '"+funArgNames[i]+"' "
+                                       + "does not exist in function signature of "+fop.getFunctionKey()+".");
                        Hop input = inputOps.get(i);
                        
                        if( input.getDataType()==DataType.MATRIX )
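
The new guard in InterProceduralAnalysis turns a silent null lookup into an
explicit error when a call passes an argument name that is not part of the
function signature. A hypothetical trigger via a gridSearch-style eval call
(the 'lambda' key is made up and not a parameter of lm):

  trainArgs = list(X=X, y=y, lambda=0.1);
  B = eval("lm", trainArgs);  # fails IPA: argument 'lambda' not in signature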
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
index 34504c9..2623f18 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGridSearchTest.java
@@ -33,10 +33,11 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
        private final static String TEST_NAME1 = "GridSearchLM";
        private final static String TEST_NAME2 = "GridSearchMLogreg";
        private final static String TEST_NAME3 = "GridSearchLM2";
+       private final static String TEST_NAME4 = "GridSearchLMCV";
        private final static String TEST_DIR = "functions/builtin/";
        private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinGridSearchTest.class.getSimpleName() + "/";
        
-       private final static int rows = 300;
+       private final static int rows = 400;
        private final static int cols = 20;
        
        @Override
@@ -44,6 +45,7 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
                addTestConfiguration(TEST_NAME1,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
                addTestConfiguration(TEST_NAME2,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2,new String[]{"R"}));
                addTestConfiguration(TEST_NAME3,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3,new String[]{"R"}));
+               addTestConfiguration(TEST_NAME4,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4,new String[]{"R"}));
        }
        
        @Test
@@ -81,6 +83,16 @@ public class BuiltinGridSearchTest extends AutomatedTestBase
                runGridSearch(TEST_NAME3, ExecMode.HYBRID);
        }
        
+       @Test
+       public void testGridSearchLmCvCP() {
+               runGridSearch(TEST_NAME4, ExecMode.SINGLE_NODE);
+       }
+       
+       @Test
+       public void testGridSearchLmCvHybrid() {
+               runGridSearch(TEST_NAME4, ExecMode.HYBRID);
+       }
+       
        private void runGridSearch(String testname, ExecMode et)
        {
                ExecMode modeOld = setExecMode(et);
diff --git a/src/test/scripts/functions/builtin/GridSearchLMCV.dml b/src/test/scripts/functions/builtin/GridSearchLMCV.dml
new file mode 100644
index 0000000..2097818
--- /dev/null
+++ b/src/test/scripts/functions/builtin/GridSearchLMCV.dml
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+l2norm = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
+  return (Matrix[Double] loss)
+{
+  yhat = lmPredict(X=X, B=B, ytest=y)
+  loss = as.matrix(sum((y - yhat)^2));
+}
+
+X = read($1);
+y = read($2);
+
+N = 300;
+Xtrain = X[1:N,];
+ytrain = y[1:N,];
+Xtest = X[(N+1):nrow(X),];
+ytest = y[(N+1):nrow(y),];
+
+params = list("icpt","reg", "tol", "maxi");
+paramRanges = list(seq(0,1,2),10^seq(0,-4), 10^seq(-6,-12), 10^seq(1,3));
+[B1, opt] = gridSearch(X=Xtrain, y=ytrain, train="lm", predict="l2norm",
+  numB=ncol(X)+1, params=params, paramValues=paramRanges, cv=TRUE, cvk=3);
+B2 = lm(X=Xtrain, y=ytrain, verbose=FALSE);
+
+l1 = l2norm(Xtest, ytest, B1);
+l2 = l2norm(Xtest, ytest, B2);
+R = as.scalar(l1 < l2);
+
+write(R, $3)
