[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-15 Thread mengxr
Github user mengxr commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40452444
  
Jenkins, retest this please.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-15 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40452733
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-15 Thread dbtsai
Github user dbtsai closed the pull request at:

https://github.com/apache/spark/pull/353




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11605070
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV, axpy}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(private var gradient: Gradient, private var updater: Updater)
+  extends Optimizer with Logging {
+
+  private var numCorrections = 10
+  private var convergenceTol = 1E-4
+  private var maxNumIterations = 100
+  private var regParam = 0.0
+  private var miniBatchFraction = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of numCorrections less than 3 are not recommended; large values
+   * of numCorrections will result in excessive computing time.
+   * 3 < numCorrections < 10 is recommended.
+   * Restriction: numCorrections > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+    assert(corrections > 0)
+    this.numCorrections = corrections
+    this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+    this.miniBatchFraction = fraction
+    this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more iterations.
+   */
+  def setConvergenceTol(tolerance: Double): this.type = {
+    this.convergenceTol = tolerance
+    this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+    this.maxNumIterations = iters
+    this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+    this.regParam = regParam
+    this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+    this.gradient = gradient
+    this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a given direction.
+   * The updater is responsible for performing the update from the regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+    this.updater = updater
+    this
+  }
+
+  override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
+    val (weights, _) = LBFGS.runMiniBatchLBFGS(
+      data,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFraction,
+      initialWeights)
+    weights
+  }
+
+}
+
+/**
+ * Top-level method to run LBFGS.
+ */
+object LBFGS extends Logging {
+  /**
+   * Run Limited-memory BFGS (L-BFGS) in parallel using mini batches.
+   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
+   * in order to 

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40414083
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14117/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40429267
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
Github user dbtsai closed the pull request at:

https://github.com/apache/spark/pull/353




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
Github user dbtsai commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40434555
  
Jenkins, retest this please.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
GitHub user dbtsai reopened a pull request:

https://github.com/apache/spark/pull/353

[SPARK-1157][MLlib] L-BFGS Optimizer based on Breeze's implementation.

This PR uses Breeze's L-BFGS implementation, and the Breeze dependency has 
already been introduced by Xiangrui's sparse input format work in SPARK-1212. 
Nice work, @mengxr!

When used with a regularized updater, we need to compute the regVal and 
regGradient (the gradient of the regularized part of the cost function), and 
with the current updater design, we can compute those two values as follows.

Let's review how the updater works when returning newWeights given the input 
parameters.

w' = w - thisIterStepSize * (gradient + regGradient(w))

Note that regGradient is a function of w! If we set gradient = 0 and 
thisIterStepSize = 1, then

regGradient(w) = w - w'

As a result, regVal can be computed by

val regVal = updater.compute(
  weights,
  new DoubleMatrix(initialWeights.length, 1), 0, 1, regParam)._2

and regGradient can be obtained by

val regGradient = weights.sub(
  updater.compute(weights, new DoubleMatrix(initialWeights.length, 1), 1, 1, regParam)._1)
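
For illustration, here is a minimal Vector-based sketch of the same trick. It 
assumes MLlib's Updater.compute(weightsOld, gradient, stepSize, iter, regParam): 
(Vector, Double) signature; the regTerms helper is hypothetical and not part of 
this PR:

    import org.apache.spark.mllib.linalg.{Vector, Vectors}
    import org.apache.spark.mllib.optimization.Updater

    def regTerms(updater: Updater, weights: Vector, regParam: Double): (Double, Vector) = {
      val zeroGradient = Vectors.dense(Array.fill(weights.size)(0.0))
      // gradient = 0 and stepSize = 0 leave the weights unchanged, so the second
      // element of the returned pair is regVal evaluated at the current weights.
      val regVal = updater.compute(weights, zeroGradient, 0, 1, regParam)._2
      // gradient = 0 and thisIterStepSize = 1 give w' = w - regGradient(w),
      // hence regGradient(w) = w - w'.
      val wPrime = updater.compute(weights, zeroGradient, 1, 1, regParam)._1
      val regGradient = Vectors.dense(
        weights.toArray.zip(wPrime.toArray).map { case (w, wp) => w - wp })
      (regVal, regGradient)
    }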

The PR includes tests which compare the results with SGD, with and without 
regularization.

We did a comparison between LBFGS and SGD, and we often saw 10x fewer steps 
with LBFGS, while the cost per step is the same (just computing the gradient).

The following is a paper by Prof. Ng's group at Stanford comparing different 
optimizers, including LBFGS and SGD. They use them in the context of deep 
learning, but it is worth reading as a reference.
http://cs.stanford.edu/~jngiam/papers/LeNgiamCoatesLahiriProchnowNg2011.pdf
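
As a rough usage sketch of the optimizer in this PR (hypothetical; based only 
on the setters shown in the diff, with train as an illustrative helper, not 
part of the PR):

    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}
    import org.apache.spark.rdd.RDD

    // Configure the optimizer with a logistic loss gradient and L2 regularization,
    // then run it on (label, features) pairs starting from the given weights.
    def train(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
      new LBFGS(new LogisticGradient(), new SquaredL2Updater())
        .setNumCorrections(10)
        .setConvergenceTol(1e-4)
        .setMaxNumIterations(100)
        .setRegParam(0.1)
        .optimize(data, initialWeights)
    }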

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/dbtsai/spark dbtsai-LBFGS

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/353.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #353


commit 984b18e21396eae84656e15da3539ff3b5f3bf4a
Author: DB Tsai dbt...@alpinenow.com
Date:   2014-04-05T00:06:50Z

L-BFGS Optimizer based on Breeze's implementation. Also fixed indentation 
issue in GradientDescent optimizer.






[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
Github user dbtsai commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40434626
  
Jenkins, retest this please.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread dbtsai
Github user dbtsai commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40434691
  
Timeout for the latest Jenkins run. It seems that CI is not stable right now.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40434890
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40434895
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40439479
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14126/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-14 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40439478
  
Merged build finished. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-13 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11571545
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV, axpy}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(private var gradient: Gradient, private var updater: Updater)
+  extends Optimizer with Logging {
+
+  private var numCorrections = 10
+  private var convergenceTol = 1E-4
+  private var maxNumIterations = 100
+  private var regParam = 0.0
+  private var miniBatchFraction = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of numCorrections less than 3 are not recommended; large values
+   * of numCorrections will result in excessive computing time.
+   * 3 < numCorrections < 10 is recommended.
+   * Restriction: numCorrections > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+    assert(corrections > 0)
+    this.numCorrections = corrections
+    this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+    this.miniBatchFraction = fraction
+    this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more iterations.
+   */
+  def setConvergenceTol(tolerance: Double): this.type = {
+    this.convergenceTol = tolerance
+    this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+    this.maxNumIterations = iters
+    this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+    this.regParam = regParam
+    this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+    this.gradient = gradient
+    this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a given direction.
+   * The updater is responsible for performing the update from the regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+    this.updater = updater
+    this
+  }
+
+  override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
+    val (weights, _) = LBFGS.runMiniBatchLBFGS(
+      data,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFraction,
+      initialWeights)
+    weights
+  }
+
+}
+
+/**
+ * Top-level method to run LBFGS.
+ */
+object LBFGS extends Logging {
+  /**
+   * Run Limited-memory BFGS (L-BFGS) in parallel using mini batches.
+   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
+   * in order to 

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-12 Thread mengxr
Github user mengxr commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40281879
  
Jenkins, retest this please.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-12 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40281918
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-12 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40281922
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-12 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40283049
  
Merged build finished. All automated tests passed.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-12 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40283050
  
All automated tests passed.
Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14077/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40174782
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40174791
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40174857
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14051/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40174856
  
Merged build finished. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40177717
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40177723
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40177788
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14052/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40177786
  
Merged build finished. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11528081
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV, axpy}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(private var gradient: Gradient, private var updater: Updater)
+  extends Optimizer with Logging {
+
+  private var numCorrections = 10
+  private var convergenceTol = 1E-4
+  private var maxNumIterations = 100
+  private var regParam = 0.0
+  private var miniBatchFraction = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of numCorrections less than 3 are not recommended; large values
+   * of numCorrections will result in excessive computing time.
+   * 3 < numCorrections < 10 is recommended.
+   * Restriction: numCorrections > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+    assert(corrections > 0)
+    this.numCorrections = corrections
+    this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+    this.miniBatchFraction = fraction
+    this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more iterations.
+   */
+  def setConvergenceTol(tolerance: Double): this.type = {
+    this.convergenceTol = tolerance
+    this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+    this.maxNumIterations = iters
+    this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+    this.regParam = regParam
+    this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+    this.gradient = gradient
+    this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a given direction.
+   * The updater is responsible for performing the update from the regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+    this.updater = updater
+    this
+  }
+
+  override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
+    val (weights, _) = LBFGS.runMiniBatchLBFGS(
+      data,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFraction,
+      initialWeights)
+    weights
+  }
+
+}
+
+/**
+ * Top-level method to run LBFGS.
+ */
+object LBFGS extends Logging {
+  /**
+   * Run Limited-memory BFGS (L-BFGS) in parallel using mini batches.
+   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
+   * in order to 

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11528087
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with LocalSparkContext with ShouldMatchers {
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  lazy val dataRDD = sc.parallelize(data, 2).cache()
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("LBFGS loss should be decreasing and match the result of Gradient Descent.") {
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+    val convergenceTol = 1e-12
+    val maxNumIterations = 10
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      simpleUpdater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing with the L-BFGS optimizer.
+    // (SGD doesn't guarantee this, and the loss will fluctuate during the optimization process.)
+    assert((loss, loss.tail).zipped.forall(_ > _), "loss should be monotonically decreasing.")
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      simpleUpdater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // GD converges way slower than L-BFGS. To achieve 1% difference,
+    // it requires 90 iterations in GD. No matter how hard we increase
+    // the number of iterations in GD here, the lossGD will always be
+    // larger than lossLBFGS. This is based on observation, not theoretically guaranteed.
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.02,
+      "LBFGS should match GD result within 2% difference.")
+  }
+
+  test("LBFGS and Gradient Descent with L2 regularization should get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+    val convergenceTol = 1e-12
+    val maxNumIterations = 10
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    val numGDIterations = 50
+    val stepSize = 

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11528239
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with LocalSparkContext with ShouldMatchers {
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  lazy val dataRDD = sc.parallelize(data, 2).cache()
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("LBFGS loss should be decreasing and match the result of Gradient Descent.") {
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+    val convergenceTol = 1e-12
+    val maxNumIterations = 10
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      simpleUpdater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing with L-BFGS optimizer.
--- End diff --

line too long?




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40250806
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40250825
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40250899
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14062/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40250897
  
Merged build finished. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40252639
  
 Merged build triggered. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40252651
  
Merged build started. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40252713
  
Merged build finished. 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-11 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40252714
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/14064/




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11457830
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
--- End diff --

Scala imports `Array` by default.
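
For example (a trivial illustration, not from the PR):

    val xs = Array(1, 2, 3)  // compiles with no import; scala.Array is in scope by default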




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11457867
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
--- End diff --

Provide a reference for L-BFGS. Either the wikipedia page or the original 
paper should work.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458037
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
--- End diff --

Move `{` to the line above. Maybe `extends ...` fits in the `class ...` 
line.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11457976
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
--- End diff --

mark `gradient` and `updater` private 




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458103
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
--- End diff --

`conv` is not a common acronym for `convergence`; better to use the full 
name. However, `tol` is a common acronym for `tolerance`.




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458125
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
--- End diff --

You don't need to declare the type info for primitive types.
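
For example (illustrative, not from the PR):

    private var numCorrections = 10  // inferred as Int; no type annotation needed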




[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458182
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
--- End diff --

`m` is not defined.
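
One way to fix the doc is to tie `m` to the parameter name (a sketch, not the 
final wording):

```scala
/**
 * Set the number of corrections used in the LBFGS update. Default 10.
 * numCorrections is the "m" of the L-BFGS literature: the number of
 * previous gradient/step pairs retained. Values less than 3 are not
 * recommended; large values will result in excessive computing time.
 * 3 < numCorrections < 10 is recommended. Restriction: numCorrections > 0
 */
```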


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458258
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
--- End diff --

change `mcsrch` to `line search`. 
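
e.g. a sketch of the revised doc line:

```scala
/**
 * Set the tolerance to control the accuracy of the line search. Default 0.9.
 */
```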


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458457
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
--- End diff --

Is `lineSearchTolerance` really used somewhere? Breeze uses fixed constants 
for line search.
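
For reference, a minimal sketch of how the Breeze optimizer is constructed 
(assuming Breeze's `LBFGS(maxIter, m, tolerance)` constructor; this is not code 
from the PR):

```scala
import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{LBFGS => BreezeLBFGS}

// Only maxIter, m (numCorrections) and the convergence tolerance are
// configurable; the line-search constants are fixed inside Breeze, so a
// lineSearchTolerance setting here would have nothing to flow into.
val lbfgs = new BreezeLBFGS[BDV[Double]](maxNumIterations, numCorrections, convTolerance)
```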


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458695
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
+this.lineSearchTolerance = tolerance
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvTolerance(tolerance: Int): this.type = {
+this.convTolerance = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a 
given direction.
+   * The updater is responsible to perform the update from the 
regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+this.updater = updater
+this
+  }
+
+  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): 
Vector = {
--- End diff --

Append `override` to `def` so we know that it will 
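
A sketch of the suggested signature (assuming `optimize` is declared in the 
`Optimizer` trait that the class extends, as the class header indicates):

```scala
// `override` makes the compiler verify this matches Optimizer.optimize.
override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
  val (weights, _) = LBFGS.runMiniBatchLBFGS(data, gradient, updater, numCorrections,
    lineSearchTolerance, convTolerance, maxNumIterations, regParam, miniBatchFraction,
    initialWeights)
  weights
}
```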

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11458730
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
+this.lineSearchTolerance = tolerance
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvTolerance(tolerance: Int): this.type = {
+this.convTolerance = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a 
given direction.
+   * The updater is responsible to perform the update from the 
regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+this.updater = updater
+this
+  }
+
+  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): 
Vector = {
+val (weights, _) = LBFGS.runMiniBatchLBFGS(
+  data,
+  

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459107
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
+this.lineSearchTolerance = tolerance
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvTolerance(tolerance: Int): this.type = {
+this.convTolerance = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a 
given direction.
+   * The updater is responsible to perform the update from the 
regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+this.updater = updater
+this
+  }
+
+  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): 
Vector = {
+val (weights, _) = LBFGS.runMiniBatchLBFGS(
+  data,
+  

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459284
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
+this.lineSearchTolerance = tolerance
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvTolerance(tolerance: Int): this.type = {
+this.convTolerance = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a 
given direction.
+   * The updater is responsible to perform the update from the 
regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+this.updater = updater
+this
+  }
+
+  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): 
Vector = {
+val (weights, _) = LBFGS.runMiniBatchLBFGS(
+  data,
+  

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459572
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections > 0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
+this.lineSearchTolerance = tolerance
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvTolerance(tolerance: Int): this.type = {
+this.convTolerance = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a 
given direction.
+   * The updater is responsible to perform the update from the 
regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+this.updater = updater
+this
+  }
+
+  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): 
Vector = {
+val (weights, _) = LBFGS.runMiniBatchLBFGS(
+  data,
+  

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459633
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
--- End diff --

Use `LocalSparkContext` to avoid dealing with `sc` setup directly. There is 
one in MLlib.
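
A sketch of the suggested setup (assuming the `org.apache.spark.mllib.util.LocalSparkContext` 
test trait, which creates and stops `sc` for the suite):

```scala
import org.apache.spark.mllib.util.LocalSparkContext

class LBFGSSuite extends FunSuite with LocalSparkContext with ShouldMatchers {
  // sc is provided by LocalSparkContext; no beforeAll/afterAll bookkeeping.
  // `data` is the same test fixture already built in this suite.
  lazy val dataRDD: RDD[(Double, Vector)] = sc.parallelize(data, 2).cache()
}
```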


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459647
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
--- End diff --

space after `:`
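
i.e. (one-line sketch):

```scala
var dataRDD: RDD[(Double, Vector)] = _
```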


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459694
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
--- End diff --

an extra


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459772
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+sc = new SparkContext("local", "test")
+dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+sc.stop()
+System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient 
Descent.") {
--- End diff --

remove "Assert" so it reads `test("LBFGS loss is ...")`
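
e.g. (a sketch; final wording up to the author):

```scala
test("LBFGS loss is decreasing and matches the result of Gradient Descent.") { /* ... */ }
```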


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11459992
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+sc = new SparkContext("local", "test")
+dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+sc.stop()
+System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient 
Descent.") {
+val updater = new SimpleUpdater()
+val regParam = 0
+
+val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: 
_*)
+
+val (_, loss) = LBFGS.runMiniBatchLBFGS(
+  dataRDD,
+  gradient,
+  updater,
+  numCorrections,
+  lineSearchTolerance,
+  convTolerance,
+  maxNumIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+val lossDiff = loss.init.zip(loss.tail).map {
+  case (lhs, rhs) => lhs - rhs
+}
+assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
--- End diff --

Could you put a comment about the test here? Why is 0.8 a reasonable bound?
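
For instance (a sketch; the rationale in the comment is an assumption about the 
optimizer's behavior, not taken from the PR):

```scala
// L-BFGS with a Wolfe line search should decrease the loss at almost every
// iteration; 0.8 leaves slack for the few steps where the recorded loss can
// tick up, e.g. early iterations before curvature information builds up.
assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
```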


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460048
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+sc = new SparkContext("local", "test")
+dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+sc.stop()
+System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient 
Descent.") {
+val updater = new SimpleUpdater()
+val regParam = 0
+
+val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: 
_*)
+
+val (_, loss) = LBFGS.runMiniBatchLBFGS(
+  dataRDD,
+  gradient,
+  updater,
+  numCorrections,
+  lineSearchTolerance,
+  convTolerance,
+  maxNumIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+val lossDiff = loss.init.zip(loss.tail).map {
+  case (lhs, rhs) => lhs - rhs
+}
+assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+val stepSize = 1.0
+// Well, GD converges slower, so it requires more iterations!
+val numGDIterations = 50
+val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+  dataRDD,
+  gradient,
+  updater,
+  stepSize,
+  numGDIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+  "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get 
the same result.") {
--- End diff --

ditto. Remove "Assert that".


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460030
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+sc = new SparkContext("local", "test")
+dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+sc.stop()
+System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient 
Descent.") {
+val updater = new SimpleUpdater()
+val regParam = 0
+
+val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: 
_*)
+
+val (_, loss) = LBFGS.runMiniBatchLBFGS(
+  dataRDD,
+  gradient,
+  updater,
+  numCorrections,
+  lineSearchTolerance,
+  convTolerance,
+  maxNumIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+val lossDiff = loss.init.zip(loss.tail).map {
+  case (lhs, rhs) => lhs - rhs
+}
+assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+val stepSize = 1.0
+// Well, GD converges slower, so it requires more iterations!
+val numGDIterations = 50
+val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+  dataRDD,
+  gradient,
+  updater,
+  stepSize,
+  numGDIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+  "LBFGS should match GD result within 5% error.")
--- End diff --

Why?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460273
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with 
ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD:RDD[(Double, Vector)] = _
+
+  val nPoints = 1
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add a extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+sc = new SparkContext("local", "test")
+dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+sc.stop()
+System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient 
Descent.") {
+val updater = new SimpleUpdater()
+val regParam = 0
+
+val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: 
_*)
+
+val (_, loss) = LBFGS.runMiniBatchLBFGS(
+  dataRDD,
+  gradient,
+  updater,
+  numCorrections,
+  lineSearchTolerance,
+  convTolerance,
+  maxNumIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+val lossDiff = loss.init.zip(loss.tail).map {
+  case (lhs, rhs) => lhs - rhs
+}
+assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+val stepSize = 1.0
+// Well, GD converges slower, so it requires more iterations!
+val numGDIterations = 50
+val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+  dataRDD,
+  gradient,
+  updater,
+  stepSize,
+  numGDIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+  "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get 
the same result.") {
+val regParam = 0.2
+
+// Prepare another non-zero weights to compare the loss in the first 
iteration.
+val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+  dataRDD,
+  gradient,
+  squaredL2Updater,
+  numCorrections,
+  lineSearchTolerance,
+  convTolerance,
+  maxNumIterations,
+  regParam,
+  miniBatchFrac,
+  initialWeightsWithIntercept)
+
+// With regularization, GD converges faster now!
+// So we only need 20 iterations to get the 
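
A note on the compareDouble helper quoted above: it checks relative error, with
the 1e-15 term guarding against division by zero when y is 0. A tiny
self-contained sketch of how it behaves at the default 1e-3 tolerance:

    def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
      math.abs(x - y) / (math.abs(y) + 1e-15) < tol
    }

    assert(compareDouble(1.0001, 1.0))   // 0.01% relative error: passes
    assert(!compareDouble(1.01, 1.0))    // 1% relative error: fails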

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460320
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460344
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460419
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460436
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460449
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11460767
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
--- End diff --

@mengxr
I know. I pretty much follow the existing coding style in GradientDescent.scala.
Should I also change the ones in the other places?
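
For context, the style in question is the fluent private-var-plus-setter
pattern; a minimal sketch distilled from the quoted diff (the single field
shown is illustrative):

    // Fields stay private; each setter mutates one field and returns `this`
    // so configuration calls can be chained before optimize() is invoked.
    class LBFGS(private var gradient: Gradient, private var updater: Updater)
      extends Optimizer with Logging {

      private var numCorrections = 10

      def setNumCorrections(corrections: Int): this.type = {
        assert(corrections > 0)
        this.numCorrections = corrections
        this
      }
    }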


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11461398
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
+  private var lineSearchTolerance: Double = 0.9
+  private var convTolerance: Double = 1E-4
+  private var maxNumIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of m less than 3 are not recommended; large values of m
+   * will result in excessive computing time. 3 < m < 10 is recommended.
+   * Restriction: m > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+    assert(corrections > 0)
+    this.numCorrections = corrections
+    this
+  }
+
+  /**
+   * Set the tolerance to control the accuracy of the line search in 
mcsrch step. Default 0.9.
+   * If the function and gradient evaluations are inexpensive with respect 
to the cost of
+   * the iteration (which is sometimes the case when solving very large 
problems) it may
+   * be advantageous to set to a small value. A typical small value is 0.1.
+   * Restriction: should be greater than 1e-4.
+   */
+  def setLineSearchTolerance(tolerance: Double): this.type = {
--- End diff --

Good catch! It's used in the RISO implementation. I'll just remove them. Thanks.
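
For reference, the Breeze optimizer that backs this class takes only an
iteration cap, the number of corrections, and a convergence tolerance, so the
RISO-style line-search tolerance has nothing to be forwarded to; a minimal
sketch (argument values illustrative):

    import breeze.linalg.DenseVector
    import breeze.optimize.{LBFGS => BreezeLBFGS}

    // Arguments: maxIter, number of corrections m, convergence tolerance.
    // Breeze drives the line search internally.
    val optimizer = new BreezeLBFGS[DenseVector[Double]](100, 10, 1e-4)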


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11463225
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.Array
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(var gradient: Gradient, var updater: Updater)
+  extends Optimizer with Logging
+{
+  private var numCorrections: Int = 10
--- End diff --

No, it is not necessary to do it in this PR.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11463764
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
--- End diff --

This 0.8 bound is copied from GradientDescentSuite, and L-BFGS should at
least have the same performance.
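
Concretely, the bound asserts that at least 80% of the iteration-to-iteration
loss deltas are improvements; the same check in isolation, on toy numbers:

    // A toy loss history; one step (0.65 -> 0.66) regresses slightly.
    val loss = Seq(1.00, 0.80, 0.65, 0.66, 0.50, 0.41, 0.35)
    // Pairwise differences loss(i) - loss(i + 1); positive means improvement.
    val lossDiff = loss.init.zip(loss.tail).map { case (prev, next) => prev - next }
    // 5 of the 6 steps improve, so the 0.8 bound holds.
    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)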



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11464280
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread dbtsai
Github user dbtsai commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11464736
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers {
+  @transient private var sc: SparkContext = _
+  var dataRDD: RDD[(Double, Vector)] = _
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  val lineSearchTolerance = 0.9
+  var convTolerance = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  override def beforeAll() {
+    sc = new SparkContext("local", "test")
+    dataRDD = sc.parallelize(data, 2).cache()
+  }
+
+  override def afterAll() {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("Assert LBFGS loss is decreasing and matches the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.05,
+      "LBFGS should match GD result within 5% error.")
+  }
+
+  test("Assert that LBFGS and Gradient Descent with L2 regularization get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      lineSearchTolerance,
+      convTolerance,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // With regularization, GD converges faster now!
+    // So we only need 20 iterations to get the

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40032548
  
Merged build started. 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40032538
  
 Merged build triggered. 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/353#issuecomment-40035145
  

Refer to this link for build results: 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13972/


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11468969
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with LocalSparkContext with ShouldMatchers {
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  var convergenceTol = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  lazy val dataRDD = sc.parallelize(data, 2).cache()
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("LBFGS loss should be decreasing and match the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    // This 0.8 bound is copied from GradientDescentSuite, and L-BFGS should
+    // at least have the same performance. It's based on observation, not theoretically guaranteed.
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // GD converges way slower than L-BFGS. To achieve a 1% difference,
+    // it requires 90 iterations in GD. No matter how much we increase
+    // the number of iterations in GD here, lossGD will always be
+    // larger than lossLBFGS. This is based on observation, not theoretically guaranteed.
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.02,
+      "LBFGS should match GD result within 2% difference.")
+  }
+
+  test("LBFGS and Gradient Descent with L2 regularization should get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
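
The cross-check at the end of the quoted test reduces to a relative-difference
comparison of the two final losses; in isolation (toy values):

    val lossLBFGS = 0.380
    val lossGD = 0.385
    // Final losses must agree within 2% relative difference.
    assert(math.abs((lossGD - lossLBFGS) / lossLBFGS) < 0.02,
      "LBFGS should match GD result within 2% difference.")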

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11468996
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector => BDV, axpy}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
+class LBFGS(private var gradient: Gradient, private var updater: Updater)
+  extends Optimizer with Logging {
+
+  private var numCorrections = 10
+  private var convergenceTol = 1E-4
+  private var maxNumIterations = 100
+  private var regParam = 0.0
+  private var miniBatchFraction = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of numCorrections less than 3 are not recommended; large values
+   * of numCorrections will result in excessive computing time.
+   * 3 < numCorrections < 10 is recommended.
+   * Restriction: numCorrections > 0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+    assert(corrections > 0)
+    this.numCorrections = corrections
+    this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+    this.miniBatchFraction = fraction
+    this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * A smaller value will lead to higher accuracy at the cost of more iterations.
+   */
+  def setConvergenceTol(tolerance: Int): this.type = {
+    this.convergenceTol = tolerance
+    this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+    this.maxNumIterations = iters
+    this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+    this.regParam = regParam
+    this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+    this.gradient = gradient
+    this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a given direction.
+   * The updater is responsible to perform the update from the regularization term as well,
+   * and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+    this.updater = updater
+    this
+  }
+
+  override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
+    val (weights, _) = LBFGS.runMiniBatchLBFGS(
+      data,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFraction,
+      initialWeights)
+    weights
+  }
+
+}
+
+/**
+ * Top-level method to run LBFGS.
+ */
+object LBFGS extends Logging {
+  /**
+   * Run Limited-memory BFGS (L-BFGS) in parallel using mini batches.
+   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
+   * in order to 
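
Putting the quoted setters together, a caller would configure and run the
optimizer roughly as below (a sketch; `data: RDD[(Double, Vector)]` and
`initialWeights: Vector` are assumed to be in scope):

    // Each setter returns this.type, so the whole configuration chains.
    val weights = new LBFGS(new LogisticGradient(), new SquaredL2Updater())
      .setNumCorrections(10)
      .setMaxNumIterations(50)
      .setRegParam(0.1)
      .setMiniBatchFraction(1.0)
      .optimize(data, initialWeights)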

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11468997
  
--- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala ---
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import scala.collection.mutable.ArrayBuffer
+
+import breeze.linalg.{DenseVector = BDV, axpy}
+import breeze.optimize.{CachedDiffFunction, DiffFunction}
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+
+/**
+ * Class used to solve an optimization problem using Limited-memory BFGS.
+ * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every 
iteration.
+ */
+class LBFGS(private var gradient: Gradient, private var updater: Updater)
+  extends Optimizer with Logging {
+
+  private var numCorrections = 10
+  private var convergenceTol = 1E-4
+  private var maxNumIterations = 100
+  private var regParam = 0.0
+  private var miniBatchFraction = 1.0
+
+  /**
+   * Set the number of corrections used in the LBFGS update. Default 10.
+   * Values of numCorrections less than 3 are not recommended; large values
+   * of numCorrections will result in excessive computing time.
+   * 3  numCorrections  10 is recommended.
+   * Restriction: numCorrections  0
+   */
+  def setNumCorrections(corrections: Int): this.type = {
+assert(corrections  0)
+this.numCorrections = corrections
+this
+  }
+
+  /**
+   * Set fraction of data to be used for each L-BFGS iteration. Default 
1.0.
+   */
+  def setMiniBatchFraction(fraction: Double): this.type = {
+this.miniBatchFraction = fraction
+this
+  }
+
+  /**
+   * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
+   * Smaller value will lead to higher accuracy with the cost of more 
iterations.
+   */
+  def setConvergenceTol(tolerance: Int): this.type = {
+this.convergenceTol = tolerance
+this
+  }
+
+  /**
+   * Set the maximal number of iterations for L-BFGS. Default 100.
+   */
+  def setMaxNumIterations(iters: Int): this.type = {
+this.maxNumIterations = iters
+this
+  }
+
+  /**
+   * Set the regularization parameter. Default 0.0.
+   */
+  def setRegParam(regParam: Double): this.type = {
+this.regParam = regParam
+this
+  }
+
+  /**
+   * Set the gradient function (of the loss function of one single data 
example)
+   * to be used for L-BFGS.
+   */
+  def setGradient(gradient: Gradient): this.type = {
+this.gradient = gradient
+this
+  }
+
+  /**
+   * Set the updater function to actually perform a gradient step in a given direction.
+   * The updater is responsible for performing the update from the regularization term
+   * as well, and therefore determines what kind of regularization is used, if any.
+   */
+  def setUpdater(updater: Updater): this.type = {
+    this.updater = updater
+    this
+  }
+
+  override def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Vector = {
+    val (weights, _) = LBFGS.runMiniBatchLBFGS(
+      data,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFraction,
+      initialWeights)
+    weights
+  }
+
+}
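
For reviewers who want to try the API, a minimal usage sketch of the builder-style setters above (not part of the diff; `trainingData: RDD[(Double, Vector)]` and `numFeatures` are assumed placeholders):

    // Sketch only: configure the optimizer and run it on a labeled RDD.
    // trainingData and numFeatures are hypothetical and assumed to exist.
    val lbfgs = new LBFGS(new LogisticGradient(), new SquaredL2Updater())
      .setNumCorrections(10)
      .setConvergenceTol(1e-4)
      .setMaxNumIterations(100)
      .setRegParam(0.1)
    val initialWeights = Vectors.dense(new Array[Double](numFeatures))
    val optimizedWeights = lbfgs.optimize(trainingData, initialWeights)
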
+
+/**
+ * Top-level method to run LBFGS.
+ */
+object LBFGS extends Logging {
+  /**
+   * Run Limited-memory BFGS (L-BFGS) in parallel using mini batches.
+   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
+   * in order to compute a gradient estimate.
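
For context: per the PR title, the optimizer delegates the quasi-Newton updates to Breeze's L-BFGS. A simplified, non-distributed sketch of that pattern, with a toy quadratic standing in for the sampled mini-batch loss (`BreezeLBFGS` below is `breeze.optimize.LBFGS`; this is not the PR's actual cost function):

    import breeze.linalg.DenseVector
    import breeze.optimize.{DiffFunction, LBFGS => BreezeLBFGS}

    // Toy objective: f(w) = ||w - 3||^2, whose gradient is 2 * (w - 3).
    val f = new DiffFunction[DenseVector[Double]] {
      def calculate(w: DenseVector[Double]): (Double, DenseVector[Double]) = {
        val diff = w - DenseVector.fill(w.length)(3.0)
        (diff.dot(diff), diff * 2.0)
      }
    }
    // Arguments: maxIter = 100, m (history size) = 10, tolerance = 1e-4.
    val solver = new BreezeLBFGS[DenseVector[Double]](100, 10, 1e-4)
    val wOpt = solver.minimize(f, DenseVector.zeros[Double](5))  // converges to all 3.0
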

[GitHub] spark pull request: [SPARK-1157][MLlib] L-BFGS Optimizer based on ...

2014-04-09 Thread mengxr
Github user mengxr commented on a diff in the pull request:

https://github.com/apache/spark/pull/353#discussion_r11469009
  
--- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala ---
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.optimization
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class LBFGSSuite extends FunSuite with BeforeAndAfterAll with LocalSparkContext with ShouldMatchers {
+
+  val nPoints = 10000
+  val A = 2.0
+  val B = -1.5
+
+  val initialB = -1.0
+  val initialWeights = Array(initialB)
+
+  val gradient = new LogisticGradient()
+  val numCorrections = 10
+  var convergenceTol = 1e-12
+  var maxNumIterations = 10
+  val miniBatchFrac = 1.0
+
+  val simpleUpdater = new SimpleUpdater()
+  val squaredL2Updater = new SquaredL2Updater()
+
+  // Add an extra variable consisting of all 1.0's for the intercept.
+  val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42)
+  val data = testData.map { case LabeledPoint(label, features) =>
+    label -> Vectors.dense(1.0, features.toArray: _*)
+  }
+
+  lazy val dataRDD = sc.parallelize(data, 2).cache()
+
+  def compareDouble(x: Double, y: Double, tol: Double = 1E-3): Boolean = {
+    // Relative error, with 1e-15 in the denominator to guard against y == 0.
+    math.abs(x - y) / (math.abs(y) + 1e-15) < tol
+  }
+
+  test("LBFGS loss should be decreasing and match the result of Gradient Descent.") {
+    val updater = new SimpleUpdater()
+    val regParam = 0
+
+    val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*)
+
+    val (_, loss) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    assert(loss.last - loss.head < 0, "loss isn't decreasing.")
+
+    val lossDiff = loss.init.zip(loss.tail).map {
+      case (lhs, rhs) => lhs - rhs
+    }
+    // This 0.8 bound is copied from GradientDescentSuite, and L-BFGS should
+    // at least have the same performance. It's based on observation, not
+    // theoretically guaranteed.
+    assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8)
+
+    val stepSize = 1.0
+    // Well, GD converges slower, so it requires more iterations!
+    val numGDIterations = 50
+    val (_, lossGD) = GradientDescent.runMiniBatchSGD(
+      dataRDD,
+      gradient,
+      updater,
+      stepSize,
+      numGDIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)
+
+    // GD converges way slower than L-BFGS. To achieve 1% difference,
+    // GD requires 90 iterations. No matter how much we increase
+    // the number of iterations in GD here, lossGD will always be
+    // larger than lossLBFGS. This is based on observation, not
+    // theoretically guaranteed.
+    assert(Math.abs((lossGD.last - loss.last) / loss.last) < 0.02,
+      "LBFGS should match GD result within 2% difference.")
+  }
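
To make the 0.8 bound above concrete, a standalone illustration with made-up loss values (not suite data):

    // Made-up losses: 4 of the 5 successive differences are positive (loss decreased).
    val loss = Seq(1.00, 0.80, 0.65, 0.66, 0.55, 0.50)
    val lossDiff = loss.init.zip(loss.tail).map { case (prev, next) => prev - next }
    val decreasingFraction = lossDiff.count(_ > 0).toDouble / lossDiff.size
    assert(decreasingFraction >= 0.8)  // here exactly 4/5 = 0.8
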
+
+  test("LBFGS and Gradient Descent with L2 regularization should get the same result.") {
+    val regParam = 0.2
+
+    // Prepare another set of non-zero weights to compare the loss in the first iteration.
+    val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
+
+    val (weightLBFGS, lossLBFGS) = LBFGS.runMiniBatchLBFGS(
+      dataRDD,
+      gradient,
+      squaredL2Updater,
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      miniBatchFrac,
+      initialWeightsWithIntercept)