[GitHub] spark pull request #14834: [SPARK-17163][ML][WIP] Unified LogisticRegression...

dbtsai Thu, 08 Sep 2016 21:50:24 -0700

Github user dbtsai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14834#discussion_r78128581
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 ---
    @@ -460,33 +564,74 @@ class LogisticRegression @Since("1.2.0") (
                as a result, no scaling is needed.
              */
             val rawCoefficients = state.x.toArray.clone()
    -        var i = 0
    -        while (i < numFeatures) {
    -          rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / 
featuresStd(i) else 0.0 }
    -          i += 1
    +        val coefficientArray = Array.tabulate(numCoefficientSets * 
numFeatures) { i =>
    +          // flatIndex will loop though rawCoefficients, and skip the 
intercept terms.
    +          val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
    +          val featureIndex = i % numFeatures
    +          if (featuresStd(featureIndex) != 0.0) {
    +            rawCoefficients(flatIndex) / featuresStd(featureIndex)
    +          } else {
    +            0.0
    +          }
    +        }
    +        val coefficientMatrix =
    +          new DenseMatrix(numCoefficientSets, numFeatures, 
coefficientArray, isTransposed = true)
    +
    +        if ($(regParam) == 0.0 && isMultinomial) {
    +          /*
    +            When no regularization is applied, the coefficients lack 
identifiability because
    +            we do not use a pivot class. We can add any constant value to 
the coefficients and
    +            get the same likelihood. So here, we choose the mean centered 
coefficients for
    +            reproducibility. This method follows the approach in glmnet, 
described here:
    +
    +            Friedman, et al. "Regularization Paths for Generalized Linear 
Models via
    +              Coordinate Descent," 
https://core.ac.uk/download/files/153/6287975.pdf
    +           */
    +          val coefficientMean = coefficientMatrix.values.sum / 
coefficientMatrix.values.length
    +          coefficientMatrix.update(_ - coefficientMean)
             }
    -        bcFeaturesStd.destroy(blocking = false)
     
    -        if ($(fitIntercept)) {
    -          (Vectors.dense(rawCoefficients.dropRight(1)).compressed, 
rawCoefficients.last,
    -            arrayBuilder.result())
    +        val interceptsArray: Array[Double] = if ($(fitIntercept)) {
    +          Array.tabulate(numCoefficientSets) { i =>
    +            val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1
    +            rawCoefficients(coefIndex)
    +          }
    +        } else {
    +          Array[Double]()
    +        }
    +        /*
    +          The intercepts are never regularized, so we always center the 
mean.
    +         */
    +        val interceptVector = if (interceptsArray.nonEmpty && 
isMultinomial) {
    +          val interceptMean = interceptsArray.sum / numClasses
    +          interceptsArray.indices.foreach { i => interceptsArray(i) -= 
interceptMean }
    +          Vectors.dense(interceptsArray)
    +        } else if (interceptsArray.length == 1) {
    +          Vectors.dense(interceptsArray)
             } else {
    -          (Vectors.dense(rawCoefficients).compressed, 0.0, 
arrayBuilder.result())
    +          Vectors.sparse(numCoefficientSets, Seq())
             }
    +        (coefficientMatrix, interceptVector, arrayBuilder.result())
           }
         }
     
         if (handlePersistence) instances.unpersist()
     
    -    val model = copyValues(new LogisticRegressionModel(uid, coefficients, 
intercept))
    -    val (summaryModel, probabilityColName) = 
model.findSummaryModelAndProbabilityCol()
    -    val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
    -      summaryModel.transform(dataset),
    -      probabilityColName,
    -      $(labelCol),
    -      $(featuresCol),
    -      objectiveHistory)
    --- End diff --
    
    Change the outer
    
    ```scala
        val (coefficients, intercept, objectiveHistory) = {
          .....
         }
    ```
    to
    ```scala
        val (coefficientMatrix, interceptVector, objectiveHistory) = {
          .....
         }
    ```
    for clarity.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #14834: [SPARK-17163][ML][WIP] Unified LogisticRegression...

Reply via email to