Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/14834#discussion_r78464517 --- Diff: mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala --- @@ -460,33 +577,74 @@ class LogisticRegression @Since("1.2.0") ( as a result, no scaling is needed. */ val rawCoefficients = state.x.toArray.clone() - var i = 0 - while (i < numFeatures) { - rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } - i += 1 + val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i => + // flatIndex will loop through rawCoefficients, and skip the intercept terms. + val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i + val featureIndex = i % numFeatures + if (featuresStd(featureIndex) != 0.0) { + rawCoefficients(flatIndex) / featuresStd(featureIndex) + } else { + 0.0 + } + } + val coefficientMatrix = + new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true) + + if ($(regParam) == 0.0 && isMultinomial) { + /* + When no regularization is applied, the coefficients lack identifiability because + we do not use a pivot class. We can add any constant value to the coefficients and + get the same likelihood. So here, we choose the mean centered coefficients for + reproducibility. This method follows the approach in glmnet, described here: + + Friedman, et al.
"Regularization Paths for Generalized Linear Models via + Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf + */ + val coefficientMean = coefficientMatrix.values.sum / coefficientMatrix.values.length + coefficientMatrix.update(_ - coefficientMean) } - bcFeaturesStd.destroy(blocking = false) - if ($(fitIntercept)) { - (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last, - arrayBuilder.result()) + val interceptsArray: Array[Double] = if ($(fitIntercept)) { + Array.tabulate(numCoefficientSets) { i => + val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 + rawCoefficients(coefIndex) + } + } else { + Array[Double]() + } + /* + The intercepts are never regularized, so we always center the mean. + */ + val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { + val interceptMean = interceptsArray.sum / numClasses + interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } + Vectors.dense(interceptsArray) + } else if (interceptsArray.length == 1) { + Vectors.dense(interceptsArray) } else { - (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result()) + Vectors.sparse(numCoefficientSets, Seq()) } + (coefficientMatrix, interceptVector, arrayBuilder.result()) --- End diff -- I implemented this logic, good suggestion. Let me know if that's what you had in mind.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org