Github user akopich commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19565#discussion_r146572407
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala ---
    @@ -497,40 +495,38 @@ final class OnlineLDAOptimizer extends LDAOptimizer 
with Logging {
           (u._1, u._2, u._3 + v._3)
         }
     
    -    val (statsSum: BDM[Double], logphatOption: Option[BDV[Double]], 
nonEmptyDocsN: Long) = stats
    +    val (statsSum: BDM[Double], logphatOption: Option[BDV[Double]], 
batchSize: Long) = stats
           .treeAggregate((BDM.zeros[Double](k, vocabSize), 
logphatPartOptionBase(), 0L))(
             elementWiseSum, elementWiseSum
           )
     
         expElogbetaBc.destroy(false)
     
    -    if (nonEmptyDocsN == 0) {
    +    if (batchSize == 0) {
           logWarning("No non-empty documents were submitted in the batch.")
           // Therefore, there is no need to update any of the model parameters
           return this
         }
     
         val batchResult = statsSum *:* expElogbeta.t
    -    // Note that this is an optimization to avoid batch.count
    -    val batchSize = (miniBatchFraction * corpusSize).ceil.toInt
         updateLambda(batchResult, batchSize)
     
    -    logphatOption.foreach(_ /= nonEmptyDocsN.toDouble)
    -    logphatOption.foreach(updateAlpha(_, nonEmptyDocsN))
    +    logphatOption.foreach(_ /= batchSize.toDouble)
    +    logphatOption.foreach(updateAlpha(_, batchSize))
     
         this
       }
     
       /**
        * Update lambda based on the batch submitted. batchSize can be 
different for each iteration.
        */
    -  private def updateLambda(stat: BDM[Double], batchSize: Int): Unit = {
    +  private def updateLambda(stat: BDM[Double], batchSize: Double): Unit = {
    --- End diff --
    
    I've done this just to keep the signature consistent with `updateAlpha`. Is 
that a bad idea?


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to