Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18924#discussion_r140621444
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala ---
    @@ -462,36 +462,55 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
         val expElogbetaBc = batch.sparkContext.broadcast(expElogbeta)
         val alpha = this.alpha.asBreeze
         val gammaShape = this.gammaShape
    +    val optimizeDocConcentration = this.optimizeDocConcentration
    +    // We calculate logphat in the same pass as other statistics, but we 
only need
    +    // it if we are optimizing docConcentration
    +    val logphatPartOptionBase = () => if (optimizeDocConcentration) 
Some(BDV.zeros[Double](k))
    +                                      else None
     
    -    val stats: RDD[(BDM[Double], List[BDV[Double]])] = batch.mapPartitions 
{ docs =>
    +    val stats: RDD[(BDM[Double], Option[BDV[Double]], Int)] = 
batch.mapPartitions { docs =>
    --- End diff --
    
    Let's use Long for the doc count, since an Int could overflow for large
datasets combined with a high miniBatchFraction


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to