Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/18924#discussion_r140621444 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala --- @@ -462,36 +462,55 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val expElogbetaBc = batch.sparkContext.broadcast(expElogbeta) val alpha = this.alpha.asBreeze val gammaShape = this.gammaShape + val optimizeDocConcentration = this.optimizeDocConcentration + // We calculate logphat in the same pass as other statistics, but we only need + // it if we are optimizing docConcentration + val logphatPartOptionBase = () => if (optimizeDocConcentration) Some(BDV.zeros[Double](k)) + else None - val stats: RDD[(BDM[Double], List[BDV[Double]])] = batch.mapPartitions { docs => + val stats: RDD[(BDM[Double], Option[BDV[Double]], Int)] = batch.mapPartitions { docs => --- End diff -- Let's use Long for the doc count since an Int could overflow for large datasets combined with a small miniBatchFraction.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org