Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/4419#discussion_r29296377 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala --- @@ -208,3 +225,224 @@ class EMLDAOptimizer extends LDAOptimizer{ new DistributedLDAModel(this, iterationTimes) } } + + +/** + * :: Experimental :: + * + * An online optimizer for LDA. The Optimizer implements the Online LDA algorithm, which + * processes a subset of the corpus by each call to next, and update the term-topic + * distribution adaptively. + * + * References: + * Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010. + */ +@Experimental +class OnlineLDAOptimizer extends LDAOptimizer { + + // LDA common parameters + private var k: Int = 0 + private var D: Int = 0 + private var vocabSize: Int = 0 + private var alpha: Double = 0 + private var eta: Double = 0 + private var randomSeed: Long = 0 + + // Online LDA specific parameters + private var tau_0: Double = -1 + private var kappa: Double = -1 + private var batchSize: Int = -1 + + // internal data structure + private var docs: RDD[(Long, Vector)] = null + private var lambda: BDM[Double] = null + private var Elogbeta: BDM[Double]= null + private var expElogbeta: BDM[Double] = null + + // count of invocation to next, used to help deciding the weight for each iteration + private var iteration = 0 + + /** + * A (positive) learning parameter that downweights early iterations + */ + def getTau_0: Double = { + if (this.tau_0 == -1) { --- End diff -- This default is not problem-dependent, so I'd recommend initializing the value to 1024 and not having a special value -1. Can you please also make this fix for other parameters with defaults?
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to have it enabled, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org