Github user ygcao commented on a diff in the pull request: https://github.com/apache/spark/pull/10152#discussion_r47429372 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala --- @@ -469,13 +495,13 @@ class Word2VecModel private[spark] ( this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } - private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = { - require(v1.length == v2.length, "Vectors should have the same length") - val n = v1.length - val norm1 = blas.snrm2(n, v1, 1) - val norm2 = blas.snrm2(n, v2, 1) - if (norm1 == 0 || norm2 == 0) return 0.0 - blas.sdot(n, v1, 1, v2, 1) / norm1 / norm2 + /** + * get the built vocabulary from the input + * this is useful for getting the whole vocabulary to join with other data or filtering other data + * @return a map of word to its index + */ + def getVocabulary: Map[String, Int] = { --- End diff -- Another thing I want to bring to your attention is that the slice function in Scala (used in getVectors) is super slow for whatever reason. For join purposes, we can output the needed vectors from getWordVectors's return value by using the indexes, which is 100x faster than calling getVectors for each single word.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org