This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 934a91f [SPARK-21481][ML][FOLLOWUP][TRIVIAL] HashingTF use util.collection.OpenHashMap instead of mutable.HashMap 934a91f is described below commit 934a91fcb4de1e5c4b93b58e7452afa4bb4a9586 Author: zhengruifeng <ruife...@foxmail.com> AuthorDate: Sat Sep 26 08:16:39 2020 -0500 [SPARK-21481][ML][FOLLOWUP][TRIVIAL] HashingTF use util.collection.OpenHashMap instead of mutable.HashMap ### What changes were proposed in this pull request? `HashingTF` now uses `util.collection.OpenHashMap` instead of `mutable.HashMap`. ### Why are the changes needed? According to the documentation of `util.collection.OpenHashMap`: > This map is about 5X faster than java.util.HashMap, while using much less space overhead. According to performance tests such as [Simple microbenchmarks comparing Scala vs Java mutable map performance](https://gist.github.com/pchiusano/1423303), `mutable.HashMap` may be even less efficient than `java.util.HashMap`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing test suites. Closes #29852 from zhengruifeng/hashingtf_opt. 
Authored-by: zhengruifeng <ruife...@foxmail.com> Signed-off-by: Sean Owen <sro...@gmail.com> --- .../org/apache/spark/ml/feature/HashingTF.scala | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index d2bb013..f4223bc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -17,8 +17,6 @@ package org.apache.spark.ml.feature -import scala.collection.mutable - import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup @@ -32,6 +30,7 @@ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} import org.apache.spark.util.Utils import org.apache.spark.util.VersionUtils.majorMinorVersion +import org.apache.spark.util.collection.OpenHashMap /** * Maps a sequence of terms to their term frequencies using the hashing trick. 
@@ -91,20 +90,13 @@ class HashingTF @Since("3.0.0") private[ml] ( @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) - val localNumFeatures = $(numFeatures) - val localBinary = $(binary) + val n = $(numFeatures) + val updateFunc = if ($(binary)) (v: Double) => 1.0 else (v: Double) => v + 1.0 val hashUDF = udf { terms: Seq[_] => - val termFrequencies = mutable.HashMap.empty[Int, Double].withDefaultValue(0.0) - terms.foreach { term => - val i = indexOf(term) - if (localBinary) { - termFrequencies(i) = 1.0 - } else { - termFrequencies(i) += 1.0 - } - } - Vectors.sparse(localNumFeatures, termFrequencies.toSeq) + val map = new OpenHashMap[Int, Double]() + terms.foreach { term => map.changeValue(indexOf(term), 1.0, updateFunc) } + Vectors.sparse(n, map.toSeq) } dataset.withColumn($(outputCol), hashUDF(col($(inputCol))), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org