This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new a285c0d [SPARK-28421][ML] SparseVector.apply performance optimization a285c0d is described below commit a285c0d4fe071ceed05d8ea693febfbac5962d64 Author: zhengruifeng <ruife...@foxmail.com> AuthorDate: Tue Jul 23 20:20:22 2019 -0500 [SPARK-28421][ML] SparseVector.apply performance optimization ## What changes were proposed in this pull request? Optimize `SparseVector.apply` by avoiding internal conversion. Since the speed-up is significant (2.5X ~ 5X), and this method is widely used in ml, I suggest backporting. | size| nnz | apply(old) | apply2(new impl) | apply3(new impl with extra range check)| |------|----------|------------|----------|----------| |10000000|100|75294|12208|18682| |10000000|10000|75616|23132|32932| |10000000|1000000|92949|42529|48821| ## How was this patch tested? Existing tests, using the following code to test performance (here the new impl is named `apply2`, and another impl with an extra range check is named `apply3`): ``` import scala.util.Random import org.apache.spark.ml.linalg._ val size = 10000000 for (nnz <- Seq(100, 10000, 1000000)) { val rng = new Random(123) val indices = Array.fill(nnz + nnz)(rng.nextInt.abs % size).distinct.take(nnz).sorted val values = Array.fill(nnz)(rng.nextDouble) val vec = Vectors.sparse(size, indices, values).toSparse val tic1 = System.currentTimeMillis; (0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec(i); i+=1} }; val toc1 = System.currentTimeMillis; val tic2 = System.currentTimeMillis; (0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply2(i); i+=1} }; val toc2 = System.currentTimeMillis; val tic3 = System.currentTimeMillis; (0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply3(i); i+=1} }; val toc3 = System.currentTimeMillis; println((size, nnz, toc1 - tic1, toc2 - tic2, toc3 - tic3)) } ``` Closes #25178 from 
zhengruifeng/sparse_vec_apply. Authored-by: zhengruifeng <ruife...@foxmail.com> Signed-off-by: Sean Owen <sean.o...@databricks.com> --- .../src/main/scala/org/apache/spark/ml/linalg/Vectors.scala | 9 +++++++++ mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala index 5824e46..827b620 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala @@ -603,6 +603,15 @@ class SparseVector @Since("2.0.0") ( private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size) + override def apply(i: Int): Double = { + if (i < 0 || i >= size) { + throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)") + } + + val j = util.Arrays.binarySearch(indices, i) + if (j < 0) 0.0 else values(j) + } + override def foreachActive(f: (Int, Double) => Unit): Unit = { var i = 0 val localValuesSize = values.length diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 6e68d96..fbc9358 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -785,6 +785,15 @@ class SparseVector @Since("1.0.0") ( private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size) + override def apply(i: Int): Double = { + if (i < 0 || i >= size) { + throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)") + } + + val j = util.Arrays.binarySearch(indices, i) + if (j < 0) 0.0 else values(j) + } + @Since("1.6.0") override def foreachActive(f: (Int, Double) => Unit): Unit = { var i = 0 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org