Repository: spark Updated Branches: refs/heads/master 3cac6614a -> 8b292b19c
[SPARK-10654][MLLIB] Add columnSimilarities to IndexedRowMatrix Add columnSimilarities to IndexedRowMatrix by delegating to functionality already in RowMatrix. With a test. Author: Reza Zadeh <r...@databricks.com> Closes #8792 from rezazadeh/colsims. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b292b19 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b292b19 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b292b19 Branch: refs/heads/master Commit: 8b292b19c9b3aaaa51b919a12132e099e5be832d Parents: 3cac661 Author: Reza Zadeh <r...@databricks.com> Authored: Mon Oct 26 22:00:24 2015 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Oct 26 22:00:24 2015 -0700 ---------------------------------------------------------------------- .../mllib/linalg/distributed/IndexedRowMatrix.scala | 13 +++++++++++++ .../linalg/distributed/IndexedRowMatrixSuite.scala | 12 ++++++++++++ 2 files changed, 25 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/8b292b19/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index e6af0c0..9762991 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -68,6 +68,19 @@ class IndexedRowMatrix @Since("1.0.0") ( nRows } + + /** + * Compute all cosine similarities between columns of this matrix using the brute-force + * approach of computing normalized dot products. + * + * @return An n x n sparse upper-triangular matrix of cosine similarities between + * columns of this matrix. + */ + @Since("1.6.0") + def columnSimilarities(): CoordinateMatrix = { + toRowMatrix().columnSimilarities() + } + /** * Drops row indices and converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.RowMatrix]]. http://git-wip-us.apache.org/repos/asf/spark/blob/8b292b19/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala index 0ecb7a2..6de6cf2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala @@ -153,6 +153,18 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("similar columns") { + val A = new IndexedRowMatrix(indexedRows) + val gram = A.computeGramianMatrix().toBreeze.toDenseMatrix + + val G = A.columnSimilarities().toBreeze() + + for (i <- 0 until n; j <- i + 1 until n) { + val trueResult = gram(i, j) / scala.math.sqrt(gram(i, i) * gram(j, j)) + assert(math.abs(G(i, j) - trueResult) < 1e-6) + } + } + def closeToZero(G: BDM[Double]): Boolean = { G.valuesIterator.map(math.abs).sum < 1e-6 } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org