Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/1778#discussion_r17579677 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala --- @@ -390,6 +393,113 @@ class RowMatrix( new RowMatrix(AB, nRows, B.numCols) } + /** + * Compute all cosine similarities between columns of this matrix using the brute-force + * approach of computing normalized dot products. + * + * @return An n x n sparse upper-triangular matrix of cosine similarities between + * columns of this matrix. + */ + def columnSimilarities(): CoordinateMatrix = { + similarColumns(0.0) + } + + /** + * Compute all similarities between columns of this matrix using a sampling approach. + * + * The threshold parameter is a trade-off knob between estimate quality and computational cost. + * + * Setting a threshold of 0 guarantees deterministic correct results, but comes at exactly + * the same cost as the brute-force approach. Setting the threshold to positive values + * incurs strictly less computational cost than the brute-force approach, however the + * similarities computed will be estimates. + * + * The sampling guarantees relative-error correctness for those pairs of columns that have + * similarity greater than the given similarity threshold. + * + * To describe the guarantee, we set some notation: + * Let A be the smallest in magnitude non-zero element of this matrix. + * Let B be the largest in magnitude non-zero element of this matrix. + * Let L be the number of non-zeros per row. + * + * For example, for {0,1} matrices: A=B=1. + * Another example, for the Netflix matrix: A=1, B=5 + * + * For those column pairs that are above the threshold, + * the computed similarity is correct to within 20% relative error with probability + * at least 1 - (0.981)^(100/B) + * + * The shuffle size is bounded by the *smaller* of the following two expressions: + * + * O(n log(n) L / (threshold * A)) + * O(m L^2) + * + * The latter is the cost of the brute-force approach, so for non-zero thresholds, + * the cost is always cheaper than the brute-force approach. + * + * @param threshold Similarities above this threshold are probably computed correctly. + * Set to 0 for deterministic guaranteed correctness. + * @return An n x n sparse upper-triangular matrix of cosine similarities + * between columns of this matrix. + */ + def similarColumns(threshold: Double): CoordinateMatrix = { + require(threshold >= 0 && threshold <= 1, s"Threshold not in [0,1]: $threshold") + + val gamma = if (math.abs(threshold) < 1e-6) { + Double.PositiveInfinity + } else { + 100 * math.log(numCols()) / threshold + } + + similarColumnsDIMSUM(computeColumnSummaryStatistics().normL2.toArray, gamma) + } + + /** + * Find all similar columns using the DIMSUM sampling algorithm, described in two papers + * + * http://arxiv.org/abs/1206.2082 + * http://arxiv.org/abs/1304.1467 + * + * @param colMags A vector of column magnitudes + * @param gamma The oversampling parameter. For provable results, set to 100 * log(n) / s, + * where s is the smallest similarity score to be estimated, + * and n is the number of columns + * @return An n x n sparse upper-triangular matrix of cosine similarities + * between columns of this matrix. + */ + private[mllib] def similarColumnsDIMSUM(colMags: Array[Double], + gamma: Double): CoordinateMatrix = { + require(gamma > 1.0, s"Oversampling should be greater than 1: $gamma") + require(colMags.size == this.numCols(), "Number of magnitudes didn't match column dimension") + + val sg = math.sqrt(gamma) // sqrt(gamma) used many times + + val sims = rows.flatMap { row => + val buf = new ListBuffer[((Int, Int), Double)]() + row.toBreeze.activeIterator.foreach { + case (_, 0.0) => // Skip explicit zero elements. + case (i, iVal) => + val rand = new scala.util.Random(iVal.toLong) --- End diff -- It won't give you a pseudo random sequence. Think about the case when all values are the same. The seed should be set on a partition level. Could you update this block with `mapPartitionsWithIndex`?
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org