Github user KyleLi1985 commented on the issue: https://github.com/apache/spark/pull/22893 There is my test for situation sparse-sparse, dense-dense, sparse-dense case ` import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import com.github.fommil.netlib.F2jBLAS import java.util.Date object SparkMLlibTest { class VectorWithNorm(val vector: Vector, val norm: Double) extends Serializable { def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0)) def this(array: Array[Double]) = this(Vectors.dense(array)) /** Converts the vector to a dense vector. */ def toDense: VectorWithNorm = new VectorWithNorm(Vectors.dense(vector.toArray), norm) } def dot(x: Vector, y: Vector): Double = { require(x.size == y.size, "BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes:" + " x.size = " + x.size + ", y.size = " + y.size) (x, y) match { case (dx: DenseVector, dy: DenseVector) => dot(dx, dy) case (sx: SparseVector, dy: DenseVector) => dot(sx, dy) case (dx: DenseVector, sy: SparseVector) => dot(sy, dx) case (sx: SparseVector, sy: SparseVector) => dot(sx, sy) case _ => throw new IllegalArgumentException(s"dot doesn't support (${x.getClass}, ${y.getClass}).") } } def dot(x: DenseVector, y: DenseVector): Double = { val n = x.size new F2jBLAS().ddot(n, x.values, 1, y.values, 1) } def dot(x: SparseVector, y: DenseVector): Double = { val xValues = x.values val xIndices = x.indices val yValues = y.values val nnz = xIndices.length var sum = 0.0 var k = 0 while (k < nnz) { sum += xValues(k) * yValues(xIndices(k)) k += 1 } sum } /** * dot(x, y) */ def dot(x: SparseVector, y: SparseVector): Double = { val xValues = x.values val xIndices = x.indices val yValues = y.values val yIndices = y.indices val nnzx = xIndices.length val nnzy = yIndices.length var kx = 0 var ky = 0 var sum = 0.0 // y catching x while (kx < nnzx && ky < nnzy) { val ix = xIndices(kx) while (ky < nnzy && yIndices(ky) < ix) { ky += 1 } if (ky < nnzy && yIndices(ky) == ix) { sum += xValues(kx) * yValues(ky) ky += 1 } kx += 1 } sum } lazy val EPSILON = { var eps = 1.0 while ((1.0 + (eps / 2.0)) != 1.0) { eps /= 2.0 } eps } def fastSquaredDistanceAddedPatch( v1: Vector, norm1: Double, v2: Vector, norm2: Double, precision: Double = 1e-6): Double = { val n = v1.size require(v2.size == n) require(norm1 >= 0.0 && norm2 >= 0.0) val sumSquaredNorm = norm1 * norm1 + norm2 * norm2 val normDiff = norm1 - norm2 var sqDist = 0.0 /* * The relative error is * <pre> * EPSILON * ( \|a\|_2^2 + \|b\\_2^2 + 2 |a^T b|) / ( \|a - b\|_2^2 ), * </pre> * which is bounded by * <pre> * 2.0 * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 ) / ( (\|a\|_2 - \|b\|_2)^2 ). * </pre> * The bound doesn't need the inner product, so we can use it as a sufficient condition to * check quickly whether the inner product approach is accurate. */ val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) if (precisionBound1 < precision && (!v1.isInstanceOf[DenseVector] || !v2.isInstanceOf[DenseVector])) { sqDist = sumSquaredNorm - 2.0 * dot(v1, v2) } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { val dotValue = dot(v1, v2) sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { sqDist = Vectors.sqdist(v1, v2) } } else { sqDist = Vectors.sqdist(v1, v2) } sqDist } def fastSquaredDistanceOriginal( v1: Vector, norm1: Double, v2: Vector, norm2: Double, precision: Double = 1e-6): Double = { val n = v1.size require(v2.size == n) require(norm1 >= 0.0 && norm2 >= 0.0) val sumSquaredNorm = norm1 * norm1 + norm2 * norm2 val normDiff = norm1 - norm2 var sqDist = 0.0 /* * The relative error is * <pre> * EPSILON * ( \|a\|_2^2 + \|b\\_2^2 + 2 |a^T b|) / ( \|a - b\|_2^2 ), * </pre> * which is bounded by * <pre> * 2.0 * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 ) / ( (\|a\|_2 - \|b\|_2)^2 ). * </pre> * The bound doesn't need the inner product, so we can use it as a sufficient condition to * check quickly whether the inner product approach is accurate. */ val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) if (precisionBound1 < precision) { sqDist = sumSquaredNorm - 2.0 * dot(v1, v2) } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { val dotValue = dot(v1, v2) sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { sqDist = Vectors.sqdist(v1, v2) } } else { sqDist = Vectors.sqdist(v1, v2) } sqDist } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("Test") conf.setMaster("local[*]") conf.set("spark.testing.memory", "512000000") val sc = new SparkContext(conf) val testData1 = sc.parallelize(List(Array(0,0,2,3,4,1,3,5,3,2,1,5,0,5,0,0,3,0,1,2,3,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,2)),1) val testData2 = sc.parallelize(List(Array(1,2,3,0,2,3,0,3,5,7,2,1,3,2,0,3,2,1,0,4,0,1,0,1,0,1,0,0,0,0,0,2,0,0,0,1,0)),1) val vector1dense = testData1.map{ f => val vector = Vectors.dense(f.map(f=>f.toDouble)) new VectorWithNorm(vector, Vectors.norm(vector,2)) }.take(1).apply(0) val vector2dense = testData2.map{ f => val vector = Vectors.dense(f.map(f=>f.toDouble)) new VectorWithNorm(vector, Vectors.norm(vector,2)) }.take(1).apply(0) val vector1sparse = testData1.map{ f => val vector = Vectors.sparse(f.length, f.zipWithIndex.map(f=>f._2),f.map(f=>f.toDouble)) val spasevector = vector.toSparse new VectorWithNorm(spasevector, Vectors.norm(spasevector,2)) }.take(1).apply(0) val vector2sparse = testData2.map{ f => val vector = Vectors.sparse(f.length, f.zipWithIndex.map(f=>f._2),f.map(f=>f.toDouble)) val spasevector = vector.toSparse new VectorWithNorm(spasevector, Vectors.norm(spasevector,2)) }.take(1).apply(0) var vector1 = Vectors.zeros(0) var vector2 = Vectors.zeros(0) var norm1 = 0.0 var norm2 = 0.0 //use different input data mode to check before and after add patch //INPUT mode: //sparse/sparseCase //dense/denseCase //dense/sparseCase val key = "dense/sparseCase" key match { case "sparse/sparseCase" => vector1 = vector1sparse.vector vector2 = vector2sparse.vector norm1 = vector1sparse.norm norm2 = vector2sparse.norm case "dense/denseCase" => vector1 = vector1dense.vector vector2 = vector2dense.vector norm1 = vector1dense.norm norm2 = vector2dense.norm case "dense/sparseCase" => vector1 = vector1sparse.vector vector2 = vector2dense.vector norm1 = vector1sparse.norm norm2 = vector2dense.norm } var sqOri = 0.0 val startTime1 = new Date().getTime for (i <- 0 until(100000000)) {//100000000 sqOri = fastSquaredDistanceAddedPatch(vector1,norm1,vector2,norm2) } val endTime1 = new Date().getTime println("sqOri: " + sqOri) println("costTime1: " + (endTime1 - startTime1)) var sqAP = 0.0 val startTime2 = new Date().getTime for (a <- 0 until(100000000)) { sqAP = fastSquaredDistanceAddedPatch(vector1,norm1,vector2,norm2) } val endTime2 = new Date().getTime println("sqAp: " + sqAP) println("costTime2: " + (endTime2 - startTime2)) sc.stop() } } ` [SparkMLlibTest.txt](https://github.com/apache/spark/files/2541002/SparkMLlibTest.txt)
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org