Github user KyleLi1985 commented on the issue:

    https://github.com/apache/spark/pull/22893
  
    Here is my test for the sparse-sparse, dense-dense, and sparse-dense cases. It benchmarks the original `fastSquaredDistance` against the patched version on the same input vectors:
    ```scala
    
      import org.apache.spark.{SparkConf, SparkContext}
      import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
      import com.github.fommil.netlib.F2jBLAS
      import java.util.Date
      
      
      object SparkMLlibTest {
        class VectorWithNorm(val vector: Vector, val norm: Double) extends Serializable {
      
          def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0))
      
          def this(array: Array[Double]) = this(Vectors.dense(array))
      
          /** Converts the vector to a dense vector. */
          def toDense: VectorWithNorm = new VectorWithNorm(Vectors.dense(vector.toArray), norm)
        }
      
        def dot(x: Vector, y: Vector): Double = {
          require(x.size == y.size,
            "BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching 
sizes:" +
              " x.size = " + x.size + ", y.size = " + y.size)
          (x, y) match {
            case (dx: DenseVector, dy: DenseVector) =>
              dot(dx, dy)
            case (sx: SparseVector, dy: DenseVector) =>
              dot(sx, dy)
            case (dx: DenseVector, sy: SparseVector) =>
              dot(sy, dx)
            case (sx: SparseVector, sy: SparseVector) =>
              dot(sx, sy)
            case _ =>
              throw new IllegalArgumentException(s"dot doesn't support (${x.getClass}, ${y.getClass}).")
          }
        }
      
        def dot(x: DenseVector, y: DenseVector): Double = {
          val n = x.size
          new F2jBLAS().ddot(n, x.values, 1, y.values, 1)
        }
      
        def dot(x: SparseVector, y: DenseVector): Double = {
          val xValues = x.values
          val xIndices = x.indices
          val yValues = y.values
          val nnz = xIndices.length
      
          var sum = 0.0
          var k = 0
          while (k < nnz) {
            sum += xValues(k) * yValues(xIndices(k))
            k += 1
          }
          sum
        }
      
        /**
          * dot(x, y)
          */
        def dot(x: SparseVector, y: SparseVector): Double = {
          val xValues = x.values
          val xIndices = x.indices
          val yValues = y.values
          val yIndices = y.indices
          val nnzx = xIndices.length
          val nnzy = yIndices.length
      
          var kx = 0
          var ky = 0
          var sum = 0.0
          // y catching x
          while (kx < nnzx && ky < nnzy) {
            val ix = xIndices(kx)
            while (ky < nnzy && yIndices(ky) < ix) {
              ky += 1
            }
            if (ky < nnzy && yIndices(ky) == ix) {
              sum += xValues(kx) * yValues(ky)
              ky += 1
            }
            kx += 1
          }
          sum
        }
      
        lazy val EPSILON = {
          var eps = 1.0
          while ((1.0 + (eps / 2.0)) != 1.0) {
            eps /= 2.0
          }
          eps
        }
      
        def fastSquaredDistanceAddedPatch(
            v1: Vector,
            norm1: Double,
            v2: Vector,
            norm2: Double,
            precision: Double = 1e-6): Double = {
          val n = v1.size
          require(v2.size == n)
          require(norm1 >= 0.0 && norm2 >= 0.0)
          val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
          val normDiff = norm1 - norm2
          var sqDist = 0.0
          /*
           * The relative error is
           * <pre>
           * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 + 2 |a^T b|) / ( \|a - b\|_2^2 ),
           * </pre>
           * which is bounded by
           * <pre>
           * 2.0 * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 ) / ( (\|a\|_2 - \|b\|_2)^2 ).
           * </pre>
           * The bound doesn't need the inner product, so we can use it as a sufficient condition to
           * check quickly whether the inner product approach is accurate.
           */
          val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
      
          // Patch under test: with the extra dense-vector check, the dense-dense case skips
          // the norm-based shortcut and falls through to Vectors.sqdist below.
          if (precisionBound1 < precision &&
            (!v1.isInstanceOf[DenseVector] || !v2.isInstanceOf[DenseVector])) {
            sqDist = sumSquaredNorm - 2.0 * dot(v1, v2)
          } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
            val dotValue = dot(v1, v2)
            sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
            val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
              (sqDist + EPSILON)
            if (precisionBound2 > precision) {
              sqDist = Vectors.sqdist(v1, v2)
            }
          } else {
            sqDist = Vectors.sqdist(v1, v2)
          }
      
          sqDist
        }
      
        // Baseline: the existing fastSquaredDistance logic, kept here for comparison.
        def fastSquaredDistanceOriginal(
            v1: Vector,
            norm1: Double,
            v2: Vector,
            norm2: Double,
            precision: Double = 1e-6): Double = {
          val n = v1.size
          require(v2.size == n)
          require(norm1 >= 0.0 && norm2 >= 0.0)
          val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
          val normDiff = norm1 - norm2
          var sqDist = 0.0
          /*
           * The relative error is
           * <pre>
           * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 + 2 |a^T b|) / ( \|a - b\|_2^2 ),
           * </pre>
           * which is bounded by
           * <pre>
           * 2.0 * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 ) / ( (\|a\|_2 - \|b\|_2)^2 ).
           * </pre>
           * The bound doesn't need the inner product, so we can use it as a sufficient condition to
           * check quickly whether the inner product approach is accurate.
           */
          val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
      
          if (precisionBound1 < precision) {
            sqDist = sumSquaredNorm - 2.0 * dot(v1, v2)
          } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
            val dotValue = dot(v1, v2)
            sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
            val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
              (sqDist + EPSILON)
            if (precisionBound2 > precision) {
              sqDist = Vectors.sqdist(v1, v2)
            }
          } else {
            sqDist = Vectors.sqdist(v1, v2)
          }
      
          sqDist
        }
      
        def main(args: Array[String]): Unit = {
          val conf = new SparkConf().setAppName("Test")
          conf.setMaster("local[*]")
          conf.set("spark.testing.memory", "512000000")
          val sc = new SparkContext(conf)
      
          val testData1 = sc.parallelize(List(Array(0,0,2,3,4,1,3,5,3,2,1,5,0,5,0,0,3,0,1,2,3,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,2)),1)
          val testData2 = sc.parallelize(List(Array(1,2,3,0,2,3,0,3,5,7,2,1,3,2,0,3,2,1,0,4,0,1,0,1,0,1,0,0,0,0,0,2,0,0,0,1,0)),1)
      
          val vector1dense = testData1.map{
            f =>
              val vector = Vectors.dense(f.map(f=>f.toDouble))
              new VectorWithNorm(vector, Vectors.norm(vector,2))
          }.take(1).apply(0)
          val vector2dense = testData2.map{
            f =>
              val vector = Vectors.dense(f.map(f=>f.toDouble))
              new VectorWithNorm(vector, Vectors.norm(vector,2))
          }.take(1).apply(0)
          val vector1sparse = testData1.map{
            f =>
              val vector = Vectors.sparse(f.length, f.zipWithIndex.map(_._2), f.map(_.toDouble))
              val sparseVector = vector.toSparse
              new VectorWithNorm(sparseVector, Vectors.norm(sparseVector, 2))
          }.take(1).apply(0)
          val vector2sparse = testData2.map{
            f =>
              val vector = Vectors.sparse(f.length, f.zipWithIndex.map(_._2), f.map(_.toDouble))
              val sparseVector = vector.toSparse
              new VectorWithNorm(sparseVector, Vectors.norm(sparseVector, 2))
          }.take(1).apply(0)
      
      
          var vector1 = Vectors.zeros(0)
          var vector2 = Vectors.zeros(0)
          var norm1 = 0.0
          var norm2 = 0.0
      
      
          // Choose the input data mode to compare the original and the patched implementation.
          // INPUT modes:
          //   sparse/sparseCase
          //   dense/denseCase
          //   dense/sparseCase
      
          val key = "dense/sparseCase"
          key match {
            case "sparse/sparseCase" =>
              vector1 = vector1sparse.vector
              vector2 = vector2sparse.vector
              norm1 = vector1sparse.norm
              norm2 = vector2sparse.norm
            case "dense/denseCase" =>
              vector1 = vector1dense.vector
              vector2 = vector2dense.vector
              norm1 = vector1dense.norm
              norm2 = vector2dense.norm
            case "dense/sparseCase" =>
              vector1 = vector1sparse.vector
              vector2 = vector2dense.vector
              norm1 = vector1sparse.norm
              norm2 = vector2dense.norm
          }
      
          var sqOri = 0.0
          val startTime1 = new Date().getTime
          for (i <- 0 until 100000000) { // 100,000,000 iterations
            sqOri = fastSquaredDistanceOriginal(vector1, norm1, vector2, norm2)
          }
          val endTime1 = new Date().getTime
      
          println("sqOri: " + sqOri)
          println("costTime1: " + (endTime1 - startTime1))
          
          var sqAP = 0.0
          val startTime2 = new Date().getTime
          for (i <- 0 until 100000000) {
            sqAP = fastSquaredDistanceAddedPatch(vector1, norm1, vector2, norm2)
          }
          val endTime2 = new Date().getTime
          println("sqAp: " + sqAP)
          println("costTime2: " + (endTime2 - startTime2))
      
      
          sc.stop()
        }
      }
    ```
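    To reproduce this locally, the object above can be built as a small sbt project. The snippet below is only a minimal sketch (the project name and the Spark/Scala versions are assumptions, so align them with the build being benchmarked); `spark-mllib` should pull in netlib-java transitively, which provides `com.github.fommil.netlib.F2jBLAS`.

    ```scala
      // Minimal build.sbt sketch (assumed versions; adjust to the Spark build under test).
      name := "spark-mllib-test"
      scalaVersion := "2.11.12"
      libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.4.0"
    ```

    Since the app sets `setMaster("local[*]")` itself, it can be started with `sbt run`, or packaged and launched with `spark-submit --class SparkMLlibTest`.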
    
[SparkMLlibTest.txt](https://github.com/apache/spark/files/2541002/SparkMLlibTest.txt)
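
    For completeness, a quick agreement check can be dropped into `main` after `vector1`/`vector2` and the norms are assigned. This is just a sketch; the 1e-6 tolerance simply mirrors the default `precision` used by both implementations.

    ```scala
      // Sanity check (sketch): both code paths should agree with the exact Vectors.sqdist
      // result to within the 1e-6 relative precision targeted by the fast path.
      val exact = Vectors.sqdist(vector1, vector2)
      val ori = fastSquaredDistanceOriginal(vector1, norm1, vector2, norm2)
      val ap = fastSquaredDistanceAddedPatch(vector1, norm1, vector2, norm2)
      println(s"exact: $exact, original: $ori, patched: $ap")
      assert(math.abs(ori - exact) <= 1e-6 * exact)
      assert(math.abs(ap - exact) <= 1e-6 * exact)
    ```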


