I'm trying to compute the Frobenius norm error in approximating an
IndexedRowMatrix A with the product L*R where L and R are Breeze
DenseMatrices. 

I've written the following function that computes the squared error over
each partition of rows then sums up to get the total squared error (ignore
the mean argument, it is not used). It works on a smaller dataset that I've
been using to test my code, but fails on the full-sized A with an error
about the java heap being out of size.

A here is an 8mil-by-100K matrix partitioned into 5015 parts, L is
8mil-by-16, and R is 16-by-100K. Together L and R take up way less memory
than I have on each executor (64 Gb), so I don't understand the cause of the
error.

def calcCenteredFrobNormErr(mat: IndexedRowMatrix, lhsTall: BDM[Double],
rhsFat: BDM[Double], mean: BDV[Double] ) : Double = {
    val lhsFactor = mat.rows.context.broadcast(lhsTall)
    val rhsFactor = mat.rows.context.broadcast(rhsFat)

    def partitionDiffFrobNorm2(rowiter : Iterator[IndexedRow], lhsFactor:
Broadcast[BDM[Double]], rhsFactor: Broadcast[BDM[Double]]) :
Iterator[Double] = {

      val lhsTall = lhsFactor.value
      val rhsFat = rhsFactor.value

      val rowlist = rowiter.toList
      val numrows = rowlist.length
      val matSubMat = BDM.zeros[Double](numrows, mat.numCols.toInt)
      val lhsSubMat = BDM.zeros[Double](numrows, lhsTall.cols)

      var currowindex = 0
      rowlist.foreach(
        (currow: IndexedRow) => {
          currow.vector.foreachActive { case (j, v) =>
matSubMat(currowindex, j) = v }
          lhsSubMat(currowindex, ::) := lhsTall(currow.index.toInt, ::)
          currowindex += 1
        }
      )

      val diffmat = matSubMat - lhsSubMat * rhsFat
      List(sum(diffmat :* diffmat)).iterator
    }

    report("Beginning to compute Frobenius norm", true)
    val res = mat.rows.mapPartitions(rowiter =>
partitionDiffFrobNorm2(rowiter, lhsFactor, rhsFactor)).reduce(_ + _)
    report("Finished computing Frobenius norm", true)
    math.sqrt(res)
  }



--
View this message in context: 
http://apache-spark-user-list.1001560.n3.nabble.com/java-heap-error-tp23856.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
For additional commands, e-mail: user-h...@spark.apache.org

Reply via email to