[ https://issues.apache.org/jira/browse/FLINK-1718?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14384080#comment-14384080 ]
ASF GitHub Bot commented on FLINK-1718: --------------------------------------- Github user rmetzger commented on a diff in the pull request: https://github.com/apache/flink/pull/539#discussion_r27308257 --- Diff: flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala --- @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import scala.util.Sorting + +/** Sparse matrix using the compressed sparse column (CSC) representation. + * + * More details concerning the compressed sparse column (CSC) representation can be found + * [http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29]. + * + * @param numRows Number of rows + * @param numCols Number of columns + * @param rowIndices Array containing the row indices of non-zero entries + * @param colPtrs Array containing the starting offsets in data for each column + * @param data Array containing the non-zero entries in column-major order + */ +class SparseMatrix( + val numRows: Int, + val numCols: Int, + val rowIndices: Array[Int], + val colPtrs: Array[Int], + val data: Array[Double]) + extends Matrix { + + /** Element wise access function + * + * @param row row index + * @param col column index + * @return matrix entry at (row, col) + */ + override def apply(row: Int, col: Int): Double = { + + val index = locate(row, col) + + if(index < 0){ + 0 + } else { + data(index) + } + } + + def toDenseMatrix: DenseMatrix = { + val result = DenseMatrix.zeros(numRows, numCols) + + for(row <- 0 until numRows; col <- 0 until numCols) { + result(row, col) = apply(row, col) + } + + result + } + + /** Element wise update function + * + * @param row row index + * @param col column index + * @param value value to set at (row, col) + */ + override def update(row: Int, col: Int, value: Double): Unit = { + val index = locate(row, col) + + if(index < 0) { + throw new IllegalArgumentException("Cannot update zero value of sparse matrix at index " + + s"($row, $col)") + } else { + data(index) = value + } + } + + override def toString: String = { + val result = StringBuilder.newBuilder + + result.append(s"SparseMatrix($numRows, $numCols)\n") + + var columnIndex = 0 + + val fieldWidth = math.max(numRows, numCols).toString.length + val valueFieldWidth = data.map(_.toString.length).max + 2 + + for(index <- 0 until colPtrs.last) { + while(colPtrs(columnIndex + 1) <= index){ + columnIndex += 1 + } + + val rowStr = rowIndices(index).toString + val columnStr = columnIndex.toString + val valueStr = data(index).toString + + result.append("(" + " " * (fieldWidth - rowStr.length) + rowStr + "," + + " " * (fieldWidth - columnStr.length) + columnStr + ")") + result.append(" " * (valueFieldWidth - valueStr.length) + valueStr) + result.append("\n") + } + + result.toString + } + + private def locate(row: Int, col: Int): Int = { + require(0 <= row && row < numRows, s"Row $row is out of bounds [0, $numRows).") + require(0 <= col && col < numCols, s"Col $col is out of bounds [0, $numCols).") --- End diff -- Many ops in this class seem to use this method. I'm not sure how expensive the string interpolation is. > Add sparse vector and sparse matrix types to machine learning library > --------------------------------------------------------------------- > > Key: FLINK-1718 > URL: https://issues.apache.org/jira/browse/FLINK-1718 > Project: Flink > Issue Type: New Feature > Components: Machine Learning Library > Reporter: Till Rohrmann > Assignee: Till Rohrmann > Labels: ML > > Currently, the machine learning library only supports dense matrix and dense > vectors. For future algorithms it would be beneficial to also support sparse > vectors and matrices. > I'd propose to use the compressed sparse column (CSC) representation, because > it allows rather efficient operations compared to a map backed sparse > matrix/vector implementation. Furthermore, this is also the format the Breeze > library expects for sparse matrices/vectors. Thus, it is easy to convert to a > sparse breeze data structure which provides us with many linear algebra > operations. > BIDMat [1] uses the same data representation. > Resources: > [1] [https://github.com/BIDData/BIDMat] -- This message was sent by Atlassian JIRA (v6.3.4#6332)