[
https://issues.apache.org/jira/browse/FLINK-3919?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15343904#comment-15343904
]
ASF GitHub Bot commented on FLINK-3919:
---------------------------------------
Github user chobeat commented on a diff in the pull request:
https://github.com/apache/flink/pull/1996#discussion_r68012799
--- Diff:
flink-libraries/flink-ml/src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala
---
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.math.distributed
+
+import org.apache.flink.api.scala._
+import org.apache.flink.ml.math.Breeze._
+import org.apache.flink.ml.math.distributed.DistributedMatrix._
+import org.apache.flink.ml.math._
+
+/**
+ * Distributed row-major matrix representation.
+ * @param numRows Number of rows.
+ * @param numCols Number of columns.
+ */
+class DistributedRowMatrix(val data: DataSet[IndexedRow],
+ val numRows: Int,
+ val numCols: Int)
+ extends DistributedMatrix {
+
+ /**
+ * Collects the data in the form of a sequence of coordinates associated with their values.
+ * @return
+ */
+ def toCOO: Seq[(MatrixRowIndex, MatrixColIndex, Double)] = {
+
+ val localRows = data.collect()
+
+ for (IndexedRow(rowIndex, vector) <- localRows;
+ (columnIndex, value) <- vector) yield (rowIndex, columnIndex,
value)
+ }
+
+ /**
+ * Collects the data in the form of a SparseMatrix
+ * @return
+ */
+ def toLocalSparseMatrix: SparseMatrix = {
+ val localMatrix =
+ SparseMatrix.fromCOO(this.numRows, this.numCols, this.toCOO)
+ require(localMatrix.numRows == this.numRows)
+ require(localMatrix.numCols == this.numCols)
+ localMatrix
+ }
+
+ //TODO: convert to dense representation on the distributed matrix and collect it afterward
+ def toLocalDenseMatrix: DenseMatrix =
this.toLocalSparseMatrix.toDenseMatrix
+
+ /**
+ * Apply a higher-order function to each pair of corresponding rows
+ * @param fun
+ * @param other
+ * @return
+ */
+ def byRowOperation(fun: (Vector, Vector) => Vector,
+ other: DistributedRowMatrix): DistributedRowMatrix = {
+ val otherData = other.data
+ require(this.numCols == other.numCols)
+ require(this.numRows == other.numRows)
+
+ val result = this.data
+ .fullOuterJoin(otherData)
+ .where("rowIndex")
+ .equalTo("rowIndex")(
+ (left: IndexedRow, right: IndexedRow) => {
+ val row1 = Option(left) match {
+ case Some(row: IndexedRow) => row
+ case None =>
+ IndexedRow(
+ right.rowIndex,
+ SparseVector.fromCOO(right.values.size, List((0,
0.0))))
+ }
+ val row2 = Option(right) match {
+ case Some(row: IndexedRow) => row
+ case None =>
+ IndexedRow(
+ left.rowIndex,
+ SparseVector.fromCOO(left.values.size, List((0, 0.0))))
+ }
+ IndexedRow(row1.rowIndex, fun(row1.values, row2.values))
+ }
+ )
+ new DistributedRowMatrix(result, numRows, numCols)
+ }
+
+ /**
+ * Add the matrix to another matrix.
+ * @param other
+ * @return
+ */
+ def sum(other: DistributedRowMatrix): DistributedRowMatrix = {
--- End diff --
My reply probably got lost because I posted it on Jira instead of GitHub,
sorry.
"Hmm, you're probably right. I checked Breeze, and they use addition for
matrix addition and sum for element-wise sum."
> Distributed Linear Algebra: row-based matrix
> --------------------------------------------
>
> Key: FLINK-3919
> URL: https://issues.apache.org/jira/browse/FLINK-3919
> Project: Flink
> Issue Type: New Feature
> Components: Machine Learning Library
> Reporter: Simone Robutti
> Assignee: Simone Robutti
>
> Distributed matrix implementation as a DataSet of IndexedRow and related
> operations
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)