phaniarnab commented on code in PR #1650:
URL: https://github.com/apache/systemds/pull/1650#discussion_r932961300
##########
src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCountDistinct.java:
##########
@@ -102,66 +114,255 @@ else if(in.getNonZeros() < minimumSize) {
*
* Benefit: precise, but uses memory, on the scale of inputs number of
distinct values.
*
- * @param in The input matrix to count number distinct values in
- * @return The absolute distinct count
+ * @param blkIn The input matrix to count number distinct values in
+ * @return A matrix block containing the absolute distinct count for
the entire input or along given row/col axis
*/
- private static int countDistinctValuesNaive(MatrixBlock in) {
+ private static MatrixBlock countDistinctValuesNaive(MatrixBlock blkIn,
CountDistinctOperator op) {
+
+ if (blkIn.isEmpty()) {
+ return new MatrixBlock(1);
+ }
+ else if(blkIn instanceof CompressedMatrixBlock) {
+ throw new NotImplementedException("countDistinct() does
not support CompressedMatrixBlock");
+ }
+
Set<Double> distinct = new HashSet<>();
+ MatrixBlock blkOut;
double[] data;
- if(in.isEmpty())
- return 1;
- else if(in instanceof CompressedMatrixBlock)
- throw new NotImplementedException();
- long nonZeros = in.getNonZeros();
+ if (op.getDirection().isRowCol()) {
+ blkOut = new MatrixBlock(1, 1, false);
- if(nonZeros != -1 && nonZeros < in.getNumColumns() *
in.getNumRows()) {
- distinct.add(0d);
- }
+ long distinctCount = 0;
+ long nonZeros = blkIn.getNonZeros();
- if(in.sparseBlock != null) {
- SparseBlock sb = in.sparseBlock;
+ // Check if input matrix contains any 0 values for
RowCol case.
+ // This does not apply to row/col case, where we count
nnz per row or col during iteration.
+ if(nonZeros != -1 && nonZeros < blkIn.getNumColumns() *
blkIn.getNumRows()) {
+ distinct.add(0d);
+ }
- if(in.sparseBlock.isContiguous()) {
- data = sb.values(0);
- countDistinctValuesNaive(data, distinct);
+ if(blkIn.getSparseBlock() != null) {
+ SparseBlock sb = blkIn.getSparseBlock();
+ if(blkIn.getSparseBlock().isContiguous()) {
+ // COO, CSR
+ data = sb.values(0);
+ distinctCount =
countDistinctValuesNaive(data, distinct);
+ } else {
+ // MCSR
+ for(int i = 0; i < blkIn.getNumRows();
i++) {
+ if(!sb.isEmpty(i)) {
+ data =
blkIn.getSparseBlock().values(i);
+ distinctCount =
countDistinctValuesNaive(data, distinct);
+ }
+ }
+ }
+ } else if(blkIn.getDenseBlock() != null) {
+ DenseBlock db = blkIn.getDenseBlock();
+ for (int i = 0; i <= db.numBlocks(); i++) {
+ data = db.valuesAt(i);
+ distinctCount =
countDistinctValuesNaive(data, distinct);
+ }
}
- else {
- for(int i = 0; i < in.getNumRows(); i++) {
- if(!sb.isEmpty(i)) {
- data = in.sparseBlock.values(i);
+
+ blkOut.setValue(0, 0, distinctCount);
+ } else if (op.getDirection().isRow()) {
+ blkOut = blkIn.slice(0, blkIn.getNumRows() - 1, 0, 0);
+
+ if (blkIn.getDenseBlock() != null) {
+ // The naive approach would be to iterate
through every (i, j) in the input. However, can do better
+ // by exploiting the physical layout of dense
blocks - contiguous blocks in row-major order - in memory.
+ DenseBlock db = blkIn.getDenseBlock();
+ for (int bix=0; bix<db.numBlocks(); ++bix) {
+ data = db.valuesAt(bix);
+ for (int rix=bix * db.blockSize();
rix<blkIn.getNumRows(); rix++) {
+ distinct.clear();
+ for (int cix=0;
cix<blkIn.getNumColumns(); ++cix) {
+
distinct.add(data[db.pos(rix, cix)]);
+ }
+ blkOut.setValue(rix, 0,
distinct.size());
+ }
+ }
+ } else if (blkIn.getSparseBlock() != null) {
+ // Each sparse block type - COO, CSR, MCSR -
has a different data representation, which we will exploit
+ // separately.
+ SparseBlock sb = blkIn.getSparseBlock();
+ if (SparseBlockFactory.isSparseBlockType(sb,
SparseBlock.Type.MCSR)) {
+ // Currently, SparseBlockIterator only
provides an interface for cell-wise iteration.
+ // TODO Explore row-wise and
column-wise methods for SparseBlockIterator
+
+ // MCSR enables O(1) access to column
values per row
+ for (int rix=0; rix<blkIn.getNumRows();
++rix) {
+ if (sb.isEmpty(rix)) {
+ continue;
+ }
+ distinct.clear();
+ data = sb.values(rix);
countDistinctValuesNaive(data,
distinct);
+ blkOut.setValue(rix, 0,
distinct.size());
+ }
+ } else if
(SparseBlockFactory.isSparseBlockType(sb, SparseBlock.Type.CSR)) {
+ // Casting is safe given if-condition
+ SparseBlockCSR csrBlock =
(SparseBlockCSR) sb;
+
+ // Data lies in one contiguous block in
CSR format. We will iterate in row-major using O(1) op
+ // size(row) to determine the number of
columns per row.
+ data = csrBlock.values();
+ // We want to iterate through all rows
to keep track of the row index for constructing the output
+ for (int rix=0; rix<blkIn.getNumRows();
++rix) {
+ if (csrBlock.isEmpty(rix)) {
+ continue;
+ }
+ distinct.clear();
+ int rpos = csrBlock.pos(rix);
+ int clen = csrBlock.size(rix);
+ for (int colOffset=0;
colOffset<clen; ++colOffset) {
+ distinct.add(data[rpos
+ colOffset]);
+ }
+ blkOut.setValue(rix, 0,
distinct.size());
+ }
+ } else { // COO
+ if (!(sb instanceof SparseBlockCOO)) {
+ throw new
IllegalArgumentException("Input matrix is of unrecognized type: "
+ +
sb.getClass().getSimpleName());
+ }
+ SparseBlockCOO cooBlock =
(SparseBlockCOO) sb;
+
+ // For COO, we want to avoid using
pos(row) and size(row) as they use binary search, which is a
+ // O(log N) op. Also, isEmpty(row) uses
pos(row) internally.
+ int[] rixs = cooBlock.rowIndexes();
+ data = cooBlock.values();
+ int i = 0; // data iterator
+ int rix = 0; // row index
+ while (rix < cooBlock.numRows() && i <
rixs.length) {
+ distinct.clear();
+ while (i + 1 < rixs.length &&
rixs[i] == rixs[i + 1]) {
+ distinct.add(data[i]);
+ i++;
+ }
+ if (i + 1 < rixs.length) { //
rixs[i] != rixs[i + 1]
+ distinct.add(data[i]);
+ }
+ blkOut.setValue(rix, 0,
distinct.size());
+ rix = (i + 1 < rixs.length)?
rixs[i + 1] : rix;
+ i++;
}
}
}
- }
- else if(in.denseBlock != null) {
- DenseBlock db = in.denseBlock;
- for(int i = 0; i <= db.numBlocks(); i++) {
- data = db.valuesAt(i);
- countDistinctValuesNaive(data, distinct);
+ } else { // Col aggregation
+ blkOut = blkIn.slice(0, 0, 0, blkIn.getNumColumns() -
1);
Review Comment:
Similar to above.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]