Author: tdunning
Date: Thu Sep  6 22:14:51 2012
New Revision: 1381778

URL: http://svn.apache.org/viewvc?rev=1381778&view=rev
Log:
MAHOUT-1059 - Oops.  Added test without the class.  This class isn't really 
ready, but the build should work.

Added:
    
mahout/trunk/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java

Added: 
mahout/trunk/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java?rev=1381778&view=auto
==============================================================================
--- 
mahout/trunk/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
 (added)
+++ 
mahout/trunk/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
 Thu Sep  6 22:14:51 2012
@@ -0,0 +1,484 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+
+import javax.annotation.Nullable;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Provides a way to get data from a file and treat it as if it were a matrix, 
but avoids putting
+ * all that data onto the Java heap.  Instead, the file is mapped into 
non-heap memory as a
+ * DoubleBuffer and we access that instead.  The interesting aspect of this is 
that the values in
+ * the matrix are binary and sparse so we don't need to store the actual data, 
just the location of
+ * non-zero values.
+ * <p/>
+ * Currently file data is formatted as follows:
+ * <p/>
+ * <ul> <li>A magic number to indicate the file format.</li> <li>The size of 
the matrix (max rows
+ * and columns possible)</li> <li>Number of non-zeros in each row.</li> <li>A 
list of non-zero
+ * columns for each row.  The list starts with a count and then has column 
numbers</li> </ul>
+ * <p/>
+ * It would be preferable to use something like protobufs to define the format 
so that we can use
+ * different row formats for different kinds of data.  For instance, Golay 
coding of column numbers
+ * or compressed bit vectors might be good representations for some purposes.
+ */
+public class FileBasedSparseBinaryMatrix extends AbstractMatrix {
+  private static final int MAGIC_NUMBER_V0 = 0x12d7067d;
+
+  private List<IntBuffer> data = Lists.newArrayList();
+  private int[] bufferIndex;
+  private int[] rowOffset;
+  private int[] rowSize;
+
+  /**
+   * Constructs an empty matrix of the given size.
+   *
+   * @param rows    The number of rows in the result.
+   * @param columns The number of columns in the result.
+   */
+  public FileBasedSparseBinaryMatrix(int rows, int columns) {
+    super(rows, columns);
+  }
+
+  public void setData(File f, boolean loadNow) throws IOException {
+    List<ByteBuffer> buffers = Lists.newArrayList();
+    FileChannel input = new FileInputStream(f).getChannel();
+
+    buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, 
Math.min(Integer.MAX_VALUE, f.length())));
+    data.add(buffers.get(0).asIntBuffer());
+    Preconditions.checkArgument(buffers.get(0).getInt() == MAGIC_NUMBER_V0, 
"Wrong type of file");
+
+    int rows = buffers.get(0).getInt();
+    int cols = buffers.get(0).getInt();
+    Preconditions.checkArgument(rows == rowSize());
+    Preconditions.checkArgument(cols == columnSize());
+
+    rowOffset = new int[rows];
+    rowSize = new int[rows];
+    bufferIndex = new int[rows];
+
+    int offset = 12 + 4 * rows;
+    for (int i = 0; i < rows; i++) {
+      int size = buffers.get(0).getInt();
+      int buffer = 0;
+      while (buffer < buffers.size()) {
+        if (offset + size * 4 <= buffers.get(buffer).limit()) {
+          break;
+        } else {
+          offset -= buffers.get(buffer).capacity();
+        }
+      }
+      if (buffer == buffers.size()) {
+        buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, 
Math.min(Integer.MAX_VALUE, f.length() - offset)));
+        data.add(buffers.get(buffer).asIntBuffer());
+      }
+      rowOffset[i] = offset / 4;
+      rowSize[i] = size;
+      bufferIndex[i] = buffer;
+
+//      final SparseBinaryVector v = new 
SparseBinaryVector(buffers.get(buffer), columns, offset, size);
+//      this.rows.add(v);
+      offset += size * 4;
+    }
+  }
+
+  public static void writeMatrix(File f, Matrix m) throws IOException {
+    Preconditions.checkArgument(f.canWrite(), "Can't write to output file");
+    FileOutputStream fos = new FileOutputStream(f);
+
+    // write header
+    DataOutputStream out = new DataOutputStream(fos);
+    out.writeInt(MAGIC_NUMBER_V0);
+    out.writeInt(m.rowSize());
+    out.writeInt(m.columnSize());
+
+    // compute offsets and write row headers
+    for (MatrixSlice row : m) {
+      final int nondefaultElements = row.vector().getNumNondefaultElements();
+      out.writeInt(nondefaultElements);
+    }
+
+    // write rows
+    for (MatrixSlice row : m) {
+      List<Integer> columns = 
Lists.newArrayList(Iterators.transform(row.vector().iterateNonZero(),
+        new Function<Vector.Element, Integer>() {
+          @Override
+          public Integer apply(@Nullable Vector.Element element) {
+            return element.index();
+          }
+        }));
+      Collections.sort(columns);
+
+      for (Integer column : columns) {
+        out.writeInt(column);
+      }
+    }
+
+    out.close();
+    fos.close();
+  }
+
+  /**
+   * Assign the other vector values to the column of the receiver
+   *
+   * @param column the int row to assign
+   * @param other  a Vector
+   * @return the modified receiver
+   * @throws org.apache.mahout.math.CardinalityException
+   *          if the cardinalities differ
+   */
+  @Override
+  public Matrix assignColumn(int column, Vector other) {
+    throw new UnsupportedOperationException("Default operation");
+  }
+
+  /**
+   * Assign the other vector values to the row of the receiver
+   *
+   * @param row   the int row to assign
+   * @param other a Vector
+   * @return the modified receiver
+   * @throws org.apache.mahout.math.CardinalityException
+   *          if the cardinalities differ
+   */
+  @Override
+  public Matrix assignRow(int row, Vector other) {
+    throw new UnsupportedOperationException("Default operation");
+  }
+
+  /**
+   * Return the value at the given indexes, without checking bounds
+   *
+   * @param rowIndex    an int row index
+   * @param columnIndex an int column index
+   * @return the double at the index
+   */
+  @Override
+  public double getQuick(int rowIndex, int columnIndex) {
+    IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
+    tmp.position(rowOffset[rowIndex]);
+    tmp.limit(rowSize[rowIndex]);
+    tmp = tmp.slice();
+    return searchForIndex(tmp, columnIndex);
+  }
+
+  private static double searchForIndex(IntBuffer row, int columnIndex) {
+    int low = 0;
+    int high = row.limit();
+    if (high == 0) {
+      return 0;
+    }
+    while (high > low) {
+      int mid = (low + high) / 2;
+      if (row.get(mid) < columnIndex) {
+        low = mid + 1;
+      } else {
+        high = mid;
+      }
+    }
+    if (low >= row.limit()) {
+      return 0;
+    }
+    else if (high == low && row.get(low) == columnIndex) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  /**
+   * Return an empty matrix of the same underlying class as the receiver
+   *
+   * @return a Matrix
+   */
+  @Override
+  public Matrix like() {
+    throw new UnsupportedOperationException("Default operation");
+  }
+
+  /**
+   * Returns an empty matrix of the same underlying class as the receiver and 
of the specified
+   * size.
+   *
+   * @param rows    the int number of rows
+   * @param columns the int number of columns
+   */
+  @Override
+  public Matrix like(int rows, int columns) {
+    return new DenseMatrix(rows, columns);
+  }
+
+  /**
+   * Set the value at the given index, without checking bounds
+   *
+   * @param row    an int row index into the receiver
+   * @param column an int column index into the receiver
+   * @param value  a double value to set
+   */
+  @Override
+  public void setQuick(int row, int column, double value) {
+    throw new UnsupportedOperationException("Default operation");
+  }
+
+  /**
+   * Return a view into part of a matrix.  Changes to the view will change the 
original matrix.
+   *
+   * @param offset an int[2] offset into the receiver
+   * @param size   the int[2] size of the desired result
+   * @return a matrix that shares storage with part of the original matrix.
+   * @throws org.apache.mahout.math.CardinalityException
+   *          if the length is greater than the cardinality of the receiver
+   * @throws org.apache.mahout.math.IndexException
+   *          if the offset is negative or the offset+length is outside of the 
receiver
+   */
+  @Override
+  public Matrix viewPart(int[] offset, int[] size) {
+    throw new UnsupportedOperationException("Default operation");
+  }
+
+  /**
+   * Returns a view of a row.  Changes to the view will affect the original.
+   *
+   * @param rowIndex Which row to return.
+   * @return A vector that references the desired row.
+   */
+  @Override
+  public Vector viewRow(int rowIndex) {
+    IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
+    tmp.position(rowOffset[rowIndex]);
+    tmp.limit(rowOffset[rowIndex] + rowSize[rowIndex]);
+    tmp = tmp.slice();
+    return new SparseBinaryVector(rowSize[rowIndex], tmp, columnSize());
+  }
+
+  private static class SparseBinaryVector extends AbstractVector {
+    private IntBuffer buffer;
+    private int maxIndex;
+
+    private SparseBinaryVector(int size, IntBuffer buffer, int maxIndex) {
+      super(maxIndex);
+      this.buffer = buffer;
+      this.maxIndex = maxIndex;
+    }
+
+    public SparseBinaryVector(ByteBuffer row, int maxIndex, int offset, int 
size) {
+      super(maxIndex);
+      row = row.asReadOnlyBuffer();
+      row.position(offset);
+      row.limit(offset + size * 4);
+      row = row.slice();
+      this.buffer = row.slice().asIntBuffer();
+      this.maxIndex = maxIndex;
+    }
+
+    /**
+     * Subclasses must override to return an appropriately sparse or dense 
result
+     *
+     * @param rows    the row cardinality
+     * @param columns the column cardinality
+     * @return a Matrix
+     */
+    @Override
+    protected Matrix matrixLike(int rows, int columns) {
+      throw new UnsupportedOperationException("Default operation");
+    }
+
+    /**
+     * @return true iff this implementation should be considered dense -- that 
it explicitly represents
+     *         every value
+     */
+    @Override
+    public boolean isDense() {
+      return false;
+    }
+
+    /**
+     * @return true iff this implementation should be considered to be 
iterable in index order in an
+     *         efficient way. In particular this implies that {@link 
#iterator()} and {@link
+     *         #iterateNonZero()} return elements in ascending order by index.
+     */
+    @Override
+    public boolean isSequentialAccess() {
+      return true;
+    }
+
+    /**
+     * Iterates over all elements
+     *
+     * NOTE: Implementations may choose to reuse the Element returned
+     * for performance reasons, so if you need a copy of it, you should call 
{@link #getElement(int)}
+     * for the given index
+     *
+     * @return An {@link java.util.Iterator} over all elements
+     */
+    @Override
+    public Iterator<Element> iterator() {
+      return new AbstractIterator<Element>() {
+        int i = 0;
+        int index = 0;
+
+        @Override
+        protected Element computeNext() {
+          if (i < maxIndex) {
+            return new Element() {
+              int index = i++;
+              /**
+               * @return the value of this vector element.
+               */
+              @Override
+              public double get() {
+                return getQuick(index);
+              }
+
+              /**
+               * @return the index of this vector element.
+               */
+              @Override
+              public int index() {
+                return index;
+              }
+
+              /**
+               * @param value Set the current element to value.
+               */
+              @Override
+              public void set(double value) {
+                throw new UnsupportedOperationException("Default operation");
+              }
+            };
+          } else {
+            return endOfData();
+          }
+        }
+      };
+    }
+
+    /**
+      * Iterates over all non-zero elements. <p/> NOTE: Implementations may 
choose to reuse the Element
+      * returned for performance reasons, so if you need a copy of it, you 
should call {@link
+      * #getElement(int)} for the given index
+      *
+      * @return An {@link java.util.Iterator} over all non-zero elements
+      */
+    @Override
+    public Iterator<Element> iterateNonZero() {
+      return new AbstractIterator<Element>() {
+        int i = 0;
+        @Override
+        protected Element computeNext() {
+          if (i < buffer.limit()) {
+          return new BinaryReadOnlyElement(buffer.get(i++));
+        } else {
+          return endOfData();
+        }
+      }
+    };
+  }
+
+  /**
+     * Return the value at the given index, without checking bounds
+     *
+     * @param index an int index
+     * @return the double at the index
+     */
+    @Override
+    public double getQuick(int index) {
+      return searchForIndex(buffer, index);
+    }
+
+    /**
+     * Return an empty vector of the same underlying class as the receiver
+     *
+     * @return a Vector
+     */
+    @Override
+    public Vector like() {
+      return new RandomAccessSparseVector(size());
+    }
+
+    /**
+     * Set the value at the given index, without checking bounds
+     *
+     * @param index an int index into the receiver
+     * @param value a double value to set
+     */
+    @Override
+    public void setQuick(int index, double value) {
+      throw new UnsupportedOperationException("Read-only view");
+    }
+
+    /**
+     * Return the number of values in the recipient which are not the default 
value.  For instance, for
+     * a sparse vector, this would be the number of non-zero values.
+     *
+     * @return an int
+     */
+    @Override
+    public int getNumNondefaultElements() {
+      return buffer.limit();
+    }
+  }
+
+  public static class BinaryReadOnlyElement implements Vector.Element {
+    private int index;
+
+    public BinaryReadOnlyElement(int index) {
+      this.index = index;
+    }
+
+    /**
+     * @return the value of this vector element.
+     */
+    @Override
+    public double get() {
+      return 1;
+    }
+
+    /**
+     * @return the index of this vector element.
+     */
+    @Override
+    public int index() {
+      return index;
+    }
+
+    /**
+     * @param value Set the current element to value.
+     */
+    @Override
+    public void set(double value) {
+      throw new UnsupportedOperationException("Can't set binary value");
+    }
+  }
+}


Reply via email to