org.apache.mahout.math.FileBasedSparseBinaryMatrix Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mahout-math Show documentation
Show all versions of mahout-math Show documentation
High performance scientific and technical computing data structures and methods,
mostly based on CERN's
Colt Java API
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
/**
* Provides a way to get data from a file and treat it as if it were a matrix, but avoids putting
* all that data onto the Java heap. Instead, the file is mapped into non-heap memory as a
* DoubleBuffer and we access that instead. The interesting aspect of this is that the values in
* the matrix are binary and sparse so we don't need to store the actual data, just the location of
* non-zero values.
*
* Currently file data is formatted as follows:
*
* - A magic number to indicate the file format.
- The size of the matrix (max rows
* and columns possible)
- Number of non-zeros in each row.
- A list of non-zero
* columns for each row. The list starts with a count and then has column numbers
*
* It would be preferable to use something like protobufs to define the format so that we can use
* different row formats for different kinds of data. For instance, Golay coding of column numbers
* or compressed bit vectors might be good representations for some purposes.
*/
public final class FileBasedSparseBinaryMatrix extends AbstractMatrix {
private static final int MAGIC_NUMBER_V0 = 0x12d7067d;
private final List data = Lists.newArrayList();
private int[] bufferIndex;
private int[] rowOffset;
private int[] rowSize;
/**
* Constructs an empty matrix of the given size.
*
* @param rows The number of rows in the result.
* @param columns The number of columns in the result.
*/
public FileBasedSparseBinaryMatrix(int rows, int columns) {
super(rows, columns);
}
public void setData(File f) throws IOException {
List buffers = Lists.newArrayList();
FileChannel input = new FileInputStream(f).getChannel();
buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, Math.min(Integer.MAX_VALUE, f.length())));
data.add(buffers.get(0).asIntBuffer());
Preconditions.checkArgument(buffers.get(0).getInt() == MAGIC_NUMBER_V0, "Wrong type of file");
int rows = buffers.get(0).getInt();
int cols = buffers.get(0).getInt();
Preconditions.checkArgument(rows == rowSize());
Preconditions.checkArgument(cols == columnSize());
rowOffset = new int[rows];
rowSize = new int[rows];
bufferIndex = new int[rows];
int offset = 12 + 4 * rows;
for (int i = 0; i < rows; i++) {
int size = buffers.get(0).getInt();
int buffer = 0;
while (buffer < buffers.size()) {
if (offset + size * 4 <= buffers.get(buffer).limit()) {
break;
} else {
offset -= buffers.get(buffer).capacity();
}
}
if (buffer == buffers.size()) {
buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, Math.min(Integer.MAX_VALUE, f.length() - offset)));
data.add(buffers.get(buffer).asIntBuffer());
}
rowOffset[i] = offset / 4;
rowSize[i] = size;
bufferIndex[i] = buffer;
// final SparseBinaryVector v = new SparseBinaryVector(buffers.get(buffer), columns, offset, size);
// this.rows.add(v);
offset += size * 4;
}
}
public static void writeMatrix(File f, Matrix m) throws IOException {
Preconditions.checkArgument(f.canWrite(), "Can't write to output file");
FileOutputStream fos = new FileOutputStream(f);
// write header
DataOutputStream out = new DataOutputStream(fos);
out.writeInt(MAGIC_NUMBER_V0);
out.writeInt(m.rowSize());
out.writeInt(m.columnSize());
// compute offsets and write row headers
for (MatrixSlice row : m) {
int nondefaultElements = row.vector().getNumNondefaultElements();
out.writeInt(nondefaultElements);
}
// write rows
for (MatrixSlice row : m) {
List columns = Lists.newArrayList(Iterables.transform(row.vector().nonZeroes(),
new Function() {
@Override
public Integer apply(Vector.Element element) {
return element.index();
}
}));
Collections.sort(columns);
for (Integer column : columns) {
out.writeInt(column);
}
}
out.close();
fos.close();
}
/**
* Assign the other vector values to the column of the receiver
*
* @param column the int row to assign
* @param other a Vector
* @return the modified receiver
* @throws org.apache.mahout.math.CardinalityException
* if the cardinalities differ
*/
@Override
public Matrix assignColumn(int column, Vector other) {
throw new UnsupportedOperationException("Default operation");
}
/**
* Assign the other vector values to the row of the receiver
*
* @param row the int row to assign
* @param other a Vector
* @return the modified receiver
* @throws org.apache.mahout.math.CardinalityException
* if the cardinalities differ
*/
@Override
public Matrix assignRow(int row, Vector other) {
throw new UnsupportedOperationException("Default operation");
}
/**
* Return the value at the given indexes, without checking bounds
*
* @param rowIndex an int row index
* @param columnIndex an int column index
* @return the double at the index
*/
@Override
public double getQuick(int rowIndex, int columnIndex) {
IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
tmp.position(rowOffset[rowIndex]);
tmp.limit(rowSize[rowIndex]);
tmp = tmp.slice();
return searchForIndex(tmp, columnIndex);
}
private static double searchForIndex(IntBuffer row, int columnIndex) {
int high = row.limit();
if (high == 0) {
return 0;
}
int low = 0;
while (high > low) {
int mid = (low + high) / 2;
if (row.get(mid) < columnIndex) {
low = mid + 1;
} else {
high = mid;
}
}
if (low >= row.limit()) {
return 0;
} else if (high == low && row.get(low) == columnIndex) {
return 1;
} else {
return 0;
}
}
/**
* Return an empty matrix of the same underlying class as the receiver
*
* @return a Matrix
*/
@Override
public Matrix like() {
throw new UnsupportedOperationException("Default operation");
}
/**
* Returns an empty matrix of the same underlying class as the receiver and of the specified
* size.
*
* @param rows the int number of rows
* @param columns the int number of columns
*/
@Override
public Matrix like(int rows, int columns) {
return new DenseMatrix(rows, columns);
}
/**
* Set the value at the given index, without checking bounds
*
* @param row an int row index into the receiver
* @param column an int column index into the receiver
* @param value a double value to set
*/
@Override
public void setQuick(int row, int column, double value) {
throw new UnsupportedOperationException("Default operation");
}
/**
* Return a view into part of a matrix. Changes to the view will change the original matrix.
*
* @param offset an int[2] offset into the receiver
* @param size the int[2] size of the desired result
* @return a matrix that shares storage with part of the original matrix.
* @throws org.apache.mahout.math.CardinalityException
* if the length is greater than the cardinality of the receiver
* @throws org.apache.mahout.math.IndexException
* if the offset is negative or the offset+length is outside of the receiver
*/
@Override
public Matrix viewPart(int[] offset, int[] size) {
throw new UnsupportedOperationException("Default operation");
}
/**
* Returns a view of a row. Changes to the view will affect the original.
*
* @param rowIndex Which row to return.
* @return A vector that references the desired row.
*/
@Override
public Vector viewRow(int rowIndex) {
IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
tmp.position(rowOffset[rowIndex]);
tmp.limit(rowOffset[rowIndex] + rowSize[rowIndex]);
tmp = tmp.slice();
return new SparseBinaryVector(tmp, columnSize());
}
private static class SparseBinaryVector extends AbstractVector {
private final IntBuffer buffer;
private final int maxIndex;
private SparseBinaryVector(IntBuffer buffer, int maxIndex) {
super(maxIndex);
this.buffer = buffer;
this.maxIndex = maxIndex;
}
SparseBinaryVector(ByteBuffer row, int maxIndex, int offset, int size) {
super(maxIndex);
row = row.asReadOnlyBuffer();
row.position(offset);
row.limit(offset + size * 4);
row = row.slice();
this.buffer = row.slice().asIntBuffer();
this.maxIndex = maxIndex;
}
/**
* Subclasses must override to return an appropriately sparse or dense result
*
* @param rows the row cardinality
* @param columns the column cardinality
* @return a Matrix
*/
@Override
protected Matrix matrixLike(int rows, int columns) {
throw new UnsupportedOperationException("Default operation");
}
/**
* Used internally by assign() to update multiple indices and values at once.
* Only really useful for sparse vectors (especially SequentialAccessSparseVector).
*
* If someone ever adds a new type of sparse vectors, this method must merge (index, value) pairs into the vector.
*
* @param updates a mapping of indices to values to merge in the vector.
*/
@Override
public void mergeUpdates(OrderedIntDoubleMapping updates) {
throw new UnsupportedOperationException("Cannot mutate SparseBinaryVector");
}
/**
* @return true iff this implementation should be considered dense -- that it explicitly represents
* every value
*/
@Override
public boolean isDense() {
return false;
}
/**
* @return true iff this implementation should be considered to be iterable in index order in an
* efficient way. In particular this implies that {@link #iterator()} and {@link
* #iterateNonZero()} return elements in ascending order by index.
*/
@Override
public boolean isSequentialAccess() {
return true;
}
/**
* Iterates over all elements
*
* NOTE: Implementations may choose to reuse the Element returned
* for performance reasons, so if you need a copy of it, you should call {@link #getElement(int)}
* for the given index
*
* @return An {@link java.util.Iterator} over all elements
*/
@Override
public Iterator iterator() {
return new AbstractIterator() {
int i = 0;
@Override
protected Element computeNext() {
if (i < maxIndex) {
return new Element() {
int index = i++;
/**
* @return the value of this vector element.
*/
@Override
public double get() {
return getQuick(index);
}
/**
* @return the index of this vector element.
*/
@Override
public int index() {
return index;
}
/**
* @param value Set the current element to value.
*/
@Override
public void set(double value) {
throw new UnsupportedOperationException("Default operation");
}
};
} else {
return endOfData();
}
}
};
}
/**
* Iterates over all non-zero elements. NOTE: Implementations may choose to reuse the Element
* returned for performance reasons, so if you need a copy of it, you should call {@link
* #getElement(int)} for the given index
*
* @return An {@link java.util.Iterator} over all non-zero elements
*/
@Override
public Iterator iterateNonZero() {
return new AbstractIterator() {
int i = 0;
@Override
protected Element computeNext() {
if (i < buffer.limit()) {
return new BinaryReadOnlyElement(buffer.get(i++));
} else {
return endOfData();
}
}
};
}
/**
* Return the value at the given index, without checking bounds
*
* @param index an int index
* @return the double at the index
*/
@Override
public double getQuick(int index) {
return searchForIndex(buffer, index);
}
/**
* Return an empty vector of the same underlying class as the receiver
*
* @return a Vector
*/
@Override
public Vector like() {
return new RandomAccessSparseVector(size());
}
@Override
public Vector like(int cardinality) {
return new RandomAccessSparseVector(cardinality);
}
/**
* Copy the vector for fast operations.
*
* @return a Vector
*/
@Override
protected Vector createOptimizedCopy() {
return new RandomAccessSparseVector(size()).assign(this);
}
/**
* Set the value at the given index, without checking bounds
*
* @param index an int index into the receiver
* @param value a double value to set
*/
@Override
public void setQuick(int index, double value) {
throw new UnsupportedOperationException("Read-only view");
}
/**
* Set the value at the given index, without checking bounds
*
* @param index an int index into the receiver
* @param increment a double value to set
*/
@Override
public void incrementQuick(int index, double increment) {
throw new UnsupportedOperationException("Read-only view");
}
/**
* Return the number of values in the recipient which are not the default value. For instance, for
* a sparse vector, this would be the number of non-zero values.
*
* @return an int
*/
@Override
public int getNumNondefaultElements() {
return buffer.limit();
}
@Override
public double getLookupCost() {
return 1;
}
@Override
public double getIteratorAdvanceCost() {
return 1;
}
@Override
public boolean isAddConstantTime() {
throw new UnsupportedOperationException("Can't add binary value");
}
}
public static class BinaryReadOnlyElement implements Vector.Element {
private final int index;
public BinaryReadOnlyElement(int index) {
this.index = index;
}
/**
* @return the value of this vector element.
*/
@Override
public double get() {
return 1;
}
/**
* @return the index of this vector element.
*/
@Override
public int index() {
return index;
}
/**
* @param value Set the current element to value.
*/
@Override
public void set(double value) {
throw new UnsupportedOperationException("Can't set binary value");
}
}
}