All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.matrix.DenseMatrix Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
package org.wikibrain.matrix;

import gnu.trove.map.hash.TIntLongHashMap;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implementation of a dense matrix.
 * The rows are memory mapped, so they can be immediately read from disk.
 * All rows must have the same columns in the same order.
 *
 * <p>Header layout, as read by {@code readHeaders()} (byte offsets from the start of the file):
 * <ul>
 *   <li>0: int magic number, must equal {@link #FILE_HEADER}</li>
 *   <li>4, 8: two floats handed to the {@link ValueConf} constructor</li>
 *   <li>12: int number of rows</li>
 *   <li>16: int number of columns</li>
 *   <li>20: {@code numRows} int row ids</li>
 *   <li>then {@code numRows} long row offsets</li>
 *   <li>then {@code numCols} int column ids, which must be increasing</li>
 * </ul>
 */
public class DenseMatrix implements Matrix {

    public static final Logger LOG = LoggerFactory.getLogger(DenseMatrix.class);

    public static final int FILE_HEADER = 0xabccba;

    // Bytes occupied by the fixed-length part of the header (magic, two floats, two counts).
    private static final int FIXED_HEADER_BYTES = 20;

    private int numRows;
    private IntBuffer rowIds;
    private LongBuffer rowOffsets;

    private int colIds[];
    private final FileChannel channel;
    private final File path;

    MemoryMappedMatrix rowBuffers;
    private ValueConf vconf;

    // default header page size is 100MB, will be expanded if necessary
    public static final int DEFAULT_HEADER_SIZE = 100 * 1024 * 1024;

    /**
     * Create a dense matrix based on the data in a particular file.
     * If reading the header or setting up the row buffers fails, the file
     * channel opened here is closed again before the exception propagates.
     *
     * @param path Path to the matrix data file.
     * @throws java.io.IOException if the file cannot be opened or its header is invalid
     */
    public DenseMatrix(File path) throws IOException {
        this.path = path;
        info("initializing dense matrix with file length " + FileUtils.sizeOf(path));
        this.channel = (new FileInputStream(path)).getChannel();
        boolean initialized = false;
        try {
            readHeaders();
            rowBuffers = new MemoryMappedMatrix(path, channel, rowIds, rowOffsets);
            initialized = true;
        } finally {
            if (!initialized) {
                // Do not leak the file descriptor when construction fails.
                try {
                    channel.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting; just note this one.
                    LOG.debug("dense matrix " + path + ": failed to close channel after error", ignored);
                }
            }
        }
    }

    /**
     * Maps the start of the file and populates {@code vconf}, {@code numRows},
     * {@code rowIds}, {@code rowOffsets} and {@code colIds} from the header.
     *
     * @throws IOException if the file is too short, the magic number is wrong,
     *         the row/column counts are negative, or the header does not fit in the file
     * @throws IllegalArgumentException if the column ids are not increasing
     */
    private void readHeaders() throws IOException {
        long fileSize = channel.size();
        if (fileSize < FIXED_HEADER_BYTES) {
            throw new IOException("file too short to contain a matrix header: " + fileSize + " bytes");
        }
        long size = Math.min(fileSize, DEFAULT_HEADER_SIZE);
        MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, size);

        // read header
        int magic = buffer.getInt(0);
        if (magic != FILE_HEADER) {
            throw new IOException("invalid file header: " + magic);
        }
        this.vconf = new ValueConf(buffer.getFloat(4), buffer.getFloat(8));
        this.numRows = buffer.getInt(12);
        int numCols = buffer.getInt(16);
        if (numRows < 0 || numCols < 0) {
            throw new IOException("invalid matrix dimensions: " + numRows + " rows, " + numCols + " columns");
        }

        // Computed in long: 12 * numRows overflows int for large (or corrupt) row counts.
        long headerSize = FIXED_HEADER_BYTES + 12L * numRows + 4L * numCols;
        if (headerSize > Integer.MAX_VALUE) {
            // A single mapped buffer cannot address more than Integer.MAX_VALUE bytes.
            throw new IOException("header too large to map: " + headerSize + " bytes");
        }
        if (headerSize > fileSize) {
            throw new IOException("truncated file: header needs " + headerSize
                    + " bytes but file has only " + fileSize);
        }
        if (headerSize > DEFAULT_HEADER_SIZE) {
            info("maxPageSize not large enough for entire header. Resizing to " + headerSize);
            buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, headerSize);
        }

        // headerSize fits in an int (checked above), so these offsets cannot overflow.
        int rowIdsStart = FIXED_HEADER_BYTES;
        int rowOffsetsStart = rowIdsStart + 4 * numRows;
        int colIdsStart = rowOffsetsStart + 8 * numRows;

        debug("preparing buffer for " + numRows + " rows");
        buffer.position(rowIdsStart);
        buffer.limit(rowOffsetsStart);
        rowIds = buffer.slice().asIntBuffer();
        if (rowIds.capacity() != numRows) {
            throw new IllegalStateException();
        }
        buffer.position(rowOffsetsStart);
        buffer.limit(colIdsStart);
        rowOffsets = buffer.slice().asLongBuffer();
        if (rowOffsets.capacity() != numRows) {
            throw new IllegalStateException();
        }

        // read column ids (absolute reads, so only the limit has to be widened)
        buffer.limit((int) headerSize);
        int pos = colIdsStart;
        colIds = new int[numCols];
        for (int i = 0; i < numCols; i++) {
            colIds[i] = buffer.getInt(pos);
            pos += 4;
        }
        if (!SparseMatrixUtils.isIncreasing(colIds)) {
            throw new IllegalArgumentException("Columns must be sorted by id");
        }
        info("read " + colIds.length + " column ids");
    }

    /**
     * Looks up a row through the memory-mapped row buffers.
     *
     * @param rowId id of the requested row
     * @return the row wrapped with this matrix's value configuration and column ids,
     *         or {@code null} if the row buffers return no data for {@code rowId}
     * @throws IOException if the underlying row lookup fails
     */
    @Override
    public DenseMatrixRow getRow(int rowId) throws IOException {
        ByteBuffer bb = rowBuffers.getRow(rowId);
        if (bb == null) {
            return null;
        } else {
            return new DenseMatrixRow(vconf, colIds, bb);
        }
    }

    /**
     * @return the row ids as reported by {@code rowBuffers.getRowIdsInDiskOrder()}
     */
    @Override
    public int[] getRowIds() {
        return rowBuffers.getRowIdsInDiskOrder();
    }

    /**
     * @return the column ids read from the file header. This is the internal
     *         array (also handed to every row), not a copy; callers must not modify it.
     */
    public int[] getColIds() {
        return colIds;
    }

    /**
     * @return the number of rows recorded in the file header
     */
    @Override
    public int getNumRows() {
        return numRows;
    }

    /**
     * @return the value configuration built from the two floats in the file header
     */
    public ValueConf getValueConf() {
        return vconf;
    }


    /**
     * @return a new iterator over the rows, in the order given by
     *         {@code rowBuffers.getRowIdsInDiskOrder()}
     */
    @Override
    public Iterator iterator() {
        return new DenseMatrixIterator();
    }

    /**
     * Iterates over the rows of the enclosing matrix. Removal is not supported.
     */
    public class DenseMatrixIterator implements Iterator<DenseMatrixRow> {
        private AtomicInteger i = new AtomicInteger();
        private int[] rowIds = rowBuffers.getRowIdsInDiskOrder();
        // Never step past the id array, even if it disagrees with the header's row count.
        private final int count = Math.min(numRows, rowIds.length);

        @Override
        public boolean hasNext() {
            return i.get() < count;
        }

        /**
         * @return the next row, or {@code null} if reading it failed with an
         *         {@link IOException} (the failure is logged)
         * @throws java.util.NoSuchElementException if the iteration has no more rows
         */
        @Override
        public DenseMatrixRow next() {
            int index = i.getAndIncrement();
            if (index >= count) {
                throw new java.util.NoSuchElementException();
            }
            try {
                return getRow(rowIds[index]);
            } catch (IOException e) {
                LOG.error("getRow failed", e);
                return null;
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * @return the file this matrix was opened from
     */
    @Override
    public File getPath() {
        return path;
    }

    /**
     * Closes the row buffers and the file channel opened by the constructor.
     * The channel is closed even if closing the row buffers fails; closing an
     * already-closed channel has no effect.
     *
     * @throws IOException if closing the row buffers or the channel fails
     */
    @Override
    public void close() throws IOException {
        try {
            rowBuffers.close();
        } finally {
            channel.close();
        }
    }

    /** Logs an informational message prefixed with this matrix's path. */
    private void info(String message) {
        LOG.info("dense matrix {}: {}", path, message);
    }

    /** Logs a debug message prefixed with this matrix's path. */
    private void debug(String message) {
        LOG.debug("dense matrix {}: {}", path, message);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy