All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.matrix.SparseOnDiskMatrix Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.matrix;

import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.SparseVector;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.io.RandomAccessFile;

import java.nio.DoubleBuffer;

import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;

import java.util.WeakHashMap;
import java.util.Map;

import java.util.concurrent.atomic.AtomicInteger;


/**
 * A {@code SparseMatrix} implementation that uses a binary file to read and
 * write. The matrix is still stored in a dense row-column order on disk, so
 * in-order column accesses to elments in a row will perform much better than
 * sequential row accesses to the same column.  However calls to {@link
 * #getRowVector(int) getRowVector} and {@link #getColumnVector(int)
 * getColumnVector} will return {@link SparseVector} instances.  This class is
 * intended for large matrices that need to be on disk due to their dimensions,
 * but whose data is mostly sparse 

* * The {@link DoubleVector} representations returned reflect a snapshot of the state * of the matrix at the time of access. Subsequent updates to the matrix will * not be reflected in these vectors, nor will changes to the vector be * propagated to the matrix. * *

* * If a {@link IOException} is ever raised as a part of executing an the methods * of an instance, the exception is rethrown as a {@link IOError}. * * @author David Jurgens */ public class SparseOnDiskMatrix extends OnDiskMatrix implements SparseMatrix { /** * A weak mapping from the row index to a {@link SparseVector} that was * created for that row. */ private final Map rowToVectorCache; /** * A weak mapping from the column index to a {@link SparseVector} that was * created for that column. */ private final Map colToVectorCache; /** * A counter to keep track of any modifications to this matrix. Any * modification causes this counter to increment. * * @see VersionedVector */ private final AtomicInteger version; /** * Create a matrix of the provided size using a temporary file. * * @throws IOError if the backing file for this matrix cannot be created */ public SparseOnDiskMatrix(int rows, int cols) { super(rows, cols); version = new AtomicInteger(0); rowToVectorCache = new WeakHashMap(); colToVectorCache = new WeakHashMap(); } /** * {@inheritDoc} */ @Override public SparseDoubleVector getColumnVector(int column) { // Check whether we have this column cached VersionedVector cachedCol = colToVectorCache.get(column); // If the cache was empty or if the matrix has been updated since this // vector was created, recreate the vector and cache it before returning if (cachedCol == null || cachedCol.version != version.get()) { cachedCol = new VersionedVector(rows(), version.get()); double[] col = getColumn(column); for (int i = 0; i < col.length; ++i) { double d = col[i]; if (d != 0) cachedCol.set(i, d); } colToVectorCache.put(column, cachedCol); } return cachedCol; } /** * {@inheritDoc} */ @Override public SparseDoubleVector getRowVector(int row) { // Check whether we have this row cached VersionedVector cachedRow = colToVectorCache.get(row); // If the cache was empty or if the matrix has been updated since this // vector was created, recreate the vector and cache it 
before returning if (cachedRow == null || cachedRow.version != version.get()) { cachedRow = new VersionedVector(columns(), version.get()); double[] r = getRow(row); for (int i = 0; i < r.length; ++i) { double d = r[i]; if (d != 0) cachedRow.set(i, d); } rowToVectorCache.put(row, cachedRow); } return cachedRow; } /** * {@inheritDoc} */ @Override public void set(int row, int col, double val) { super.set(row, col, val); version.incrementAndGet(); } /** * {@inheritDoc} */ @Override public void setColumn(int column, double[] values) { super.setColumn(column, values); version.incrementAndGet(); } /** * {@inheritDoc} */ @Override public void setColumn(int column, DoubleVector values) { super.setColumn(column, values); version.incrementAndGet(); } /** * {@inheritDoc} */ @Override public void setRow(int row, double[] vals) { super.setRow(row, vals); version.incrementAndGet(); } /** * {@inheritDoc} */ @Override public void setRow(int row, DoubleVector values) { super.setRow(row, values); version.incrementAndGet(); } /** * A {@code DoubleVector} instance that keeps track of a versioned state. * This class is intended to mark when the instance was created to enable * checked whether its data might be inconsistent with the matrix from which * it was generated. */ private static class VersionedVector extends SparseHashDoubleVector { private static final long serialVersionUID = 1L; private final int version; public VersionedVector(int length, int version) { super(length); this.version = version; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy