edu.ucla.sspace.matrix.ClutoSparseMatrixBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.matrix;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.util.logging.Logger;
/**
* A {@code MatrixBuilder} for building sparse cluto matrix input files.
*
* @author Keith Stevens
*/
public class ClutoSparseMatrixBuilder implements MatrixBuilder {
/**
* Logger for the {@code ClutoSparseMatrixBuilder} class
*/
private static final Logger LOGGER =
Logger.getLogger(ClutoSparseMatrixBuilder.class.getName());
/**
* The file to which the matrix will be written
*/
private final File matrixFile;
/**
* The writer used to add data to the transposed matrix file
*/
private final PrintWriter writer;
/**
* Whether the builder has finished adding data to the matrix array
*/
private boolean isFinished;
/**
* The number of the row that will next be assigned. Once this matrix
* has been finished, this value will reflect the total number of rows in
* the matrix.
*/
private int curRow;
/**
* The total number of columns in the matrix. This value is continuously
* updated as new columns are seen and is not valid until the matrix has
* been finished.
*/
private int numCols;
/**
* The total number of non-zero values in the matrix. This value is
* continuously updated as new columns are seen and is not valid until the
* matrix has been finished.
*/
private int nonZeroValues;
/**
* Creates a new {@code ClutoSparseMatrixBuilder} using a constructed temp
* file.
*/
public ClutoSparseMatrixBuilder() {
this(getTempMatrixFile());
}
/**
* Creates a new {@code ClutoSparseMatrixBuilder} using a the given
* file.
*/
public ClutoSparseMatrixBuilder(File backingFile) {
this.matrixFile = backingFile;
curRow = 0;
numCols = 0;
nonZeroValues = 0;
isFinished = false;
try {
writer = new PrintWriter(backingFile);
// Write a temporary header of whitespace using 100 chars.
char[] whiteSpace = new char[100];
Arrays.fill(whiteSpace, ' ');
writer.println(whiteSpace);
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Returns a temporary file that will be deleted on JVM exit.
*'
* @return a temporary file used to store a matrix
*/
private static File getTempMatrixFile() {
File tmp = null;
try {
tmp = File.createTempFile("cluto-sparse-matrix", ".dat");
tmp.deleteOnExit();
} catch (IOException ioe) {
throw new IOError(ioe);
}
return tmp;
}
/**
* {@inheritDoc}
*/
public synchronized int addColumn(double[] row) {
if (isFinished)
throw new IllegalStateException(
"Cannot add columns to a MatrixBuilder that is finished");
// Update the size of the matrix based on the size of the array
if (row.length > numCols)
numCols = row.length;
// Write the row to file
int nonZero = 0;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < row.length; ++i) {
if (row[i] != 0d) {
sb.append(i+1).append(" ").append(row[i]).append(" ");
nonZero++;
}
}
writer.println(sb.toString());
// Update the total number of non-zero values for the entire matrix
nonZeroValues += nonZero;
return ++curRow;
}
/**
* {@inheritDoc}
*/
public synchronized int addColumn(SparseArray extends Number> row) {
if (isFinished)
throw new IllegalStateException(
"Cannot add columns to a MatrixBuilder that is finished");
// Update the size of the matrix based on the size of the array
if (row.length() > numCols)
numCols = row.length();
// SparseArray instances can take on the maximum possible array size
// when the array length isn't specified. This ruins the matrix size
// specification since the matrix shouldn't actually be that big.
// However, because this is an implementation artifact, we can't check
// for it explicitly with an exception. Therefore, put in an assert to
// indicate what is likely going on if asserts are enabled for debugging
// they symptoms.
assert row.length() != Integer.MAX_VALUE : "adding a column whose " +
"length is Integer.MAX_VALUE (was likley left unspecified in the " +
" constructor).";
// Write the row to the file.
int[] nonZero = row.getElementIndices();
nonZeroValues += nonZero.length;
StringBuilder sb = new StringBuilder();
for (int i : nonZero) {
sb.append(i+1).append(" ");
sb.append(row.get(i).floatValue()).append(" ");
}
writer.println(sb.toString());
return ++curRow;
}
/**
* {@inheritDoc}
*/
public synchronized int addColumn(Vector row) {
if (isFinished)
throw new IllegalStateException(
"Cannot add columns to a MatrixBuilder that is finished");
// Update the size of the matrix based on the size of the array
if (row.length() > numCols)
numCols = row.length();
// Vector instances can take on the maximum possible array size when the
// vector length isn't specified. This ruins the matrix size
// specification since the matrix shouldn't actually be that big.
// However, because this is an implementation artifact, we can't check
// for it explicitly with an exception. Therefore, put in an assert to
// indicate what is likely going on if asserts are enabled for debugging
// they symptoms.
assert row.length() != Integer.MAX_VALUE : "adding a column whose " +
"length is Integer.MAX_VALUE (was likley left unspecified in the " +
" constructor).";
// Special case for sparse Vectors, for which we already know the
// non-zero indices for the column
if (row instanceof SparseVector) {
SparseVector s = (SparseVector)row;
int[] nonZero = s.getNonZeroIndices();
nonZeroValues += nonZero.length;
StringBuilder sb = new StringBuilder();
for (int i : nonZero) {
sb.append(i+1).append(" ");
sb.append(row.getValue(i).doubleValue()).append(" ");
}
writer.println(sb.toString());
}
// For dense Vectors, find which values are non-zero and write only
// those
else {
int nonZero = 0;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < row.length(); ++i) {
double d = row.getValue(i).doubleValue();
if (d != 0d) {
sb.append(i+1).append(" ").append(d).append(" ");
nonZero++;
}
}
writer.println(sb.toString());
// Update the matrix count
nonZeroValues += nonZero;
}
return ++curRow;
}
/**
* {@inheritDoc} Once this method has been called, any subsequent calls will
* have no effect and will not throw an exception.
*/
public synchronized void finish() {
if (!isFinished) {
isFinished = true;
try {
writer.close();
// Re-open as a random access file so we can overwrite the 3 int
// header that specifies the number of dimensions and values.
// Note that the location of the matrix data is dependent on
// whether the matrix is to be transposed.
RandomAccessFile matrixRaf =
new RandomAccessFile(matrixFile, "rw");
// Write the header in the first 100 characters. The header is
// the number rows, the number of columns, and the number of non
// zeros on a single line with spaces between them.
StringBuilder sb = new StringBuilder();
sb.append(curRow).append(" ");
sb.append(numCols).append(" ");
sb.append(nonZeroValues).append(" ");
matrixRaf.write(sb.toString().getBytes());
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
}
/**
* {@inheritDoc}
*/
public synchronized File getFile() {
if (!isFinished)
throw new IllegalStateException(
"Cannot access matrix file until finished has been called");
return matrixFile;
}
/**
* Returns {@link MatrixIO#Format.CLUTO_SPARSE.
* CLUTO_SPARSE}.
*/
public Format getMatrixFormat() {
return MatrixIO.Format.CLUTO_SPARSE;
}
/**
* {@inheritDoc}
*/
public MatrixFile getMatrixFile() {
return new MatrixFile(getFile(), getMatrixFormat());
}
/**
* {@inheritDoc}
*/
public synchronized boolean isFinished() {
return isFinished;
}
}