All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.matrix.SvdlibcSparseBinaryMatrixBuilder Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.matrix;

import edu.ucla.sspace.matrix.MatrixIO.Format;

import edu.ucla.sspace.util.SparseArray;

import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.IOError;
import java.io.IOException;
import java.io.RandomAccessFile;

import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * A {@code MatrixBuilder} implementation for creating matrix files in the SVDLIBC sparse
 * binary matrix format.
 *
 * 

The {@code addColumn} method is capable of throwing an {@code IOError} if any * {@code IOException} occurs while operating on the underlying matrix file. * *

This class is thread-safe. * * @author David Jurgens */ public class SvdlibcSparseBinaryMatrixBuilder implements MatrixBuilder { /** * Logger for the {@code SvdlibcSparseBinaryMatrixBuilder} class */ private static final Logger LOGGER = Logger.getLogger(SvdlibcSparseBinaryMatrixBuilder.class.getName()); /** * The writer used to add data to the transposed matrix file */ private final DataOutputStream matrixDos; /** * Whether the inputted matrix columns should be transposed as rows in the * final matrix data file. */ private final boolean transposeData; /** * Whether the builder has finished adding data to the matrix array */ private boolean isFinished; /** * The number of the column that will next be assigned. Once this matrix * has been finished, this value will reflect the total number of rows in * the matrix. */ private int curCol; /** * The file to which the matrix will be written */ private File matrixFile; /** * The total number of columns in the matrix. This value is continuously * updated as new columns are seen and is not valid until the matrix has been * finished. */ private int numRows; /** * The total number of non-zero values in the matrix. This value is * continuously updated as new columns are seen and is not valid until the * matrix has been finished. */ private int nonZeroValues; /** * The file to which the matrix will be written if the data is to be * transposed in its final form. The data is originally written in its * non-transposed form to this file and then is converted when {@link * #finish() finish} is called. */ private File transposedMatrixFile; /** * Creates a builder for a matrix in the {@link * MatrixIO.Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY} format to be * stored in a temporary file. */ public SvdlibcSparseBinaryMatrixBuilder() { this(getTempMatrixFile(), false); } /** * Creates a builder for a matrix in the {@link * MatrixIO.Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY} format to be * stored in a temporary file. 
* * @param transposeData {@code true} if the input matrix columns should be * tranposed in the backing matrix file */ public SvdlibcSparseBinaryMatrixBuilder(boolean transposeData) { this(getTempMatrixFile(), transposeData); } /** * Creates a builder for a matrix in the {@link * MatrixIO.Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY} format, * which will be stored in the specified file. * * @param backingFile the file to which the matrix should be written */ public SvdlibcSparseBinaryMatrixBuilder(File backingFile) { this(backingFile, false); } /** * Creates a builder for a matrix in the {@link * MatrixIO.Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY} format, * which will be stored in the specified file. * * @param backingFile the file to which the matrix should be written * @param transposeData {@code true} if the input matrix columns should be * tranposed in the backing matrix file */ public SvdlibcSparseBinaryMatrixBuilder(File backingFile, boolean transposeData) { this.matrixFile = backingFile; this.transposeData = transposeData; curCol = 0; numRows = 0; nonZeroValues = 0; isFinished = false; try { File matrixDataFile = (transposeData) ? (transposedMatrixFile = getTempMatrixFile()) : backingFile; // Interact with the matrix using a RandomAccessFile //matrixRaf = new RandomAccessFile(backingFile, "rw"); matrixDos = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(matrixDataFile))); // write the 12 byte header to advance the file pointer to where the // matrix data will start. The header will be back filled once the // matrix data has been finalized for (int i = 0; i < 3; ++i) matrixDos.writeInt(0); } catch (IOException ioe) { throw new IOError(ioe); } } /** * Returns a temporary file that will be deleted on JVM exit. 
*' * @return a temporary file used to store a matrix */ private static File getTempMatrixFile() { File tmp = null; try { tmp = File.createTempFile("svdlibc-sparse-binary-matrix", ".dat"); //tmp.deleteOnExit(); } catch (IOException ioe) { throw new IOError(ioe); } return tmp; } /** * {@inheritDoc} */ public synchronized int addColumn(double[] column) { if (isFinished) throw new IllegalStateException( "Cannot add columns to a MatrixBuilder that is finished"); // Update the size of the matrix based on the size of the array if (column.length > numRows) numRows = column.length; // Identify how many non-zero values are present in the column int nonZero = 0; for (int i = 0; i < column.length; ++i) { if (column[i] != 0d) nonZero++; } // Update the total number of non-zero values for the entire matrix nonZeroValues += nonZero; // Write the column to file try { matrixDos.writeInt(nonZero); for (int i = 0; i < column.length; ++i) { if (column[i] != 0d) { matrixDos.writeInt(i); // write the row index matrixDos.writeFloat((float)column[i]); } } } catch (IOException ioe) { throw new IOError(ioe); } return ++curCol; } /** * {@inheritDoc} */ public synchronized int addColumn(SparseArray column) { if (isFinished) throw new IllegalStateException( "Cannot add columns to a MatrixBuilder that is finished"); if (column.length() > numRows) numRows = column.length(); // SparseArray instances can take on the maximum possible array size // when the array length isn't specified. This ruins the matrix size // specification since the matrix shouldn't actually be that big. // However, because this is an implementation artifact, we can't check // for it explicitly with an exception. Therefore, put in an assert to // indicate what is likely going on if asserts are enabled for debugging // they symptoms. 
assert column.length() != Integer.MAX_VALUE : "adding a column whose " + "length is Integer.MAX_VALUE (was likley left unspecified in the " + " constructor)."; int[] nonZero = column.getElementIndices(); nonZeroValues += nonZero.length; try { matrixDos.writeInt(nonZero.length); for (int i : nonZero) { matrixDos.writeInt(i); // write the row index matrixDos.writeFloat(column.get(i).floatValue()); } } catch (IOException ioe) { throw new IOError(ioe); } return ++curCol; } /** * {@inheritDoc} */ public synchronized int addColumn(Vector col) { if (isFinished) throw new IllegalStateException( "Cannot add columns to a MatrixBuilder that is finished"); DoubleVector column = Vectors.asDouble(col); if (column.length() > numRows) numRows = column.length(); // Vector instances can take on the maximum possible array size when the // vector length isn't specified. This ruins the matrix size // specification since the matrix shouldn't actually be that big. // However, because this is an implementation artifact, we can't check // for it explicitly with an exception. Therefore, put in an assert to // indicate what is likely going on if asserts are enabled for debugging // they symptoms. 
assert column.length() != Integer.MAX_VALUE : "adding a column whose " + "length is Integer.MAX_VALUE (was likley left unspecified in the " + " constructor)."; // Special case for sparse Vectors, for which we already know the // non-zero indices for the column if (column instanceof SparseVector) { SparseVector s = (SparseVector)column; int[] nonZero = s.getNonZeroIndices(); nonZeroValues += nonZero.length; System.out.println(nonZero.length); try { matrixDos.writeInt(nonZero.length); for (int i : nonZero) { double val = column.get(i); matrixDos.writeInt(i); // write the row index matrixDos.writeFloat((float)val); } } catch (IOException ioe) { throw new IOError(ioe); } } // For dense Vectors, find which values are non-zero and write only // those else { int nonZero = 0; for (int i = 0; i < column.length(); ++i) { double d = column.get(i); if (d != 0d) nonZero++; } // Update the matrix count nonZeroValues += nonZero; try { matrixDos.writeInt(nonZero); // Write the number of non-zero values in the column for (int i = 0; i < column.length(); ++i) { double value = column.get(i); if (value != 0d) { matrixDos.writeInt(i); // write the row index matrixDos.writeFloat((float)value); } } } catch (IOException ioe) { throw new IOError(ioe); } } return ++curCol; } /** * {@inheritDoc} Once this method has been called, any subsequent calls will * have no effect and will not throw an exception. */ public synchronized void finish() { if (!isFinished) { isFinished = true; try { matrixDos.close(); // Re-open as a random access file so we can overwrite the 3 int // header that specifies the number of dimensions and values. // Note that the location of the matrix data is dependent on // whether the matrix is to be transposed. File dataFile = (transposeData) ? 
transposedMatrixFile : matrixFile; RandomAccessFile matrixRaf = new RandomAccessFile(dataFile, "rw"); // Back fill the dimensions of the matrix and the number of // non-zero values as the 3 int header in the file matrixRaf.writeInt(numRows); matrixRaf.writeInt(curCol); matrixRaf.writeInt(nonZeroValues); matrixRaf.close(); } catch (IOException ioe) { throw new IOError(ioe); } // If the user specified that the matrix should be tranposed, then // transposedMatrixFile will contain the matrix in its un-transposed // form. Issue a call to SVDLIBC to transposed the file contents // for us. if (transposeData) { boolean svdlibcFailed = false; try { String commandLine = "svd -r sb " + " -w sb -t -c " + transposedMatrixFile + " " + matrixFile; Process svdlibc = Runtime.getRuntime().exec(commandLine); LOGGER.fine("transposing svdlibc sparse matrix: " + commandLine); BufferedReader stdout = new BufferedReader( new InputStreamReader(svdlibc.getInputStream())); BufferedReader stderr = new BufferedReader( new InputStreamReader(svdlibc.getErrorStream())); StringBuilder output = new StringBuilder("SVDLIBC output:\n"); for (String line = null; (line = stderr.readLine()) != null; ) { output.append(line).append("\n"); } LOGGER.fine(output.toString()); int exitStatus = svdlibc.waitFor(); LOGGER.fine("svdlibc exit status: " + exitStatus); if (exitStatus != 0) { StringBuilder sb = new StringBuilder(); for (String line = null; (line = stderr.readLine()) != null; ) { sb.append(line).append("\n"); } // warning or error? LOGGER.warning("svdlibc exited with error status. " + "stderr:\n" + sb.toString()); svdlibcFailed = true; } } catch (Exception e) { svdlibcFailed = true; LOGGER.log(Level.WARNING, "an exception occurred when trying to transpose " + "with svdlibc; retrying with Java code", e); } // If SVDLIBC failed in any way, use MatrixIO to transpose the // data for us. 
if (svdlibcFailed) { LOGGER.fine("retrying failed svdlibc transpose " + "with MatrixIO transpose"); try { matrixFile = MatrixIO.convertFormat( transposedMatrixFile, Format.SVDLIBC_SPARSE_BINARY, Format.SVDLIBC_SPARSE_BINARY, true); } catch (IOException ioe) { throw new IOError(ioe); } } } } } /** * {@inheritDoc} */ public synchronized File getFile() { if (!isFinished) throw new IllegalStateException( "Cannot access matrix file until finished has been called"); return matrixFile; } /** * Returns {@link MatrixIO.Format#SVDLIBC_SPARSE_BINARY * SVDLIBC_SPARSE_BINARY}. */ public Format getMatrixFormat() { return MatrixIO.Format.SVDLIBC_SPARSE_BINARY; } /** * {@inheritDoc} */ public MatrixFile getMatrixFile() { return new MatrixFile(getFile(), getMatrixFormat()); } /** * {@inheritDoc} */ public synchronized boolean isFinished() { return isFinished; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy