All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.matrix.MatrixIO Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantic for words as high dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
 */

package edu.ucla.sspace.matrix;

import edu.ucla.sspace.matrix.Matrix.Type;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseVector;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;

import java.util.logging.Logger;
import java.util.logging.Level;


/**
 * A shared utility for printing matrices to files in a uniform manner and
 * converting between different formats.
 *
 * @see MatrixIO.Format
 * @see Matrix
 * @see Matrix.Type
 *
 * @author David Jurgens
 */
public class MatrixIO {

    private static final Logger MATRIX_IO_LOGGER = 
        Logger.getLogger(MatrixIO.class.getName());

    /**
     * An enum that specifies the formatting used for the matrix that is
     * serialized in a file.  Format specifications are as follows: 
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
format
description
{@link Format#DENSE_TEXT DENSE_TEXT}Each row is on its own line, with each column being delimited * by one or more white space characters. All rows should have the * same number of columns.
{@link Format#MATLAB_SPARSE MATLAB_SPARSE} The sparse format supported by Matlab. See here * for full details.
{@link Format#SVDLIBC_SPARSE_TEXT SVDLIBC_SPARSE_TEXT}The sparse human readable format supported by SVDLIBC. See here for * full details.
{@link Format#SVDLIBC_DENSE_TEXT SVDLIBC_DENSE_TEXT} The dense human readable format supported by SVDLIBC. See here for * full details.
{@link Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY}The sparse binary format supported by SVDLIBC. See here for * full details.
{@link Format#SVDLIBC_DENSE_BINARY SVDLIBC_DENSE_BINARY}The dense binary format supported by SVDLIBC. See here for * full details.
*
*/ public enum Format { /** * A human readable format where each row has its own line and all * column values are provided. */ DENSE_TEXT, /** * The sparse format supported by Matlab. See here * for full details. */ MATLAB_SPARSE, /** * The sparse human readable format supported by SVDLIBC. See here for * full details. */ SVDLIBC_SPARSE_TEXT, /** * The dense human readable format supported by SVDLIBC. See here for * full details. */ SVDLIBC_DENSE_TEXT, /** * The sparse binary format supported by SVDLIBC. See here for * full details. */ SVDLIBC_SPARSE_BINARY, /** * The dense binary format supported by SVDLIBC. See here for * full details. */ SVDLIBC_DENSE_BINARY, /** * The sparse text format supported by * CLUTO. See more details in the * CLUTO manual. */ CLUTO_DENSE, /** * The sparse text format supported by * CLUTO. See more details in the * CLUTO manual. */ CLUTO_SPARSE, } /** * Uninstantiable */ private MatrixIO() { } /** * Converts the format of the input {@code matrix}, returning a temporary * file containing the matrix's data in the desired format. * * @param matrix a file containing a matrix to convert * @param current the format of the {@code matrix} file * @param desired the format of the returned matrix file * * @return a matrix file with the same data in the desired format * * @throws IOException if any error occurs while reading the input matrix or * wring the output matrix */ public static File convertFormat(File matrix, Format current, Format desired) throws IOException { return convertFormat(matrix, current, desired, false); } /** * Converts the format of the input {@code matrix}, returning a temporary * file containing the matrix's data in the desired format. * * @param matrix a file containing a matrix to convert * @param current the format of the {@code matrix} file * @param desired the format of the returned matrix file * @param transpose {@code true} if data in the input matrix should be * transposed while converting formats to the output matrix. * * @return a matrix file with the same data in the desired format * * @throws IOException if any error occurs while reading the input matrix or * wring the output matrix */ public static File convertFormat(File matrix, Format current, Format desired, boolean transpose) throws IOException { if (!transpose && current.equals(desired)) { return matrix; } // Special cases for optimized formats switch (current) { case MATLAB_SPARSE: { if (desired.equals(Format.SVDLIBC_SPARSE_TEXT)) { File output = File.createTempFile( "matlab-to-SVDLIBC-sparse-text",".dat"); output.deleteOnExit(); matlabToSvdlibcSparseText(matrix, output, transpose); return output; } if (desired.equals(Format.SVDLIBC_SPARSE_BINARY)) { File output = File.createTempFile( "matlab-to-SVDLIBC-sparse-binary",".dat"); output.deleteOnExit(); matlabToSvdlibcSparseBinary(matrix, output, transpose); return output; } break; } case SVDLIBC_SPARSE_BINARY: { if (desired.equals(Format.MATLAB_SPARSE)) { File output = File.createTempFile( "SVDLIBC-sparse-binary-to-Matlab",".dat"); output.deleteOnExit(); svdlibcSparseBinaryToMatlab(matrix, output, transpose); return output; } } } // NOTE: the current default implementation does not try to keep the // matrix data on disk, which could present a memory bottleneck. File output = File.createTempFile("transposed",".dat"); // NOTE: the matrix type is not intelligently selected. Futher work is // needed to switch based on the format. Matrix transposed = readMatrix(matrix, current, Matrix.Type.SPARSE_IN_MEMORY, true); writeMatrix(transposed, output, desired); transposed = null; // for explicit GC return output; } /** * Returns an iterator over the matrix entries in the data file. For sparse * formats that specify only non-zero, no zero valued entries will be * returened. Conversely, for dense matrix formats, all of the entries, * including zero entries will be returned. * * @param matrixFile the file containing matrix data in the specified format * @param fileFormat the format of the matrix file * * @return an interator over the entries in the matrix file */ public static Iterator getMatrixFileIterator( File matrixFile, Format fileFormat) throws IOException { switch(fileFormat) { case DENSE_TEXT: return new DenseTextFileIterator(matrixFile); case SVDLIBC_SPARSE_BINARY: return new SvdlibcSparseBinaryFileIterator(matrixFile); case SVDLIBC_SPARSE_TEXT: return new SvdlibcSparseTextFileIterator(matrixFile); case SVDLIBC_DENSE_BINARY: return new SvdlibcDenseBinaryFileIterator(matrixFile); case CLUTO_SPARSE: return new ClutoSparseFileIterator(matrixFile); case CLUTO_DENSE: // Cluto dense and svdlibc's dense text formats are equivalent. case SVDLIBC_DENSE_TEXT: return new SvdlibcDenseTextFileIterator(matrixFile); case MATLAB_SPARSE: return new MatlabSparseFileIterator(matrixFile); } throw new Error("Iterating over matrices of " + fileFormat + " format is not "+ "currently supported. Email " + "[email protected] to request its" + "inclusion and it will be quickly added"); } /** * Returns a {@link FileTransformer} for an {@link Matrix} file encodeded in * the provided {@link Format}. * * @param format the format of a {@link Matrix} that will be transformed * * @return A {@link FileTransformer} for the given format */ public static FileTransformer fileTransformer(Format format) { switch (format) { case MATLAB_SPARSE: return new MatlabSparseFileTransformer(); case SVDLIBC_SPARSE_TEXT: return new SvdlibcSparseTextFileTransformer(); case SVDLIBC_DENSE_TEXT: return new SvdlibcDenseTextFileTransformer(); case SVDLIBC_SPARSE_BINARY: return new SvdlibcSparseBinaryFileTransformer(); case SVDLIBC_DENSE_BINARY: return new SvdlibcDenseBinaryFileTransformer(); default: throw new UnsupportedOperationException( "Transforming format " + format + " is currently " + "not implemented. Please email " + "[email protected] to have this " + "implemented."); } } /** * Reads in a matrix in the {@link Format#MATLAB_SPARSE} format and writes * it to the output file in {@link Format#SVDLIBC_SPARSE_TEXT} format. */ private static void matlabToSvdlibcSparseText(File input, File output, boolean transpose) throws IOException { MATRIX_IO_LOGGER.info("Converting from Matlab double values to " + "SVDLIBC float values; possible loss of " + "precision"); BufferedReader br = new BufferedReader(new FileReader(input)); Map colToNonZero = new HashMap(); // read through once to get matrix dimensions int rows = 0, cols = 0, nonZero = 0; for (String line = null; (line = br.readLine()) != null; ) { String[] rowColVal = line.split("\\s+"); int row, col; if (transpose) { row = Integer.parseInt(rowColVal[1]); col = Integer.parseInt(rowColVal[0]); } else { row = Integer.parseInt(rowColVal[0]); col = Integer.parseInt(rowColVal[1]); } if (row > rows) rows = row; if (col > cols) cols = col; ++nonZero; // NOTE: subtract by 1 here because Matlab arrays start at 1, while // SVDLIBC arrays start at 0. Integer colCount = colToNonZero.get(col-1); colToNonZero.put(col-1, (colCount == null) ? 1 : colCount + 1); } br.close(); // print out the header information PrintWriter pw = new PrintWriter(output); pw.println(rows + "\t" + cols + "\t" + nonZero); // Process the entire array in chunks in case the matlab array is too // big to fit into memory. // REMINDER: this should probably be chosen based on the number of rows // and their expected density int chunkSize = 1000; // This keeps track of the last columns printed. We need this outside // the loop to ensure that blank columns at the end of a chunk are still // printed by the next non-zero chunk int lastCol = -1; // lower bound inclusive, upper bound exclusive for (int lowerBound = 0, upperBound = chunkSize ; lowerBound < rows; lowerBound = upperBound, upperBound += chunkSize) { // Once the dimensions and number of non-zero values are known, // reprocess the matrix, storing the rows and values for each column // that are inside the bounds br = new BufferedReader(new FileReader(input)); // for each column, keep track of which in the next index into the // rows array that should be used to store the row index. Also keep // track of the value associated for that row int[] colIndices = new int[cols]; // columns are kept in sorted order SortedMap colToRowIndex = new TreeMap(); SortedMap colToRowValues = new TreeMap(); for (String line = null; (line = br.readLine()) != null; ) { String[] rowColVal = line.split("\\s+"); int row, col; if (transpose) { row = Integer.parseInt(rowColVal[1]) - 1; col = Integer.parseInt(rowColVal[0]) - 1; } else { row = Integer.parseInt(rowColVal[0]) - 1; col = Integer.parseInt(rowColVal[1]) - 1; } // NOTE: SVDLIBC uses floats instead of doubles, which can cause // a loss of precision float val = Double.valueOf(rowColVal[2]).floatValue(); // check that the current column is within the current chunk if (col < lowerBound || col >= upperBound) continue; // get the arrays used to store the non-zero row indices for // this column and the parallel array that stores the // row-index's value int[] rowIndices = colToRowIndex.get(col); float[] rowValues = colToRowValues.get(col); if (rowIndices == null) { rowIndices = new int[colToNonZero.get(col)]; rowValues = new float[colToNonZero.get(col)]; colToRowIndex.put(col,rowIndices); colToRowValues.put(col,rowValues); } // determine what is the current index in the non-zero row array // that can be used to store this row. int curColIndex = colIndices[col]; rowIndices[curColIndex] = row; rowValues[curColIndex] = val; colIndices[col] += 1; } br.close(); // loop through the stored column and row values, printing out for // each column, the number of non zero rows, followed by each row // index and the value. This is the SVDLIBC sparse text format. for (Map.Entry e : colToRowIndex.entrySet()) { int col = e.getKey().intValue(); int[] nonZeroRows = e.getValue(); float[] values = colToRowValues.get(col); if (col != lastCol) { // print any missing columns in case not all the columns // have data for (int i = lastCol + 1; i < col; ++i) pw.println(0); // print the new header int colCount = colToNonZero.get(col); lastCol = col; pw.println(colCount); } for (int i = 0; i < nonZeroRows.length; ++i) pw.println(nonZeroRows[i] + " " + values[i]); } } pw.flush(); pw.close(); } /** * Reads in a matrix in the {@link Format#MATLAB_SPARSE} format and writes * it to the output file in {@link Format#SVDLIBC_SPARSE_BINARY} format. */ private static void matlabToSvdlibcSparseBinary(File input, File output, boolean transpose) throws IOException { MATRIX_IO_LOGGER.info("Converting from Matlab double values to " + "SVDLIBC float values; possible loss of " + "precision"); BufferedReader br = new BufferedReader(new FileReader(input)); Map colToNonZero = new HashMap(); // read through once to get matrix dimensions int rows = 0, cols = 0, nonZero = 0; for (String line = null; (line = br.readLine()) != null; ) { String[] rowColVal = line.split("\\s+"); int row, col; if (transpose) { row = Integer.parseInt(rowColVal[1]); col = Integer.parseInt(rowColVal[0]); } else { row = Integer.parseInt(rowColVal[0]); col = Integer.parseInt(rowColVal[1]); } if (row > rows) rows = row; if (col > cols) cols = col; ++nonZero; // NOTE: subtract by 1 here because Matlab arrays start at 1, while // SVDLIBC arrays start at 0. Integer colCount = colToNonZero.get(col-1); colToNonZero.put(col-1, (colCount == null) ? 1 : colCount + 1); } br.close(); // Print out the header information DataOutputStream dos = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(output))); dos.writeInt(rows); dos.writeInt(cols); dos.writeInt(nonZero); // Process the entire array in chunks in case the matlab array is too // big to fit into memory. // REMINDER: this should probably be chosen based on the number of rows // and their expected density int chunkSize = 1000; // This keeps track of the last columns printed. We need this outside // the loop to ensure that blank columns at the end of a chunk are still // printed by the next non-zero chunk int lastCol = -1; // lower bound inclusive, upper bound exclusive for (int lowerBound = 0, upperBound = chunkSize ; lowerBound < rows; lowerBound = upperBound, upperBound += chunkSize) { // Once the dimensions and number of non-zero values are known, // reprocess the matrix, storing the rows and values for each column // that are inside the bounds br = new BufferedReader(new FileReader(input)); // for each column, keep track of which in the next index into the // rows array that should be used to store the row index. Also keep // track of the value associated for that row int[] colIndices = new int[cols]; // columns are kept in sorted order SortedMap colToRowIndex = new TreeMap(); SortedMap colToRowValues = new TreeMap(); for (String line = null; (line = br.readLine()) != null; ) { String[] rowColVal = line.split("\\s+"); int row, col; if (transpose) { row = Integer.parseInt(rowColVal[1]) - 1; col = Integer.parseInt(rowColVal[0]) - 1; } else { row = Integer.parseInt(rowColVal[0]) - 1; col = Integer.parseInt(rowColVal[1]) - 1; } // NOTE: SVDLIBC uses floats instead of doubles, which can cause // a loss of precision float val = Double.valueOf(rowColVal[2]).floatValue(); // check that the current column is within the current chunk if (col < lowerBound || col >= upperBound) continue; // get the arrays used to store the non-zero row indices for // this column and the parallel array that stores the // row-index's value int[] rowIndices = colToRowIndex.get(col); float[] rowValues = colToRowValues.get(col); if (rowIndices == null) { rowIndices = new int[colToNonZero.get(col)]; rowValues = new float[colToNonZero.get(col)]; colToRowIndex.put(col,rowIndices); colToRowValues.put(col,rowValues); } // determine what is the current index in the non-zero row array // that can be used to store this row. int curColIndex = colIndices[col]; rowIndices[curColIndex] = row; rowValues[curColIndex] = val; colIndices[col] += 1; } br.close(); // loop through the stored column and row values, printing out for // each column, the number of non zero rows, followed by each row // index and the value. This is the SVDLIBC sparse text format. for (Map.Entry e : colToRowIndex.entrySet()) { int col = e.getKey().intValue(); int[] nonZeroRows = e.getValue(); float[] values = colToRowValues.get(col); if (col != lastCol) { // print any missing columns in case not all the columns // have data for (int i = lastCol + 1; i < col; ++i) dos.writeInt(0); // print the new header int colCount = colToNonZero.get(col); lastCol = col; dos.writeInt(colCount); } for (int i = 0; i < nonZeroRows.length; ++i) { dos.writeInt(nonZeroRows[i]); dos.writeFloat(values[i]); } } } dos.close(); } /** * Reads in a matrix in the {@link Format#SVDLIBC_SPARSE_BINARY} format and * writes it to the output file in the {@link Format#MATLAB_SPARSE} format. */ private static void svdlibcSparseBinaryToMatlab(File input, File output, boolean transpose) throws IOException { DataInputStream dis = new DataInputStream( new BufferedInputStream(new FileInputStream(input))); PrintWriter pw = new PrintWriter(output); int rows = dis.readInt(); int cols = dis.readInt(); int entries = dis.readInt(); // SVDLIBC sparse binary is organized as column data. int entriesSeen = 0; int col = 0; for (; entriesSeen < entries; ++col) { int nonZero = dis.readInt(); for (int i = 0; i < nonZero; ++i, ++entriesSeen) { int row = dis.readInt(); float val = dis.readFloat(); if (transpose) pw.println((1 + col) + " " + (1 + row) + " " + val); else pw.println((1 + row) + " " + (1 + col) + " " + val); } } dis.close(); pw.close(); } /** * Reads in the content of the file as a two dimensional Java array matrix. * This method has been deprecated in favor of using purely {@link Matrix} * objects for all operations. If a Java array is needed, the {@link * Matrix#toArray()} method may be used to generate one. * * @return a two-dimensional array of the matrix contained in provided file */ @Deprecated public static double[][] readMatrixArray(File input, Format format) throws IOException { // Use the same code path for reading the Matrix object and then dump to // an array. This comes at a potential 2X space usage, but greatly // reduces the possibilities for bugs. return readMatrix(input, format).toDenseArray(); } /** * Returns the contents of a matrix file as a {@link SparseMatrix} object, * using the provided format as a hint for what kind to create. The type * of {@code SparseMatrix} object created will assuming that the entire matrix * can fit in memory based on the format of the file speicfied. Note that * the returned {@link SparseMatrix} instance is not backed by the data on file; * changes to the {@code SparseMatrix} will not be reflected in the * original file's data. * * @param matrix a file containing matrix data * @param format the format of the file * * @return the {@code SparseMatrix} instance that contains the data in the * provided file */ public static SparseMatrix readSparseMatrix(String matrix, Format format) throws IOException { return readSparseMatrix(new File(matrix), format); } /** * Returns the contents of a matrix file as a {@link SparseMatrix} object, * using the provided format as a hint for what kind to create. The type * of {@code SparseMatrix} object created will assuming that the entire matrix * can fit in memory based on the format of the file speicfied. Note that * the returned {@link SparseMatrix} instance is not backed by the data on file; * changes to the {@code SparseMatrix} will not be reflected in the * original file's data. * * @param matrix a file containing matrix data * @param format the format of the file * * @return the {@code SparseMatrix} instance that contains the data in the * provided file */ public static SparseMatrix readSparseMatrix(File matrix, Format format) throws IOException { Type type = Type.SPARSE_IN_MEMORY; switch (format) { // Assume all sparse formats will fit in memory. case MATLAB_SPARSE: return readMatlabSparse(matrix, type, false); case CLUTO_SPARSE: return readClutoSparse(matrix, type, false); case SVDLIBC_SPARSE_TEXT: return readSparseSVDLIBCtext(matrix, type, false); case SVDLIBC_SPARSE_BINARY: return readSparseSVDLIBCbinary(matrix, type, false); default: throw new Error( format + " is not a a supproted sparse format"); } } /** * Converts the contents of a matrix file as a {@link Matrix} object, using * the provided format as a hint for what kind to create. The type of * {@code Matrix} object created will assuming that the entire matrix can * fit in memory based on the format of the file speicfied Note that the * returned {@link Matrix} instance is not backed by the data on file; * changes to the {@code Matrix} will not be reflected in the * original file's data. * * @param matrix a file contain matrix data * @param format the format of the file * * @return the {@code Matrix} instance that contains the data in the * provided file */ public static Matrix readMatrix(String matrix, Format format) throws IOException { return readMatrix(new File(matrix), format); } /** * Converts the contents of a matrix file as a {@link Matrix} object, using * the provided format as a hint for what kind to create. The type of * {@code Matrix} object created will assuming that the entire matrix can * fit in memory based on the format of the file speicfied Note that the * returned {@link Matrix} instance is not backed by the data on file; * changes to the {@code Matrix} will not be reflected in the * original file's data. * * @param matrix a file contain matrix data * @param format the format of the file * * @return the {@code Matrix} instance that contains the data in the * provided file */ public static Matrix readMatrix(File matrix, Format format) throws IOException { switch (format) { // Assume all sparse formats will fit in memory. case SVDLIBC_SPARSE_TEXT: case SVDLIBC_SPARSE_BINARY: case MATLAB_SPARSE: case CLUTO_SPARSE: return readMatrix(matrix, format, Type.SPARSE_IN_MEMORY, false); // Assume all dense formats will fit in memory. case SVDLIBC_DENSE_TEXT: case SVDLIBC_DENSE_BINARY: case DENSE_TEXT: case CLUTO_DENSE: return readMatrix(matrix, format, Type.DENSE_IN_MEMORY, false); default: throw new Error( "Reading matrices of " + format + " format is not "+ "currently supported. Email " + "[email protected] to request " + "its inclusion and it will be quickly added"); } } /** * Converts the contents of a matrix file as a {@link Matrix} object, using * the provided type description as a hint for what kind to create. The * type of {@code Matrix} object created will be based on an estimate of * whether the data will fit into the available memory. Note that the * returned {@link Matrix} instance is not backed by the data on file; * changes to the {@code Matrix} will not be reflected in the * original file's data. * * @param matrix a file contain matrix data * @param format the format of the file * @param matrixType the expected type and behavior of the matrix in * relation to memory. This value will be used as a hint for what * kind of {@code Matrix} instance to create * * @return the {@code Matrix} instance that contains the data in the * provided file */ public static Matrix readMatrix(File matrix, Format format, Type matrixType) throws IOException { return readMatrix(matrix, format, matrixType, false); } /** * Converts the contents of a matrix file as a {@link Matrix} object, using * the provided type description as a hint for what kind to create. The * type of {@code Matrix} object created will be based on an estimate of * whether the data will fit into the available memory. Note that the * returned {@link Matrix} instance is not backed by the data on file; * changes to the {@code Matrix} will not be reflected in the * original file's data. * * @param matrix a file contain matrix data * @param format the format of the file * @param matrixType the expected type and behavior of the matrix in * relation to memory. This value will be used as a hint for what * kind of {@code Matrix} instance to create * @param transposeOnRead {@code true} if the matrix should be transposed as * its data is read in. For certain formats, this is more efficient * than reading the data in and then transposing it directly. * * @return the {@code Matrix} instance that contains the data in the * provided file, optionally transposed from its original format * * @throws IOException if any error occurs while reading in the matrix data */ public static Matrix readMatrix(File matrix, Format format, Type matrixType, boolean transposeOnRead) throws IOException { try { switch(format) { case DENSE_TEXT: return readDenseTextMatrix(matrix, matrixType, transposeOnRead); case MATLAB_SPARSE: return readMatlabSparse(matrix, matrixType, transposeOnRead); case CLUTO_SPARSE: return readClutoSparse(matrix, matrixType, transposeOnRead); case SVDLIBC_SPARSE_TEXT: return readSparseSVDLIBCtext(matrix, matrixType, transposeOnRead); // These two formats are equivalent case CLUTO_DENSE: case SVDLIBC_DENSE_TEXT: return readDenseSVDLIBCtext(matrix, matrixType, transposeOnRead); case SVDLIBC_SPARSE_BINARY: return readSparseSVDLIBCbinary(matrix, matrixType, transposeOnRead); case SVDLIBC_DENSE_BINARY: return readDenseSVDLIBCbinary(matrix, matrixType, transposeOnRead); } } catch (EOFException eofe) { // Rethrow with more specific type information throw new MatrixIOException("Matrix file " + matrix + " appeared " + "truncated, or was missing expected values at the end of its " + "contents."); } throw new Error("Reading matrices of " + format + " format is not "+ "currently supported. Email " + "[email protected] to request its "+ "inclusion and it will be quickly added"); } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#CLUTO_SPARSE} in provided file. * * @param matrix The matrix file to read from * @param matrixType The expected format of {@code matrix} * @param transposeOnRead If true, the returned matrix will be a transposed * form of the data in {@code matrix}. * * @return A {@link SparseMatrix} containing the data in {@code matrix}. */ private static SparseMatrix readClutoSparse(File matrix, Type matrixType, boolean transposeOnRead) throws IOException { BufferedReader br = new BufferedReader(new FileReader(matrix)); String[] rowCol = br.readLine().split("\\s+"); // unknown number of rows, so do a quick scan to determine it int rows = Integer.parseInt(rowCol[0]); int cols = Integer.parseInt(rowCol[1]); // Create the matrix. SparseMatrix m = (transposeOnRead) ? new YaleSparseMatrix(cols, rows) : new YaleSparseMatrix(rows, cols); // Cluto Sparse stores each row's values on a single line in the form of // "col value" tuples with everything separated by space. int row = 0; for (String line = null; (line = br.readLine()) != null; ++row) { String[] colValuePairs = line.split("\\s+"); for (int i = 0; i < colValuePairs.length; i+=2) { int col = Integer.parseInt(colValuePairs[i]) - 1; double value = Double.parseDouble(colValuePairs[i+1]); // Store the value in the matrix. if (transposeOnRead) m.set(col, row, value); else m.set(row, col, value); } } return m; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#DENSE_TEXT} in provided file. * * @param matrix * @param matrixType * * @return as */ private static Matrix readDenseTextMatrix(File matrix, Type matrixType, boolean transposeOnRead) throws IOException { BufferedReader br = new BufferedReader(new FileReader(matrix)); // unknown number of rows, so do a quick scan to determine it int rows = 0; int cols = -1; for (String line = null; (line = br.readLine()) != null; rows++) { // Create a scanner to parse out the values from the current line of // the file. Scanner s = new Scanner(line); int vals = 0; // Count how many columns the row has for (; s.hasNextDouble(); vals++, s.nextDouble()) ; // Base case if the number of columns has not been set if (cols == -1) cols = vals; // Otherwise, ensure that the number of values in the current row // matches the number of values seen in the first else { if (cols != vals) { throw new MatrixIOException("line " + (rows + 1) + " contains an inconsistent number of columns. " + cols + " columns expected versus " + vals + " found."); } } } br.close(); if (MATRIX_IO_LOGGER.isLoggable(Level.FINE)) { MATRIX_IO_LOGGER.fine("reading in text matrix with " + rows + " rows and " + cols + " cols"); } // Once the matrix has had its dimensions determined, re-open the file // to load the data into a Matrix instance. Use a Scanner to parse the // text for us. Scanner scanner = new Scanner(matrix); Matrix m = (transposeOnRead) ? Matrices.create(cols, rows, matrixType) : Matrices.create(rows, cols, matrixType); for (int row = 0; row < rows; ++row) { for (int col = 0; col < cols; ++col) { double d = scanner.nextDouble(); if (transposeOnRead) m.set(col, row, d); else m.set(row, col, d); } } scanner.close(); return m; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#SVDLIBC_DENSE_TEXT} in provided file. * * @param matrix * @param matrixType * * @return a matrix whose data was specified by the provided file */ private static Matrix readDenseSVDLIBCtext(File matrix, Type matrixType, boolean transposeOnRead) throws IOException { BufferedReader br = new BufferedReader(new FileReader(matrix)); // Note that according to the formatting, spaces and new lines are // equivalent. Therefore, someone could just print all of the matrix // values on a single line. int rows = -1; int cols = -1; int valuesSeen = 0; // REMINDER: possibly use on disk if the matrix is too big Matrix m = null; for (String line = null; (line = br.readLine()) != null; ) { String[] vals = line.split("\\s+"); for (int i = 0; i < vals.length; ++i) { // rows is specified first if (rows == -1) { rows = Integer.parseInt(vals[i]); } // cols will be second else if (cols == -1) { cols = Integer.parseInt(vals[i]); // once both rows and cols have been assigned, create the // matrix m = (transposeOnRead) ? Matrices.create(cols, rows, matrixType) : Matrices.create(rows, cols, matrixType); MATRIX_IO_LOGGER.log(Level.FINE, "created matrix of size {0} x {1}", new Object[] {Integer.valueOf(rows), Integer.valueOf(cols)}); } else { int row = valuesSeen / cols; int col = valuesSeen % cols; double val = Double.parseDouble(vals[i]); if (transposeOnRead) m.set(col, row, val); else m.set(row, col, val); // increment the number of values seen to properly set the // next index of the matrix ++valuesSeen; } } } br.close(); return m; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#SVDLIBC_DENSE_BINARY} in provided file. * * @param matrix * @param matrixType * * @return a matrix whose data was specified by the provided file */ private static Matrix readDenseSVDLIBCbinary(File matrix, Type matrixType, boolean transposeOnRead) throws IOException { DataInputStream dis = new DataInputStream( new BufferedInputStream(new FileInputStream(matrix))); int rows = dis.readInt(); int cols = dis.readInt(); Matrix m = (transposeOnRead) ? Matrices.create(cols, rows, matrixType) : Matrices.create(rows, cols, matrixType); if (transposeOnRead) { for (int row = 0; row < rows; ++row) { for (int col = 0; col < cols; ++col) { m.set(col, row, dis.readFloat()); } } } else { for (int row = 0; row < rows; ++row) { for (int col = 0; col < cols; ++col) { m.set(row, col, dis.readFloat()); } } } dis.close(); return m; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#SVDLIBC_SPARSE_BINARY} in provided file. * * @param matrix * @param matrixType * * @return a matrix whose data was specified by the provided file */ private static SparseMatrix readSparseSVDLIBCbinary(File matrix, Type matrixType, boolean transposeOnRead) throws IOException { DataInputStream dis = new DataInputStream( new BufferedInputStream(new FileInputStream(matrix))); int rows = dis.readInt(); int cols = dis.readInt(); int nz = dis.readInt(); MATRIX_IO_LOGGER.fine( String.format("Creating %s matrix %d rows, %d cols, %d nz%n", ((transposeOnRead) ? "transposed" : ""), rows, cols, nz)); SparseMatrix m = null; // Special case for reading transposed data. This avoids the log(n) // overhead from resorting the row data for the matrix, which can be // significant in large matrices. if (transposeOnRead) { SparseDoubleVector[] rowArr = new SparseDoubleVector[cols]; int entriesSeen = 0; int col = 0; int curRow = 0; for (; entriesSeen < nz; ++col) { int nzInCol = dis.readInt(); int[] indices = new int[nzInCol]; double[] vals = new double[nzInCol]; for (int i = 0; i < nzInCol; ++i, ++entriesSeen) { indices[i] = dis.readInt(); vals[i] = dis.readFloat(); } SparseDoubleVector rowVec = new CompactSparseVector(indices, vals, rows); rowArr[curRow] = rowVec; ++curRow; } m = Matrices.asSparseMatrix(Arrays.asList(rowArr)); } else { m = new SparseHashMatrix(rows, cols); int entriesSeen = 0; int col = 0; for (; entriesSeen < nz; ++col) { int nzInCol = dis.readInt(); for (int i = 0; i < nzInCol; ++i, ++entriesSeen) { m.set(dis.readInt(), col, dis.readFloat()); } } } dis.close(); MATRIX_IO_LOGGER.fine("Completed loading matrix"); return m; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#SVDLIBC_SPARSE_TEXT} in provided file. * * @param matrix * @param matrixType * @param transposeOnRead * * @return a matrix whose data was specified by the provided file */ private static SparseMatrix readMatlabSparse( File matrixFile, Type matrixType, boolean transposeOnRead) throws IOException { SparseMatrix matrix = new GrowingSparseMatrix(); BufferedReader br = new BufferedReader(new FileReader(matrixFile)); for (String line = null; (line = br.readLine()) != null; ) { String[] rowColVal = line.split("\\s+"); int row = Integer.parseInt(rowColVal[0]) - 1; int col = Integer.parseInt(rowColVal[1]) - 1; double value = Double.parseDouble(rowColVal[2]); if (transposeOnRead) matrix.set(col, row, value); else matrix.set(row, col, value); } br.close(); return matrix; } /** * Creates a {@code Matrix} from the data encoded as {@link * Format#SVDLIBC_SPARSE_TEXT} in provided file. * * @param matrix * @param matrixType * @param transposeOnRead * * @return a matrix whose data was specified by the provided file */ private static SparseMatrix readSparseSVDLIBCtext( File matrix, Type matrixType, boolean transposeOnRead) throws IOException { BufferedReader br = new BufferedReader(new FileReader(matrix)); String line = br.readLine(); if (line == null) throw new IOException("Empty input Matrix"); String[] numRowsColsNonZeros = line.split("\\s"); int rows = Integer.parseInt(numRowsColsNonZeros[0]); int cols = Integer.parseInt(numRowsColsNonZeros[1]); SparseMatrix m = (transposeOnRead) ? new YaleSparseMatrix(cols, rows) : new YaleSparseMatrix(rows, cols); for (int j = 0; j < cols && (line = br.readLine()) != null; ++j) { int numNonZeros = Integer.parseInt(line); for (int i = 0; i < numNonZeros && (line = br.readLine()) != null; ++i) { String[] rowValue = line.split("\\s"); int row = Integer.parseInt(rowValue[0]); double value = Double.parseDouble(rowValue[1]); if (value != 0d) { if (transposeOnRead) m.set(j, row, value); else m.set(row, j, value); } } } br.close(); return m; } /** * Writes the matrix to the specified output file in the provided format * * @param matrix the matrix to be written * @param output the file in which the matrix should be written * @param format the data format in which the matrix's data should be * written * * @throws IllegalArgumentException if the input matrix is 0-dimensional * @throws IOException if an error occurs while writing to the output file */ public static void writeMatrix(Matrix matrix, String output, Format format) throws IOException { writeMatrix(matrix, new File(output), format); } /** * Writes the matrix to the specified output file in the provided format * * @param matrix the matrix to be written * @param output the file in which the matrix should be written * @param format the data format in which the matrix's data should be * written * * @throws IllegalArgumentException if the input matrix is 0-dimensional * @throws IOException if an error occurs while writing to the output file */ public static void writeMatrix(Matrix matrix, File output, Format format) throws IOException { if (matrix.rows() == 0 || matrix.columns() == 0) throw new IllegalArgumentException( "cannot write 0-dimensional matrix"); switch (format) { case DENSE_TEXT: { PrintWriter pw = new PrintWriter(output); for (int i = 0; i < matrix.rows(); ++i) { StringBuffer sb = new StringBuffer(matrix.columns() * 5); for (int j = 0; j < matrix.columns(); ++j) { sb.append(matrix.get(i,j)).append(" "); } pw.println(sb.toString()); } pw.close(); break; } // These two formats are equivalent case CLUTO_DENSE: case SVDLIBC_DENSE_TEXT: { PrintWriter pw = new PrintWriter(output); pw.println(matrix.rows() + " " + matrix.columns()); for (int i = 0; i < matrix.rows(); ++i) { StringBuffer sb = new StringBuffer(32); for (int j = 0; j < matrix.columns(); ++j) { sb.append((float)(matrix.get(i,j))).append(" "); } pw.println(sb.toString()); } pw.close(); break; } case SVDLIBC_DENSE_BINARY: { DataOutputStream outStream = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(output))); outStream.writeInt(matrix.rows()); outStream.writeInt(matrix.columns()); for (int i = 0; i < matrix.rows(); ++i) { for (int j = 0; j < matrix.columns(); ++j) { outStream.writeFloat((float)matrix.get(i,j)); } } outStream.close(); break; } case CLUTO_SPARSE: { PrintWriter pw = new PrintWriter(output); // Count the number of non-zero values in the matrix int nonZero = 0; int rows = matrix.rows(); for (int i = 0; i < rows; ++i) { DoubleVector v = matrix.getRowVector(i); if (v instanceof SparseVector) nonZero += ((SparseVector)v).getNonZeroIndices().length; else { for (int col = 0; col < v.length(); ++col) { if (v.get(col) != 0) nonZero++; } } } // Write the header: rows cols non-zero pw.println(matrix.rows() + " " + matrix.columns() + " " + nonZero); for (int row = 0; row < rows; ++row) { StringBuilder sb = new StringBuilder(nonZero / rows); // NOTE: the columns in CLUTO start at 1, not 0, so increment // one to each of the columns DoubleVector v = matrix.getRowVector(row); if (v instanceof SparseVector) { int[] nzIndices = ((SparseVector)v).getNonZeroIndices(); for (int nz : nzIndices) { sb.append(nz + 1).append(" "). append(v.get(nz)).append(" "); } } else { for (int col = 0; col < v.length(); ++col) { double d = v.get(col); if (d != 0) sb.append(col+1).append(" ").append(d).append(" "); } } pw.println(sb.toString()); } pw.close(); break; } case SVDLIBC_SPARSE_TEXT: { PrintWriter pw = new PrintWriter(output); // count the number of non-zero values for each column as well as // the total int nonZero = 0; int[] nonZeroPerCol = new int[matrix.columns()]; for (int i = 0; i < matrix.rows(); ++i) { for (int j = 0; j < matrix.columns(); ++j) { if (matrix.get(i, j) != 0) { nonZero++; nonZeroPerCol[j]++; } } } // loop through the matrix a second time, printing out the number of // non-zero values for each column, followed by those values and // their associated row pw.println(matrix.rows() + " " + matrix.columns() + " " + nonZero); for (int col = 0; col < matrix.columns(); ++col) { pw.println(nonZeroPerCol[col]); if (nonZeroPerCol[col] > 0) { for (int row = 0; row < matrix.rows(); ++row) { double val = matrix.get(row, col); if (val != 0d) { // NOTE: need to convert to float since this is what // SVDLIBC uses pw.println(row + " " + Double.valueOf(val).floatValue()); } } } } pw.close(); break; } case SVDLIBC_SPARSE_BINARY: { DataOutputStream outStream = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(output))); // count the number of non-zero values for each column as well as // the total int nonZero = 0; int[] nonZeroPerCol = new int[matrix.columns()]; for (int i = 0; i < matrix.rows(); ++i) { for (int j = 0; j < matrix.columns(); ++j) { if (matrix.get(i, j) != 0) { nonZero++; nonZeroPerCol[j]++; } } } // Write the 12 byte header data outStream.writeInt(matrix.rows()); outStream.writeInt(matrix.columns()); outStream.writeInt(nonZero); // loop through the matrix a second time, printing out the number of // non-zero values for each column, followed by those values and // their associated row for (int col = 0; col < matrix.columns(); ++col) { outStream.writeInt(nonZeroPerCol[col]); if (nonZeroPerCol[col] > 0) { for (int row = 0; row < matrix.rows(); ++row) { double val = matrix.get(row, col); if (val != 0) { // NOTE: need to convert to float since this is what // SVDLIBC uses outStream.writeInt(row); outStream.writeFloat((float)val); } } } } outStream.close(); break; } case MATLAB_SPARSE: { PrintWriter pw = new PrintWriter(output); // NOTE: Matlab's sparse matrix offers no way of specifying the // original matrix's dimensions. This is only problematic if the // matrix contains trailing rows or columns that are all // zeros. Therefore to ensure that the matrix has the correct size, // we track the maximum values written and write a 0-value to extend // the matrix to its correct size int maxRowSeen = 0; int maxColSeen = 0; for (int i = 0; i < matrix.rows(); ++i) { for (int j = 0; j < matrix.columns(); ++j) { if (matrix.get(i,j) == 0) continue; if (j > maxColSeen) maxColSeen = j; if (i > maxRowSeen) maxRowSeen = i; StringBuffer sb = new StringBuffer(32); // Add 1 to index values since Matlab arrays are 1-based, // not 0-based sb.append(i+1).append(" ").append(j+1); sb.append(" ").append(matrix.get(i,j)); pw.println(sb.toString()); } } // Check whether we need to extend the matrix if (maxRowSeen + 1 != matrix.rows() || maxColSeen + 1 != matrix.columns()) { pw.println(matrix.rows() + " " + matrix.columns() + " 0"); } pw.close(); break; } default: throw new UnsupportedOperationException( "writing to " + format + " is currently unsupported"); } } @Deprecated public static void writeMatrixArray(double[][] matrix, File output) throws IOException { if (matrix.length == 0 || matrix[0].length == 0) throw new IllegalArgumentException("invalid matrix dimensions"); PrintWriter pw = new PrintWriter(output); int rows = matrix.length; int cols = matrix[0].length; for (int row = 0; row < rows; ++row) { for (int col = 0; col < cols; ++col) pw.print(matrix[row][col] + ((col + 1 == cols) ? "\n" : " ")); } pw.close(); return; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy