All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.common.StaticSemanticSpace Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.common;

import edu.ucla.sspace.common.SemanticSpaceIO.SSpaceFormat;

import edu.ucla.sspace.matrix.ArrayMatrix;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import edu.ucla.sspace.util.IntegerMap;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * An unmodifiable {@link SemanticSpace} whose data is loaded into memory from
 * an {@code .sspace} file.  Instance of this class perform no document
 * processing, and the {@code processDocument} and {@code processSpace} methods
 * throw an {@link UnsupportedOperationException}.  
 *
 * 

In general, users should call {@link * edu.ucla.sspace.common.SemanticSpaceUtils#loadSemanticSpace(File) * SemanticSpaceUtils.loadSemanticSpace(File)} rather than create an instance of * this class directly.

* * This class is thread-safe * * @see OnDiskSemanticSpace * @see SemanticSpaceUtils * @see SemanticSpaceUtils.SSpaceFormat */ public class StaticSemanticSpace implements SemanticSpace { private static final Logger LOGGER = Logger.getLogger(StaticSemanticSpace.class.getName()); /** * The {@code Matrix} which contains the data read from a finished {@link * SemanticSpace}. */ private Matrix wordSpace; /** * A mapping of terms to row indexes. Also serves as a quick means of * retrieving the words known by this {@link SemanticSpace}. */ private Map termToIndex ; /** * The name of this semantic space. */ private String spaceName; /** * Creates the {@link StaticSemanticSpace} from the file. * * @param filename the name of a file containing {@code SemanticSpace} data. * * @throws IOException if any I/O exception occurs when reading the semantic * space data from the file */ public StaticSemanticSpace(String filename) throws IOException { this(new File(filename)); } /** * Creates the {@link StaticSemanticSpace} from the provided file. * * @param file a file containing the data of a {@link * edu.ucla.sspace.common.SemanticSpace}. * * @throws IOException if any I/O exception occurs when reading the semantic * space data from the file */ public StaticSemanticSpace(File file) throws IOException { spaceName = file.getName(); SSpaceFormat format = SemanticSpaceIO.getFormat(file); if (format == null) throw new Error("Unrecognzied format in " + "file: " + file.getName()); DataInputStream dis = new DataInputStream( new BufferedInputStream(new FileInputStream(file))); // Read off the four byte header from the stream so the loading methods // do not see the data. This is necessary to support older formats that // did not include the header. dis.readInt(); loadFromFormat(dis, format); } /** * Creates the {@link StaticSemanticSpace} from the provided file in the * specified format. 
This method is only to be used in accessing {@code * SemanticSpace} files that do not include the format in their file * contents. * * @param file a file containing the data of a {@link * edu.ucla.sspace.common.SemanticSpace}. * @param format the format of the semantic space * * @throws IOException if any I/O exception occurs when reading the semantic * space data from the file */ @Deprecated public StaticSemanticSpace(File file, SSpaceFormat format) throws IOException { loadFromFormat(new BufferedInputStream( new FileInputStream(file)), format); spaceName = file.getName(); } /** * Loads the semantic space data from the specified stream, using the format * to determine how the data is layed out internally within the stream. * * @param is the input stream from which the semantic space will be read * @param format the internal data formatting of the semantic space */ private void loadFromFormat(InputStream is, SSpaceFormat format) throws IOException { // NOTE: Use a LinkedHashMap here because this will ensure that the // words are returned in the same row-order as the matrix. This // generates better disk I/O behavior for accessing the matrix since // each word is directly after the previous on disk. termToIndex = new LinkedHashMap(); Matrix m = null; long start = System.currentTimeMillis(); switch (format) { case TEXT: m = Matrices.synchronizedMatrix(loadText(is)); break; case BINARY: m = Matrices.synchronizedMatrix(loadBinary(is)); break; // REMINDER: we don't use synchronized here because the current // sparse matrix implementations are thread-safe. We really should // be aware of this for when the file-based sparse matrix gets // implemented. 
-jurgens 05/29/09 case SPARSE_TEXT: m = loadSparseText(is); break; case SPARSE_BINARY: m = loadSparseBinary(is); break; } if (LOGGER.isLoggable(Level.FINE)) { LOGGER.fine("loaded " + format + " .sspace file in " + (System.currentTimeMillis() - start) + "ms"); } wordSpace = m; } /** * Loads the {@link SemanticSpace} from the text formatted file, adding its * words to {@link #termToIndex} and returning the {@code Matrix} containing * the space's vectors. * * @param sspaceFile a file in {@link SSpaceFormat#TEXT text} format */ private Matrix loadText(InputStream fileStream) throws IOException { Matrix matrix = null; BufferedReader br = new BufferedReader(new InputStreamReader(fileStream)); String line = br.readLine(); if (line == null) throw new IOException("Empty .sspace file"); // Strip off the 4-byte (2 char) header String[] dimensions = line.split("\\s"); int rows = Integer.parseInt(dimensions[0]); int columns = Integer.parseInt(dimensions[1]); int index = 0; // reusable array for writing rows into the matrix double[] row = new double[columns]; matrix = new ArrayMatrix(rows, columns); while ((line = br.readLine()) != null) { if (index >= rows) throw new IOException("More rows than specified"); String[] termVectorPair = line.split("\\|"); String[] values = termVectorPair[1].split("\\s"); termToIndex.put(termVectorPair[0], index); if (values.length != columns) { throw new IOException( "improperly formated semantic space file"); } for (int c = 0; c < columns; ++c) { double d = Double.parseDouble(values[c]); row[c] = d; // matrix.set(index, c, d); } matrix.setRow(index, row); index++; } if (index != rows) throw new IOException(String.format( "Expected %d rows; saw %d", rows, index)); return matrix; } /** * Loads the {@link SemanticSpace} from the text formatted file, adding its * words to {@link #termToIndex} and returning the {@code Matrix} containing * the space's vectors. 
* * @param sspaceFile a file in {@link SSpaceFormat#TEXT text} format */ private Matrix loadSparseText(InputStream fileStream) throws IOException { Matrix matrix = null; BufferedReader br = new BufferedReader(new InputStreamReader(fileStream)); String line = br.readLine(); if (line == null) throw new IOError(new Throwable( "An empty file has been passed in")); String[] dimensions = line.split("\\s"); int rows = Integer.parseInt(dimensions[0]); int columns = Integer.parseInt(dimensions[1]); int row = 0; // create a sparse matrix matrix = Matrices.create(rows, columns, false); while ((line = br.readLine()) != null) { String[] termVectorPair = line.split("\\|"); String[] values = termVectorPair[1].split(","); termToIndex.put(termVectorPair[0], row); // even indicies are columns, odd are the values for (int i = 0; i < values.length; i +=2 ) { int col = Integer.parseInt(values[i]); double val = Double.parseDouble(values[i+1]); matrix.set(row, col, val); } row++; } return matrix; } /** * Loads the {@link SemanticSpace} from the binary formatted file, adding * its words to {@link #termToIndex} and returning the {@code Matrix} * containing the space's vectors. * * @param sspaceFile a file in {@link SSpaceFormat#BINARY binary} format */ private Matrix loadBinary(InputStream fileStream) throws IOException { DataInputStream dis = new DataInputStream(fileStream); int rows = dis.readInt(); int cols = dis.readInt(); // create a dense matrix Matrix m = new ArrayMatrix(rows, cols); double[] d = new double[cols]; for (int row = 0; row < rows; ++row) { String word = dis.readUTF(); termToIndex.put(word, row); for (int col = 0; col < cols; ++col) { d[col] = dis.readDouble(); } m.setRow(row, d); } return m; } /** * Loads the {@link SemanticSpace} from the binary formatted file, adding * its words to {@link #termToIndex} and returning the {@code Matrix} * containing the space's vectors. 
* * @param sspaceFile a file in {@link SSpaceFormat#BINARY binary} format */ private Matrix loadSparseBinary(InputStream fileStream) throws IOException { DataInputStream dis = new DataInputStream(fileStream); int rows = dis.readInt(); int cols = dis.readInt(); // Create the sparse matrix as individual rows since we can fully // allocate the indices values at once, rather than pay the log(n) // overhead of sorting them CompactSparseVector[] rowVectors = new CompactSparseVector[rows]; for (int row = 0; row < rows; ++row) { String word = dis.readUTF(); termToIndex.put(word, row); int nonZero = dis.readInt(); int[] indices = new int[nonZero]; double[] values = new double[nonZero]; for (int i = 0; i < nonZero; ++i) { int nz = dis.readInt(); double val = dis.readDouble(); indices[i] = nz; values[i] = val; } rowVectors[row] = new CompactSparseVector(indices, values, cols); } return Matrices.asSparseMatrix(Arrays.asList(rowVectors)); } /** * {@inheritDoc} */ public Set getWords() { return Collections.unmodifiableSet(termToIndex.keySet()); } /** * {@inheritDoc} */ public Vector getVector(String term) { Integer index = termToIndex.get(term); return (index == null) ? null : wordSpace.getRowVector(index.intValue()); } /** * {@inheritDoc} */ public String getSpaceName() { return spaceName; } /** * {@inheritDoc} */ public int getVectorLength() { return wordSpace.columns(); } /** * Not supported; throws an {@link UnsupportedOperationException} if called. * * @throws an {@link UnsupportedOperationException} if called */ public void processDocument(BufferedReader document) { throw new UnsupportedOperationException( "StaticSemanticSpace instances cannot be updated"); } /** * Not supported; throws an {@link UnsupportedOperationException} if called. * * @throws an {@link UnsupportedOperationException} if called */ public void processSpace(Properties props) { throw new UnsupportedOperationException( "StaticSemanticSpace instances cannot be updated"); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy