All Downloads are FREE. The search and download functionality uses the official Maven repository.

edu.ucla.sspace.common.CachingOnDiskSemanticSpace Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.common;

import edu.ucla.sspace.common.SemanticSpaceIO.SSpaceFormat;

import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.WeakHashMap;

import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * A {@link SemanticSpace} where most vector data is kept on disk, but
 * frequently accessed data is kept in memory.  This class is designed for large
 * semantic spaces whose data will not fit in memory and whose usage pattern
 * will frequently access a vector multiple times.
 *
 * <p>
 *
 * The performance of this class is dependent on the format of the backing
 * vector data; {@code .sspace} files in {@link SSpaceFormat#BINARY binary} or
 * {@link SSpaceFormat#SPARSE_BINARY sparse binary} format will likely be faster
 * for accessing the data due to it being in its native format.
 *
 * <p>
 *
 * The {@code getWords} method will return words in the order they are stored on
 * disk.  Accessing the words in this order will yield a significant
 * performance improvement over random access.  Furthermore, random access to
 * {@link SSpaceFormat#TEXT text} and {@link SSpaceFormat#SPARSE_TEXT sparse
 * text} formatted matrices will have particularly poor performance for large
 * semantic spaces, as the internal cursor to the data will have to restart from
 * the beginning of the file.
 *
 * <p>
 *
 * Access to the in-memory cache is serialized by the {@code synchronized}
 * {@link #getVector(String) getVector} method.  All other methods delegate
 * directly to the backing space, so the overall thread-safety of an instance
 * also depends on that implementation.
 *
 * @see SemanticSpaceIO
 * @see OnDiskSemanticSpace
 */
public class CachingOnDiskSemanticSpace implements SemanticSpace {

    private static final Logger LOGGER =
        Logger.getLogger(CachingOnDiskSemanticSpace.class.getName());

    /**
     * A mapping for words that have had their vector recently loaded into
     * memory.  The map holds its keys weakly, so an entry becomes eligible for
     * removal once the word used as its key is no longer strongly reachable
     * from anywhere else.
     */
    private final Map<String,Vector> wordToVector;

    /**
     * The backing semantic space that reads in the data from disk.
     */
    private final SemanticSpace backingSpace;

    /**
     * Creates a new instance of {@code CachingOnDiskSemanticSpace} from the
     * data in the file with the specified name.
     *
     * @param filename the name of a file containing a semantic space
     *
     * @throws IOException if any I/O exception occurs when reading the semantic
     *         space data from the file
     */
    public CachingOnDiskSemanticSpace(String filename) throws IOException {
        this(new File(filename));
    }

    /**
     * Creates a new instance of {@code CachingOnDiskSemanticSpace} from the
     * data in the specified file.
     *
     * @param file a file containing a semantic space
     *
     * @throws IOException if any I/O exception occurs when reading the semantic
     *         space data from the file
     */
    public CachingOnDiskSemanticSpace(File file) throws IOException {
        backingSpace = new OnDiskSemanticSpace(file);
        wordToVector = new WeakHashMap<String,Vector>();
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return backingSpace.getSpaceName();
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return backingSpace.getWords();
    }

    /**
     * {@inheritDoc} If the word is in the semantic space, its vector will be
     * temporarily kept in memory so that subsequent calls need not go to
     * disk.  The cached entry may be dropped once the word used as its key is
     * no longer strongly referenced elsewhere.
     *
     * <p>
     *
     * The returned vector is always an immutable view, on a cache miss as well
     * as on a cache hit, so that callers cannot modify the cached data.
     *
     * @param word the word whose vector is requested
     *
     * @return an immutable view of the vector for {@code word}, or {@code
     *         null} if the backing space returns no vector for it
     *
     * @throws IOError if the backing semantic space reports an I/O failure
     *         while reading the vector from the underlying file
     */
    public synchronized Vector getVector(String word) {
        Vector vector = wordToVector.get(word);
        if (vector == null) {
            vector = backingSpace.getVector(word);
            // Unknown words are not cached; report the miss as-is.
            if (vector == null)
                return null;
            wordToVector.put(word, vector);
        }
        // Wrap on every path: handing out the raw instance on a miss would let
        // the caller mutate the very object that is stored in the cache.
        return Vectors.immutable(vector);
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return backingSpace.getVectorLength();
    }

    /**
     * Not supported; throws an {@link UnsupportedOperationException} if called.
     *
     * @param document ignored
     *
     * @throws UnsupportedOperationException always
     */
    public void processDocument(BufferedReader document) {
        throw new UnsupportedOperationException(
            "CachingOnDiskSemanticSpace instances cannot be updated");
    }

    /**
     * Not supported; throws an {@link UnsupportedOperationException} if called.
     *
     * @param props ignored
     *
     * @throws UnsupportedOperationException always
     */
    public void processSpace(Properties props) {
        throw new UnsupportedOperationException(
            "CachingOnDiskSemanticSpace instances cannot be updated");
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy