All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.tools.SemanticSpaceExplorer Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantic for words as high dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
 */

package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.DimensionallyInterpretableSemanticSpace;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceIO;
import edu.ucla.sspace.common.Similarity;

import edu.ucla.sspace.text.WordIterator;

import edu.ucla.sspace.util.NearestNeighborFinder;
import edu.ucla.sspace.util.PartitioningNearestNeighborFinder;

import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.VectorIO;

import edu.ucla.sspace.util.SortedMultiMap;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;


/**
 * A utility class that operates as a command-line tool for interacting with
 * semantic space files.  The utility also provides script execution
 * capabilities for its commands.  This allows users to develop custom methods
 * of interacting with one or more semantic spaces.  In additoin, scripting can
 * help automate certain forms of tests on the expected contents of a semantic
 * space.
 *
 * @author David Jurgens
 */
public class SemanticSpaceExplorer {

    /**
     * A set of commands that can be issued to the semantic space explorer.
     */
    private enum Command {
        LOAD, 
        UNLOAD, 
        GET_NEIGHBORS, 
        GET_SIMILARITY,
        COMPARE_SSPACE_VECTORS, 
        HELP,
        WRITE_COMMAND_RESULTS,
        SET_CURRENT_SSPACE,
        GET_CURRENT_SSPACE,
        PRINT_VECTOR,
        ALIAS,
        GET_WORDS,
        DESCRIBE_DIMENSION,
        DESCRIBE_SEMANTIC_SPACE
    }

    /**
     * A mapping from the abbreviation for a command to its {@link Command}
     * instance.
     */
    private static final Map abbreviatedCommands 
        = new HashMap();

    // For each of the commands, take the first letter of each word in its name
    // to form the abbreviated command string.
    static {
        for (Command c : Command.values()) {
            String[] commandWords = c.toString().split("_");
            StringBuilder abbv = new StringBuilder();
            for (String w : commandWords)
                abbv.append(w.charAt(0));
            abbreviatedCommands.put(abbv.toString().toLowerCase(), c);
        }
    }
    
    /**
     * The mapping from file name to the {@code SemanticSpace} that was loaded
     * from that file.
     */
    private final Map fileNameToSSpace;

    /**
     * The mapping from the alias of a semantic space to the file name from
     * which it was loaded.
     */
    private final Map aliasToFileName;

    /**
     * The current {@code SemanticSpace} to be used when invoking commands
     */
    private SemanticSpace current;

    /** 
     * The {@code NearestNeighborFinder} for the current {@code SemanticSpace}
     * or {@code null} if the nearest terms have yet to be searched for.
     */
    private NearestNeighborFinder currentNnf;

    /**
     * Constructs an instance of {@code SemanticSpaceExplorer}.
     */
    private SemanticSpaceExplorer() {        
        fileNameToSSpace = new LinkedHashMap();
        aliasToFileName = new HashMap();
        current = null;
    }

    /**
     * Returns the name of the file form which the current {@code SemanticSpace}
     * was loaded, or {@code null} if no semantic space is currently open.
     *
     * @return the name of the file from which the current space was loaded
     */
    private String getCurrentSSpaceFileName() {
        // REMINDER: This instruction is expected to be rare, so rather than
        // save the name and require a lookup every time the current sspace
        // is needed, we use an O(n) call to find the name as necessary
        for (Map.Entry e : fileNameToSSpace.entrySet()) {
            if (e.getValue() == current) {
                return e.getKey();
            }
        }
        return null;
    }

    /**
     * Returns the {@code SemanticSpace} linked to the name, either as an alias
     * or as a file name.
     *
     * @param name the alias or file name of a loaded semantic space
     *
     * @return the loaded semantic space or {@code null} no space with the
     *         provided name exists
     */
    private SemanticSpace getSSpace(String name) {
        String aliased = aliasToFileName.get(name);
        return (aliased != null) 
            ? fileNameToSSpace.get(aliased)
            : fileNameToSSpace.get(name);
    }

    /**
     * Executes the specified command and writes any output to standard out.  If
     * an error occurs an error message will be written instead.
     *
     * @param commandTokens the series of tokens that comprise the command and
     *        all of its arguments
     *
     * @return {@code true} if the command was successfully executed
     */
    public boolean execute(Iterator commandTokens) {
        return execute(commandTokens, System.out);
    }

    /**
     * Executes the specified command and writes any output to the provided
     * stream.  If an error occurs an error message will be written to the
     * stream instead.
     *
     * @param commandTokens the series of tokens that comprise the command and
     *        all of its arguments
     * @param out the stream to which any output should be written
     *
     * @return {@code true} if the command was successfully executed
     */
    private boolean execute(Iterator commandTokens, PrintStream out) {

        // No-op for empty commands
        if (!commandTokens.hasNext())
            return false;

        // Convert the name of the command into a Command Enum
        String commandStr = commandTokens.next();
        Command command = null;
        try {
            command = 
                Command.valueOf(commandStr.replaceAll("-", "_").toUpperCase());
        } catch (IllegalArgumentException iae) {
            command = abbreviatedCommands.get(commandStr);
            if (command == null) {
                out.println("Unknown command: " + commandStr);
                return false;
            }
        }

        // A giant switch statement for all of the commands 
        command_switch:
        switch (command) {

        // Loads the semantic space from a file
        case LOAD: {
            if (!commandTokens.hasNext()) {
                out.println("missing .sspace file argument");
                return false;
            }
            String sspaceFileName = commandTokens.next();
            
            // Don't re-open .sspace files that are already loaded
            if (fileNameToSSpace.containsKey(sspaceFileName))
                break;

            SemanticSpace sspace = null;
            try {
                sspace = SemanticSpaceIO.load(sspaceFileName);
            } catch (Throwable t) {
                // Catch Throwable since this method may throw an IOError
                out.println("an error occurred while loading the semantic " +
                            "space from " + sspaceFileName + ":\n" + t);
                t.printStackTrace();
            }
            fileNameToSSpace.put(sspaceFileName, sspace);
            current = sspace;
            currentNnf = null;
            break;
        }

        // Removes all references to the space, which free the associated
        // memory.
        case UNLOAD: {
            if (!commandTokens.hasNext()) {
                out.println("missing .sspace file argument");
                return false;
            }
            String sspaceName = commandTokens.next();
            String aliased = aliasToFileName.get(sspaceName);
            SemanticSpace removed = null;
            if (aliased != null) {
                aliasToFileName.remove(sspaceName);
                removed = fileNameToSSpace.remove(aliased);
            }
            else {
                removed = fileNameToSSpace.remove(sspaceName);
                // Remove the alias for the file if it existed
                Iterator> it = 
                    aliasToFileName.entrySet().iterator();
                while (it.hasNext()) {
                    Map.Entry e = it.next();
                    if (e.getValue().equals(sspaceName)) {
                        it.remove();
                        break;
                    }
                }
            }
            
            // If we are removing the current semantic space, reassign it to be
            // the oldest semantic space, or if none are available, null.
            if (removed == current) {
                Iterator it =
                    fileNameToSSpace.values().iterator();
                current = (it.hasNext()) ? it.next() : null;
            }
            break;
        }

        // Creates an alias for a semantic space file.  This is useful for long
        // file names.
        case ALIAS: {
            if (!commandTokens.hasNext()) {
                out.println("missing .sspace file argument");
                return false;
            }
            String fileName = commandTokens.next();
            if (!fileNameToSSpace.containsKey(fileName)) {
                out.println(fileName + "is not currently loaded");
                return false;
            }
            if (!commandTokens.hasNext()) {
                out.println("missing alias name");
                return false;
            }
            String alias = commandTokens.next();
            aliasToFileName.put(alias, fileName);
            break;
        }

        // Finds the nearest neighbors to a word in the current semantic space
        case GET_NEIGHBORS: {
            if (!commandTokens.hasNext()) {
                out.println("missing word argument");
                return false;
            }
            String focusWord = commandTokens.next();
            int neighbors = 10;
            if (commandTokens.hasNext()) {
                String countStr = commandTokens.next();
                try {
                    neighbors = Integer.parseInt(countStr);
                } catch (NumberFormatException nfe) {
                    out.println("invalid number of neighbors: " + countStr);
                    return false;
                }
            }
            
            // If this is the first time the nearest neighbors have been
            // searched for, construct a new NNF
            if (currentNnf == null) 
                currentNnf = new PartitioningNearestNeighborFinder(current);

            // Using the provided or default arguments find the closest
            // neighbors to the target word in the current semantic space
            SortedMultiMap mostSimilar =
                currentNnf.getMostSimilar(focusWord, neighbors);

            if (mostSimilar == null) {
                out.println(focusWord + 
                            " is not in the current semantic space");
            }
            else {
                // Print each of the neighbors and their similarity score 
                for (Map.Entry e : mostSimilar.entrySet()) {
                    out.println(e.getValue() + "\t" + e.getKey());
                }
            }
            break;
        }

        // Get the similarity for two words 
        case GET_SIMILARITY: {
            if (current == null) {
                out.println("no current semantic space");
                return false;
            }

            if (!commandTokens.hasNext()) {
                out.println("missing word argument");
                return false;
            }
            String word1 = commandTokens.next();

            if (!commandTokens.hasNext()) {
                out.println("missing word argument");
                return false;
            }
            String word2 = commandTokens.next();                      

            Similarity.SimType simType = Similarity.SimType.COSINE;
            if (commandTokens.hasNext()) {
                // Upper case since it's an enum
                String simTypeStr = commandTokens.next().toUpperCase();
                try {
                    simType = Similarity.SimType.valueOf(simTypeStr);
                } catch (IllegalArgumentException iae) {
                    // See if the user provided a prefix of the similarity
                    // measure's name
                    for (Similarity.SimType t : Similarity.SimType.values())
                        if (t.name().startsWith(simTypeStr)) 
                            simType = t;
                    // If no prefix was found, report an error
                    if (simType == null) {
                        out.println("invalid similarity measure: " +simTypeStr);
                        return false;
                    }
                }
            }

            Vector word1vec = current.getVector(word1);
            if (word1vec == null) {
                out.println(word1 + " is not in semantic space " 
                            + getCurrentSSpaceFileName());
                break;
            }
            Vector word2vec = current.getVector(word2);
            if (word2vec == null) {
                out.println(word2 + " is not in semantic space " 
                            + getCurrentSSpaceFileName());
                break;
            }
            
            double similarity = 
                Similarity.getSimilarity(simType, word1vec, word2vec);
            out.println(similarity);
            break;
        }

        // Compare the vectors for the same word from two different semantic
        // spaces
        case COMPARE_SSPACE_VECTORS: {

            if (!commandTokens.hasNext()) {
                out.println("missing word argument");
                return false;
            }
            String word = commandTokens.next();

            if (!commandTokens.hasNext()) {
                out.println("missing sspace argument");
                return false;
            }
            String name1 = commandTokens.next();
            SemanticSpace sspace1 = getSSpace(name1);
            if (sspace1 == null) {
                out.println("no such semantic space: " + name1);
                return false;
            }

            if (!commandTokens.hasNext()) {
                out.println("missing sspace argument");
                return false;
            }
            String name2 = commandTokens.next();
            SemanticSpace sspace2 = getSSpace(name2);
            if (sspace2 == null) {
                out.println("no such semantic space: " + name2);
                return false;
            }
            
            Similarity.SimType simType = Similarity.SimType.COSINE;
            if (commandTokens.hasNext()) {
                String simTypeStr = commandTokens.next();
                try {
                    simType = Similarity.SimType.valueOf(simTypeStr);
                } catch (IllegalArgumentException iae) {
                    out.println("invalid similarity measure: " + simTypeStr);
                    return false;
                }
            }

            // Get the vectors from each dimension
            Vector sspace1vec = sspace1.getVector(word);
            if (sspace1vec == null) {
                out.println(word + " is not in semantic space " 
                            + name1);
                break;
            }
            Vector sspace2vec = sspace2.getVector(word);
            if (sspace2vec == null) {
                out.println(word + " is not in semantic space " 
                            + name2);
                break;
            }

            // Ensure that the two have the same number of dimensions
            if (sspace1vec.length() != sspace2vec.length()) {
                out.println(name1 + " and " + name2 + " have different numbers "
                            + "of dimensions and are not comparable.");
                break;
            }

            double similarity = 
                Similarity.getSimilarity(simType, sspace1vec, sspace2vec);
            out.println(similarity);
            break;
        }

        case HELP: {
            out.println("available commands:\n" + getCommands());
            break;
        }

        // Write the results of a command to a file
        case WRITE_COMMAND_RESULTS: {
            if (!commandTokens.hasNext()) {
                out.println("missing file destination argument");
                return false;
            }
            String fileName = commandTokens.next();
            try {
                // Open up a new output stream where the command's results will
                // be sent
                PrintStream ps = new PrintStream(fileName);
                // Recursively call execute using the file as the new output
                // stream
                execute(commandTokens, ps);
                ps.close();
            } catch (IOException ioe) {
                out.println("An error occurred while writing to " + fileName + 
                            ":\n"  + ioe);
            }
            break;
        }

        // Print the vector for a word
        case PRINT_VECTOR: {
            if (current == null) {
                out.println("no current semantic space");
                return false;
            }

            if (!commandTokens.hasNext()) {
                out.println("missing word argument");
                return false;
            }
            String word = commandTokens.next();

            Vector vec = current.getVector(word);
            if (vec == null) {
                out.println(word + " is not in semantic space " +
                            getCurrentSSpaceFileName());
                break;
            }
            
            out.println(VectorIO.toString(vec));
            break;
        }

        // Update the current semantic space
        case SET_CURRENT_SSPACE: {
            if (!commandTokens.hasNext()) {
                out.println("missing .sspace file argument");
                return false;
            }
            String spaceName = commandTokens.next();
            // Check whether the name was an alias
            String fileName = aliasToFileName.get(spaceName);
            // If the argument wasn't an alias, the arg was the file name
            if (fileName == null)
                fileName = spaceName;
            
            SemanticSpace s = fileNameToSSpace.get(fileName);
            if (s == null) {
                out.println("no such .sspace (file is not currently loaded)");
                return false;
            }
            current = s;
            break;
        }

        // Get the name of the current semantic space
        case GET_CURRENT_SSPACE: {
            String currentSpaceName = getCurrentSSpaceFileName();
            if (currentSpaceName != null)
                out.println(currentSpaceName);
            else
                out.println("none");
            break;
        }

        // Prints out the words in the semantic space
        case GET_WORDS: {            
            String prefix = null;
            if (commandTokens.hasNext())
                prefix  = commandTokens.next();
            Set words = current.getWords();
            for (String word : words) {
                if (prefix == null)
                    out.println(word);
                else if (word.startsWith(prefix))
                    out.println(word);
            }
            break;
        }

        // Describes the dimension, if the current sspace has annotations
        case DESCRIBE_DIMENSION: {
            if (current instanceof DimensionallyInterpretableSemanticSpace) {
                if (!commandTokens.hasNext()) {
                    out.println("Must supply a dimension number");
                    break;
                }
                int dim = -1;
                String next = commandTokens.next();
                try {
                    dim = Integer.parseInt(next);                    
                } catch (NumberFormatException nfe) {
                    out.println("Invalid dimension: " + next);
                    break;
                }
                DimensionallyInterpretableSemanticSpace diss = 
                    (DimensionallyInterpretableSemanticSpace)current;
                try {
                    out.println(diss.getDimensionDescription(dim).toString());
                } catch (Exception e) {
                    out.println(e.getMessage());
                }
            }
            else
                out.println("Current space has no dimension descriptions");
            break;
        }

        // Prints out statistics on the current sspaces
        case DESCRIBE_SEMANTIC_SPACE: {
            if (current == null) {
                out.println("no .sspace loaded");
                break;
            }
            String name = current.getSpaceName();
            boolean hasDimDescriptions = 
                current instanceof DimensionallyInterpretableSemanticSpace;
            int dims = current.getVectorLength();
            int words = current.getWords().size();
            boolean isSparse = (current.getWords().isEmpty()) ||
                current.getVector(current.getWords().iterator().next())
                    instanceof SparseVector;
            out.println(name + ": " + words + " words, " 
                        + dims + " dimensions" 
                        + ((hasDimDescriptions) 
                           ? " with descriptions" : "")
                        + ((isSparse) ? ", sparse vectors"
                           : ", dense vectors"));
            break;
        }

        default: // should never get executed
            assert false : command;
        }
        
        return true;
    }

    /**
     * Returns a formatted list of the available commands that a {@code
     * SemanticSpaceExplorer} instance will recognize.
     *
     * @return the commands
     */
    private static String getCommands() {
        return
            "  load file1.sspace [file2.sspace...]\n" +
            "  unload file1.sspace [file2.sspace...]\n" +
            "  get-neighbors word [number (default 10)] [similarity measure]\n" +
            "  get-similarity word1 word2 [similarity measure " + 
            "(default cosine)]\n" +
            "  compare-sspace-vectors word sspace1 sspace2 " +
            "[similarity measure (default: cosine)]\n" +
            "  help\n" +
            "  set-current-sspace filename.sspace\n" +
            "  get-current-sspace\n" +
            "  alias filename.sspace name\n" + 
            "  write-command-results output-file command...\n" +
            "  print-vector word\n" +
            "  get-words [string-prefix]\n" +
            "  describe-dimension number\n" +
            "  describe-semantic-space\n";
    }

    /**
     * Prints the options and supported commands used by this program.
     *
     * @param options the options supported by the system
     */
    private static void usage(ArgOptions options) {
        System.out.println("usage: java SemanticSpaceExplorer [options]\n\n" +
                           "Command line options:\n" + options.prettyPrint() +
                           "\n\nExplorer commands:\n" + getCommands());
    }

    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        options.addOption('f', "executeFile", "Executes the commands in the " +
                          "specified file and exits", true, "FILE",
                          "Program Options");
        options.addOption('s', "saveRecord", "Saves a record of all the " +
                          "executed commands to the specfied file", true,
                          "FILE",  "Program Options");

        options.parseOptions(args);

        if (options.hasOption("help")) {
            usage(options);
            return;
        }

        PrintWriter recordFile = null;
        if (options.hasOption("saveRecord")) {
            try {
                recordFile = new PrintWriter(
                    options.getStringOption("saveRecord"));
            } catch (IOException ioe) {
                System.out.println("Unable to open file for saving commands:\n"
                                   + ioe);
            }
        }
        
        BufferedReader commandsToExecute = null; 
        if (options.hasOption("executeFile")) {
            try {
                commandsToExecute = new BufferedReader(new FileReader(
                    options.getStringOption("executeFile")));
            } catch (IOException ioe) {
                System.out.println("unable to open commands file " + 
                                   options.getStringOption("executeFile") 
                                   + ":\n" + ioe);
                return;
            }
        }
        else {
            commandsToExecute =
                new BufferedReader(new InputStreamReader(System.in));
        }

        boolean suppressPrompt = options.hasOption("executeFile");

        SemanticSpaceExplorer explorer = new SemanticSpaceExplorer();
        try {
            if (!suppressPrompt)
                System.out.print("> ");
            for (String command = null; 
                     (command = commandsToExecute.readLine()) != null; ) {
                Iterator commandTokens = new WordIterator(command);
                if (explorer.execute(commandTokens) && recordFile != null) {
                    recordFile.println(command);
                }
                if (!suppressPrompt)
                    System.out.print("> ");
            }
        } catch (IOException ioe) {
            System.out.println("An error occurred while reading in a command:\n"
                               + ioe);
        }
        if (recordFile != null) {
            recordFile.close();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy