edu.stanford.nlp.parser.lexparser.IntTaggedWord Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English-language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.StringUtils;

import java.io.Serializable;

/**
 * Represents a WordTag (in the sense that equality is defined
 * on both components), where each half is represented by an
 * int indexed by an {@link Index}.  In this representation, -1 is
 * used to represent the wildcard ANY value, and -2 is used
 * to represent a STOP value (i.e., no more dependents).
 *
 * <p>Instances are immutable: both fields are {@code final}.  The tag is
 * stored as a {@code short}; constructors narrow their int tag with a plain
 * cast, so a tag id outside the {@code short} range is silently truncated.
 *
 * <p>Ordering ({@link #compareTo}) is by tag first and then by word, and is
 * consistent with {@link #equals}.
 *
 * TODO: does that cause any problems regarding unseen words also being -1?
 * TODO: any way to not have links to the Index in each object?
 *
 * @author Dan Klein
 * @author Christopher Manning
 */
public class IntTaggedWord implements Serializable, Comparable<IntTaggedWord> {

  /** Word id standing for the wildcard (any word). */
  public static final int ANY_WORD_INT = -1;
  /** Tag id standing for the wildcard (any tag). */
  public static final int ANY_TAG_INT = -1;
  /** Word id standing for STOP. */
  public static final int STOP_WORD_INT = -2;
  /** Tag id standing for STOP. */
  public static final int STOP_TAG_INT = -2;

  /** String form of the wildcard value, for both words and tags. */
  public static final String ANY = ".*.";
  /** String form of the STOP value, for both words and tags. */
  public static final String STOP = "STOP";

  /** Characters escaped by {@link #toLexicalEntry}. */
  private static final char[] CHARS_TO_ESCAPE = { '\"' };

  private static final long serialVersionUID = 1L;

  /** Word id: non-negative index value, or one of the ANY/STOP sentinels. */
  public final int word;
  /** Tag id: non-negative index value, or one of the ANY/STOP sentinels. */
  public final short tag;

  /**
   * Creates an IntTaggedWord directly from integer ids.
   *
   * @param word the word id
   * @param tag the tag id; narrowed to {@code short} by a cast
   */
  public IntTaggedWord(int word, int tag) {
    this.word = word;
    this.tag = (short) tag;
  }

  /**
   * Creates an IntTaggedWord from a single String of the form
   * {@code word + splitChar + tag}.  The String is split at the
   * <em>last</em> occurrence of {@code splitChar}, so the word part may
   * itself contain that character.
   *
   * @param s the combined word/tag String
   * @param splitChar the character separating word from tag
   * @param wordIndex index used to map the word String to an int
   * @param tagIndex index used to map the tag String to an int
   * @throws IndexOutOfBoundsException if {@code s} does not contain
   *         {@code splitChar}
   */
  public IntTaggedWord(String s, char splitChar,
                       Index<String> wordIndex, Index<String> tagIndex) {
    // this(...) must be the first statement, so the split position is
    // computed once in each helper rather than shared.
    this(extractWord(s, splitChar), extractTag(s, splitChar),
         wordIndex, tagIndex);
  }

  /**
   * Creates an IntTaggedWord from separate word and tag Strings.
   * The Strings {@link #ANY} and {@link #STOP} map to the corresponding
   * sentinel ids; any other String is passed to {@code addToIndex} on the
   * matching index and the returned int is stored.
   *
   * @param wordString the word, or {@link #ANY} / {@link #STOP}
   * @param tagString the tag, or {@link #ANY} / {@link #STOP}
   * @param wordIndex index used for ordinary words
   * @param tagIndex index used for ordinary tags
   */
  public IntTaggedWord(String wordString, String tagString,
                       Index<String> wordIndex, Index<String> tagIndex) {
    this.word = encode(wordString, wordIndex, ANY_WORD_INT, STOP_WORD_INT);
    this.tag = (short) encode(tagString, tagIndex, ANY_TAG_INT, STOP_TAG_INT);
  }

  /** Maps a String to its int id, handling the ANY and STOP sentinels. */
  private static int encode(String s, Index<String> index,
                            int anyInt, int stopInt) {
    switch (s) {
      case ANY:
        return anyInt;
      case STOP:
        return stopInt;
      default:
        return index.addToIndex(s);
    }
  }

  /** Returns the part of {@code s} before the last {@code splitChar}. */
  private static String extractWord(String s, char splitChar) {
    int n = s.lastIndexOf(splitChar);
    return s.substring(0, n);
  }

  /** Returns the part of {@code s} after the last {@code splitChar}. */
  private static String extractTag(String s, char splitChar) {
    int n = s.lastIndexOf(splitChar);
    return s.substring(n + 1);
  }

  /** Returns the tag id (the {@code short} field widened to int). */
  public int tag() {
    return tag;
  }

  /** Returns the word id. */
  public int word() {
    return word;
  }

  /**
   * Returns the String for this object's word.
   *
   * @param wordIndex index consulted when the word id is non-negative
   * @return the indexed String for a non-negative id, {@link #ANY} for
   *         {@link #ANY_WORD_INT}, and {@link #STOP} for any other
   *         negative id
   */
  public String wordString(Index<String> wordIndex) {
    if (word >= 0) {
      return wordIndex.get(word);
    }
    return (word == ANY_WORD_INT) ? ANY : STOP;
  }

  /**
   * Returns the String for this object's tag.
   *
   * @param tagIndex index consulted when the tag id is non-negative
   * @return the indexed String for a non-negative id, {@link #ANY} for
   *         {@link #ANY_TAG_INT}, and {@link #STOP} for any other
   *         negative id
   */
  public String tagString(Index<String> tagIndex) {
    if (tag >= 0) {
      return tagIndex.get(tag);
    }
    return (tag == ANY_TAG_INT) ? ANY : STOP;
  }

  @Override
  public int hashCode() {
    return word ^ (tag << 16);
  }

  /** Two IntTaggedWords are equal iff both word id and tag id match. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof IntTaggedWord)) {
      return false;
    }
    IntTaggedWord other = (IntTaggedWord) o;
    return word == other.word && tag == other.tag;
  }

  /**
   * Orders by tag id, breaking ties by word id.
   * Uses {@link Integer#compare} rather than subtraction so that the
   * word comparison cannot overflow.
   */
  @Override
  public int compareTo(IntTaggedWord that) {
    int byTag = Integer.compare(tag, that.tag);
    return (byTag != 0) ? byTag : Integer.compare(word, that.word);
  }

  /**
   * Renders this object as {@code "tag" -> "word"}, with double quotes
   * inside the tag and word Strings escaped by a backslash.
   *
   * @param wordIndex index used to resolve the word String
   * @param tagIndex index used to resolve the tag String
   */
  public String toLexicalEntry(Index<String> wordIndex,
                               Index<String> tagIndex) {
    String wordStr = wordString(wordIndex);
    String tagStr = tagString(tagIndex);
    return "\"" + StringUtils.escapeString(tagStr, CHARS_TO_ESCAPE, '\\')
        + "\" -> \""
        + StringUtils.escapeString(wordStr, CHARS_TO_ESCAPE, '\\') + "\"";
  }

  /** Returns the raw ids in the form {@code word/tag}. */
  @Override
  public String toString() {
    return word + "/" + tag;
  }

  /** Returns the resolved Strings in the form {@code word/tag}. */
  public String toString(Index<String> wordIndex, Index<String> tagIndex) {
    return wordString(wordIndex) + '/' + tagString(tagIndex);
  }

  /**
   * Returns a String form selected by {@code arg}.
   *
   * @param arg {@code "verbose"} to append the raw ids in brackets after
   *        each resolved String; any other value (including {@code null})
   *        gives the same result as {@link #toString(Index, Index)}
   * @param wordIndex index used to resolve the word String
   * @param tagIndex index used to resolve the tag String
   */
  public String toString(String arg,
                         Index<String> wordIndex, Index<String> tagIndex) {
    // Constant-first equals so a null arg falls through to the plain form.
    if ("verbose".equals(arg)) {
      return (wordString(wordIndex) + '[' + word + "]/" +
              tagString(tagIndex) + '[' + tag + ']');
    }
    return toString(wordIndex, tagIndex);
  }

  /**
   * Converts this object to a {@link TaggedWord} holding the resolved
   * word and tag Strings.
   *
   * @param wordIndex index used to resolve the word String
   * @param tagIndex index used to resolve the tag String
   */
  public TaggedWord toTaggedWord(Index<String> wordIndex,
                                 Index<String> tagIndex) {
    String wordStr = wordString(wordIndex);
    String tagStr = tagString(tagIndex);
    return new TaggedWord(wordStr, tagStr);
  }

} // end class IntTaggedWord