edu.stanford.nlp.ling.IndexedWord Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.ling;

import java.util.Set;

import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;

/**
 * This class provides a {@link CoreLabel} that uses its
 * DocIDAnnotation, SentenceIndexAnnotation, and IndexAnnotation to implement
 * Comparable/compareTo, hashCode, and equals.  This means no other annotations,
 * including the identity of the word, are taken into account when using these
 * methods. Historically, this class was introduced for and is mainly used in
 * the RTE package, and it provides a number of methods that are really specific
 * to that use case. A second use case is now the Stanford Dependencies code,
 * where this class directly implements the "copy nodes" of section 4.6 of the
 * Stanford Dependencies Manual, rather than these being placed directly in the
 * backing CoreLabel. This was so there can stay one CoreLabel per token, despite
 * there being multiple IndexedWord nodes, additional ones representing copy
 * nodes.
 * 
 * The actual implementation is to wrap a {@code CoreLabel}.
 * This avoids breaking the {@code equals()} and
 * {@code hashCode()} contract and also avoids expensive copying
 * when used to represent the same data as the original
 * {@code CoreLabel}.
 *
 * @author rafferty
 * @author John Bauer
 * @author Sonal Gupta
 */
public class IndexedWord implements AbstractCoreLabel, Comparable {

  private static final long serialVersionUID = 3739633991145239829L;

  /**
   * The identifier that points to no word.
   */
  public static final IndexedWord NO_WORD = new IndexedWord(null, -1, -1);

  private final CoreLabel label;

  private int copyCount; // = 0;
  
  private int numCopies = 0;
  
  private IndexedWord original = null;

  /**
   * Useful for specifying a fine-grained position when butchering parse trees.
   * The canonical use case for this is resolving coreference in the OpenIE system, where
   * we want to move nodes between sentences, but do not want to change their index annotation
   * (plus, we need to have multiple nodes fit into the space of one pronoun).
   */
  private double pseudoPosition = Double.NaN;

  /**
   * Default constructor; uses {@link CoreLabel} default constructor
   */
  public IndexedWord() {
    label = new CoreLabel();
  }


  /**
   * Copy Constructor - relies on {@link CoreLabel} copy constructor
   * It will set the value, and if the word is not set otherwise, set
   * the word to the value.
   *
   * @param w A Label to initialize this IndexedWord from
   */
  public IndexedWord(Label w) {
    if (w instanceof CoreLabel) {
      this.label = (CoreLabel) w;
    } else {
      label = new CoreLabel(w);
      if (label.word() == null) {
        label.setWord(label.value());
      }
    }
  }

  /**
   * Construct an IndexedWord from a CoreLabel just as for a CoreMap.
   * Implementation note: this is a the same as the constructor
   * that takes a CoreMap, but is needed to ensure unique most specific
   * type inference for selecting a constructor at compile-time.
   *
   * @param w A Label to initialize this IndexedWord from
   */
  public IndexedWord(CoreLabel w) {
    label = w;
  }

  /**
   * Constructor for setting docID, sentenceIndex, and
   * index without any other annotations.
   *
   * @param docID The document ID (arbitrary string)
   * @param sentenceIndex The sentence number in the document (normally 0-based)
   * @param index The index of the word in the sentence (normally 0-based)
   */
  public IndexedWord(String docID, int sentenceIndex, int index) {
    label = new CoreLabel();
    label.set(CoreAnnotations.DocIDAnnotation.class, docID);
    label.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
    label.set(CoreAnnotations.IndexAnnotation.class, index);
  }

  public IndexedWord makeCopy(int count) {
    CoreLabel labelCopy = new CoreLabel(label);
    IndexedWord copy = new IndexedWord(labelCopy);
    copy.setCopyCount(count);
    return copy;
  }
  
  public IndexedWord makeCopy() {
    return makeCopy(++numCopies);
  }

  public IndexedWord makeSoftCopy(int count) {
    IndexedWord copy = new IndexedWord(label);
    copy.setCopyCount(count);
    copy.original = this;
    return copy;
  }
  
  public IndexedWord makeSoftCopy() {
    if (original != null) {
      return original.makeSoftCopy();
    } else {
      return makeSoftCopy(++numCopies);
    }
  }
  
  public IndexedWord getOriginal() {
    return original;
  }

  /**
   * TODO: get rid of this.  Only used in two places in RTE (in rewriter code)
   */
  public CoreLabel backingLabel() { return label; }

  @Override
  public  VALUE get(Class> key) {
    return label.get(key);
  }

  @Override
  public  boolean has(Class> key) {
    return label.has(key);
  }

  @Override
  public  boolean containsKey(Class> key) {
    return label.containsKey(key);
  }

  @Override
  public  VALUE set(Class> key, VALUE value) {
    return label.set(key, value);
  }

  @Override
  public > String getString(Class key) {
    return label.getString(key);
  }

  @Override
  public > String getString(Class key, String def) {
    return label.getString(key, def);
  }

  @Override
  public  VALUE remove(Class> key) {
    return label.remove(key);
  }

  @Override
  public Set> keySet() {
    return label.keySet();
  }

  @Override
  public int size() {
    return label.size();
  }

  @Override
  public String value() {
    return label.value();
  }

  @Override
  public void setValue(String value) {
    label.setValue(value);
  }

  @Override
  public String tag() {
    return label.tag();
  }

  @Override
  public void setTag(String tag) {
    label.setTag(tag);
  }

  @Override
  public String word() {
    return label.word();
  }

  @Override
  public void setWord(String word) {
    label.setWord(word);
  }

  @Override
  public String lemma() {
    return label.lemma();
  }

  @Override
  public void setLemma(String lemma) {
    label.setLemma(lemma);
  }

  @Override
  public String ner() {
    return label.ner();
  }

  @Override
  public void setNER(String ner) {
    label.setNER(ner);
  }

  @Override
  public String docID() {
    return label.docID();
  }

  @Override
  public void setDocID(String docID) {
    label.setDocID(docID);
  }

  @Override
  public int index() {
    return label.index();
  }

  @Override
  public void setIndex(int index) {
    label.setIndex(index);
  }

  /**
   * In most cases, this is just the index of the word.
   * However, this should be the value used to sort nodes in
   * a tree.
   *
   * @see IndexedWord#pseudoPosition
   */
  public double pseudoPosition() {
    if (!Double.isNaN(pseudoPosition)) {
      return pseudoPosition;
    } else {
      return (double) index();
    }
  }

  /**
   * @see IndexedWord#pseudoPosition
   */
  public void setPseudoPosition(double position) {
    this.pseudoPosition = position;
  }

  @Override
  public int sentIndex() {
    return label.sentIndex();
  }

  @Override
  public void setSentIndex(int sentIndex) {
    label.setSentIndex(sentIndex);
  }

  @Override
  public String originalText() {
    return label.originalText();
  }

  @Override
  public void setOriginalText(String originalText) {
    label.setOriginalText(originalText);
  }

  @Override
  public int beginPosition() {
    return label.beginPosition();
  }

  @Override
  public int endPosition() {
    return label.endPosition();
  }

  @Override
  public void setBeginPosition(int beginPos) {
    label.setBeginPosition(beginPos);
  }

  @Override
  public void setEndPosition(int endPos) {
    label.setEndPosition(endPos);
  }

  public int copyCount() {
    return copyCount;
  }

  public void setCopyCount(int count) {
    this.copyCount = count;
  }

  public String toPrimes() {
    return StringUtils.repeat('\'', copyCount);
  }
  
  public boolean isCopy(IndexedWord otherWord) {
    Integer myInd = get(CoreAnnotations.IndexAnnotation.class);
    Integer otherInd = otherWord.get(CoreAnnotations.IndexAnnotation.class);
    if (myInd == null) {
      if (otherInd != null)
      return false;
    } else if ( ! myInd.equals(otherInd)) {
      return false;
    }
    Integer mySentInd = get(CoreAnnotations.SentenceIndexAnnotation.class);
    Integer otherSentInd = otherWord.get(CoreAnnotations.SentenceIndexAnnotation.class);
    if (mySentInd == null) {
      if (otherSentInd != null)
      return false;
    } else if ( ! mySentInd.equals(otherSentInd)) {
      return false;
    }
    String myDocID = getString(CoreAnnotations.DocIDAnnotation.class);
    String otherDocID = otherWord.getString(CoreAnnotations.DocIDAnnotation.class);
    if (myDocID == null) {
      if (otherDocID != null)
      return false;
    } else if ( ! myDocID.equals(otherDocID)) {
      return false;
    }
    
    if (copyCount() == 0 || otherWord.copyCount() != 0) {
      return false;
    }

    return true;
  }

  /**
   * This .equals is dependent only on docID, sentenceIndex, and index.
   * It doesn't consider the actual word value, but assumes that it is
   * validly represented by token position.
   * All IndexedWords that lack these fields will be regarded as equal.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof IndexedWord)) return false;

    //now compare on appropriate keys
    final IndexedWord otherWord = (IndexedWord) o;
    Integer myInd = get(CoreAnnotations.IndexAnnotation.class);
    Integer otherInd = otherWord.get(CoreAnnotations.IndexAnnotation.class);
    if (myInd == null) {
      if (otherInd != null)
      return false;
    } else if ( ! myInd.equals(otherInd)) {
      return false;
    }
    Integer mySentInd = get(CoreAnnotations.SentenceIndexAnnotation.class);
    Integer otherSentInd = otherWord.get(CoreAnnotations.SentenceIndexAnnotation.class);
    if (mySentInd == null) {
      if (otherSentInd != null)
      return false;
    } else if ( ! mySentInd.equals(otherSentInd)) {
      return false;
    }
    String myDocID = getString(CoreAnnotations.DocIDAnnotation.class);
    String otherDocID = otherWord.getString(CoreAnnotations.DocIDAnnotation.class);
    if (myDocID == null) {
      if (otherDocID != null)
      return false;
    } else if ( ! myDocID.equals(otherDocID)) {
      return false;
    }
    if (copyCount() != otherWord.copyCount()) {
      return false;
    }
    // Compare pseudo-positions
    if ( (!Double.isNaN(this.pseudoPosition) || !Double.isNaN(otherWord.pseudoPosition)) &&
         this.pseudoPosition != otherWord.pseudoPosition) {
      return false;
    }
    return true;
  }


  private int cachedHashCode = 0;
  /**
   * This hashCode uses only the docID, sentenceIndex, and index.
   * See compareTo for more info.
   */
  @Override
  public int hashCode() {
    if (cachedHashCode != 0) {
      return cachedHashCode;
    }
    boolean sensible = false;
    int result = 0;
    if (get(CoreAnnotations.DocIDAnnotation.class) != null) {
      result = get(CoreAnnotations.DocIDAnnotation.class).hashCode();
      sensible = true;
    }
    if (has(CoreAnnotations.SentenceIndexAnnotation.class)) {
      result = 29 * result + get(CoreAnnotations.SentenceIndexAnnotation.class).hashCode();
      sensible = true;
    }
    if (has(CoreAnnotations.IndexAnnotation.class)) {
      result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode();
      sensible = true;
    }
    if ( ! sensible) {
      System.err.println("WARNING!!!  You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose");
    }
    cachedHashCode = result;
    return result;
  }

  /**
   * NOTE: This compareTo is based on and made to be compatible with the one
   * from IndexedFeatureLabel.  You must have a DocIDAnnotation,
   * SentenceIndexAnnotation, and IndexAnnotation for this to make sense and
   * be guaranteed to work properly. Currently, it won't error out and will
   * try to return something sensible if these are not defined, but that really
   * isn't proper usage!
   *
   * This compareTo method is based not by value elements like the word(),
   *  but on passage position. It puts NO_WORD elements first, and then orders
   *  by document, sentence, and word index.  If these do not differ, it
   *  returns equal.
   *
   *  @param w The IndexedWord to compare with
   *  @return Whether this is less than w or not in the ordering
   */
  @Override
  public int compareTo(IndexedWord w) {
    if (this.equals(IndexedWord.NO_WORD)) {
      if (w.equals(IndexedWord.NO_WORD)) {
        return 0;
      } else {
        return -1;
      }
    }
    if (w.equals(IndexedWord.NO_WORD)) {
      return 1;
    }

    // Override the default comparator if pseudo-positions are set.
    // This is needed for splicing trees together awkwardly in OpenIE.
    if (!Double.isNaN(w.pseudoPosition) || !Double.isNaN(this.pseudoPosition)) {
      double val = this.pseudoPosition() - w.pseudoPosition();
      if (val < 0) { return -1; }
      if (val > 0) { return 1; }
      else { return 0; }
    }

    // Otherwise, compare using the normal doc/sentence/token index hierarchy
    String docID = this.getString(CoreAnnotations.DocIDAnnotation.class);
    int docComp = docID.compareTo(w.getString(CoreAnnotations.DocIDAnnotation.class));
    if (docComp != 0) return docComp;

    int sentComp = sentIndex() - w.sentIndex();
    if (sentComp != 0) return sentComp;

    int indexComp = index() - w.index();
    if (indexComp != 0) return indexComp;

    return copyCount() - w.copyCount();
  }

  /**
   * Returns the value-tag of this label.
   */
  @Override
  public String toString() {
    return toString(CoreLabel.OutputFormat.VALUE_TAG);
  }

  public String toString(CoreLabel.OutputFormat format) {
    return label.toString(format) + toPrimes();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void setFromString(String labelStr) {
    throw new UnsupportedOperationException("Cannot set from string");
  }


  public static LabelFactory factory() {
    return new LabelFactory() {

      @Override
      public Label newLabel(String labelStr) {
        CoreLabel coreLabel = new CoreLabel();
        coreLabel.setValue(labelStr);
        return new IndexedWord(coreLabel);
      }

      @Override
      public Label newLabel(String labelStr, int options) {
        return newLabel(labelStr);
      }

      @Override
      public Label newLabel(Label oldLabel) {
        return new IndexedWord(oldLabel);
      }

      @Override
      public Label newLabelFromString(String encodedLabelStr) {
        throw new UnsupportedOperationException("This code branch left blank" +
        " because we do not understand what this method should do.");
      }
    };
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public LabelFactory labelFactory() {
    return IndexedWord.factory();
  }

}