All downloads are free. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.ling.IndexedWord Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.ling;

import java.util.Set;

import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;

/**
 * This class provides a {@link CoreLabel} that uses its
 * DocIDAnnotation, SentenceIndexAnnotation, and IndexAnnotation to implement
 * Comparable/compareTo, hashCode, and equals.  This means no other annotations,
 * including the identity of the word, are taken into account when using these
 * methods. Historically, this class was introduced for and is mainly used in
 * the RTE package, and it provides a number of methods that are really specific
 * to that use case. A second use case is now the Stanford Dependencies code,
 * where this class directly implements the "copy nodes" of section 4.6 of the
 * Stanford Dependencies Manual, rather than these being placed directly in the
 * backing CoreLabel. This was so there can stay one CoreLabel per token, despite
 * there being multiple IndexedWord nodes, additional ones representing copy
 * nodes.
 * 

* The actual implementation is to wrap a {@code CoreLabel}. * This avoids breaking the {@code equals()} and * {@code hashCode()} contract and also avoids expensive copying * when used to represent the same data as the original * {@code CoreLabel}. * * @author rafferty * @author John Bauer * @author Sonal Gupta */ public class IndexedWord implements AbstractCoreLabel, Comparable { private static final long serialVersionUID = 3739633991145239829L; /** * The identifier that points to no word. */ public static final IndexedWord NO_WORD = new IndexedWord(null, -1, -1); private final CoreLabel label; private int copyCount; // = 0; private int numCopies = 0; private IndexedWord original = null; /** * Useful for specifying a fine-grained position when butchering parse trees. * The canonical use case for this is resolving coreference in the OpenIE system, where * we want to move nodes between sentences, but do not want to change their index annotation * (plus, we need to have multiple nodes fit into the space of one pronoun). */ private double pseudoPosition = Double.NaN; /** * Default constructor; uses {@link CoreLabel} default constructor */ public IndexedWord() { label = new CoreLabel(); } /** * Copy Constructor - relies on {@link CoreLabel} copy constructor * It will set the value, and if the word is not set otherwise, set * the word to the value. * * @param w A Label to initialize this IndexedWord from */ public IndexedWord(Label w) { if (w instanceof CoreLabel) { this.label = (CoreLabel) w; } else { label = new CoreLabel(w); if (label.word() == null) { label.setWord(label.value()); } } } /** * Construct an IndexedWord from a CoreLabel just as for a CoreMap. * Implementation note: this is a the same as the constructor * that takes a CoreMap, but is needed to ensure unique most specific * type inference for selecting a constructor at compile-time. 
* * @param w A Label to initialize this IndexedWord from */ public IndexedWord(CoreLabel w) { label = w; } /** * Constructor for setting docID, sentenceIndex, and * index without any other annotations. * * @param docID The document ID (arbitrary string) * @param sentenceIndex The sentence number in the document (normally 0-based) * @param index The index of the word in the sentence (normally 0-based) */ public IndexedWord(String docID, int sentenceIndex, int index) { label = new CoreLabel(); label.set(CoreAnnotations.DocIDAnnotation.class, docID); label.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); label.set(CoreAnnotations.IndexAnnotation.class, index); } public IndexedWord makeCopy(int count) { CoreLabel labelCopy = new CoreLabel(label); IndexedWord copy = new IndexedWord(labelCopy); copy.setCopyCount(count); return copy; } public IndexedWord makeCopy() { return makeCopy(++numCopies); } public IndexedWord makeSoftCopy(int count) { IndexedWord copy = new IndexedWord(label); copy.setCopyCount(count); copy.original = this; return copy; } public IndexedWord makeSoftCopy() { if (original != null) { return original.makeSoftCopy(); } else { return makeSoftCopy(++numCopies); } } public IndexedWord getOriginal() { return original; } /** * TODO: get rid of this. 
Only used in two places in RTE (in rewriter code) */ public CoreLabel backingLabel() { return label; } @Override public VALUE get(Class> key) { return label.get(key); } @Override public boolean has(Class> key) { return label.has(key); } @Override public boolean containsKey(Class> key) { return label.containsKey(key); } @Override public VALUE set(Class> key, VALUE value) { return label.set(key, value); } @Override public > String getString(Class key) { return label.getString(key); } @Override public > String getString(Class key, String def) { return label.getString(key, def); } @Override public VALUE remove(Class> key) { return label.remove(key); } @Override public Set> keySet() { return label.keySet(); } @Override public int size() { return label.size(); } @Override public String value() { return label.value(); } @Override public void setValue(String value) { label.setValue(value); } @Override public String tag() { return label.tag(); } @Override public void setTag(String tag) { label.setTag(tag); } @Override public String word() { return label.word(); } @Override public void setWord(String word) { label.setWord(word); } @Override public String lemma() { return label.lemma(); } @Override public void setLemma(String lemma) { label.setLemma(lemma); } @Override public String ner() { return label.ner(); } @Override public void setNER(String ner) { label.setNER(ner); } @Override public String docID() { return label.docID(); } @Override public void setDocID(String docID) { label.setDocID(docID); } @Override public int index() { return label.index(); } @Override public void setIndex(int index) { label.setIndex(index); } /** * In most cases, this is just the index of the word. * However, this should be the value used to sort nodes in * a tree. 
* * @see IndexedWord#pseudoPosition */ public double pseudoPosition() { if (!Double.isNaN(pseudoPosition)) { return pseudoPosition; } else { return (double) index(); } } /** * @see IndexedWord#pseudoPosition */ public void setPseudoPosition(double position) { this.pseudoPosition = position; } @Override public int sentIndex() { return label.sentIndex(); } @Override public void setSentIndex(int sentIndex) { label.setSentIndex(sentIndex); } @Override public String originalText() { return label.originalText(); } @Override public void setOriginalText(String originalText) { label.setOriginalText(originalText); } @Override public int beginPosition() { return label.beginPosition(); } @Override public int endPosition() { return label.endPosition(); } @Override public void setBeginPosition(int beginPos) { label.setBeginPosition(beginPos); } @Override public void setEndPosition(int endPos) { label.setEndPosition(endPos); } public int copyCount() { return copyCount; } public void setCopyCount(int count) { this.copyCount = count; } public String toPrimes() { return StringUtils.repeat('\'', copyCount); } public boolean isCopy(IndexedWord otherWord) { Integer myInd = get(CoreAnnotations.IndexAnnotation.class); Integer otherInd = otherWord.get(CoreAnnotations.IndexAnnotation.class); if (myInd == null) { if (otherInd != null) return false; } else if ( ! myInd.equals(otherInd)) { return false; } Integer mySentInd = get(CoreAnnotations.SentenceIndexAnnotation.class); Integer otherSentInd = otherWord.get(CoreAnnotations.SentenceIndexAnnotation.class); if (mySentInd == null) { if (otherSentInd != null) return false; } else if ( ! mySentInd.equals(otherSentInd)) { return false; } String myDocID = getString(CoreAnnotations.DocIDAnnotation.class); String otherDocID = otherWord.getString(CoreAnnotations.DocIDAnnotation.class); if (myDocID == null) { if (otherDocID != null) return false; } else if ( ! 
myDocID.equals(otherDocID)) { return false; } if (copyCount() == 0 || otherWord.copyCount() != 0) { return false; } return true; } /** * This .equals is dependent only on docID, sentenceIndex, and index. * It doesn't consider the actual word value, but assumes that it is * validly represented by token position. * All IndexedWords that lack these fields will be regarded as equal. */ @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof IndexedWord)) return false; //now compare on appropriate keys final IndexedWord otherWord = (IndexedWord) o; Integer myInd = get(CoreAnnotations.IndexAnnotation.class); Integer otherInd = otherWord.get(CoreAnnotations.IndexAnnotation.class); if (myInd == null) { if (otherInd != null) return false; } else if ( ! myInd.equals(otherInd)) { return false; } Integer mySentInd = get(CoreAnnotations.SentenceIndexAnnotation.class); Integer otherSentInd = otherWord.get(CoreAnnotations.SentenceIndexAnnotation.class); if (mySentInd == null) { if (otherSentInd != null) return false; } else if ( ! mySentInd.equals(otherSentInd)) { return false; } String myDocID = getString(CoreAnnotations.DocIDAnnotation.class); String otherDocID = otherWord.getString(CoreAnnotations.DocIDAnnotation.class); if (myDocID == null) { if (otherDocID != null) return false; } else if ( ! myDocID.equals(otherDocID)) { return false; } if (copyCount() != otherWord.copyCount()) { return false; } // Compare pseudo-positions if ( (!Double.isNaN(this.pseudoPosition) || !Double.isNaN(otherWord.pseudoPosition)) && this.pseudoPosition != otherWord.pseudoPosition) { return false; } return true; } private int cachedHashCode = 0; /** * This hashCode uses only the docID, sentenceIndex, and index. * See compareTo for more info. 
*/ @Override public int hashCode() { if (cachedHashCode != 0) { return cachedHashCode; } boolean sensible = false; int result = 0; if (get(CoreAnnotations.DocIDAnnotation.class) != null) { result = get(CoreAnnotations.DocIDAnnotation.class).hashCode(); sensible = true; } if (has(CoreAnnotations.SentenceIndexAnnotation.class)) { result = 29 * result + get(CoreAnnotations.SentenceIndexAnnotation.class).hashCode(); sensible = true; } if (has(CoreAnnotations.IndexAnnotation.class)) { result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode(); sensible = true; } if ( ! sensible) { System.err.println("WARNING!!! You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose"); } cachedHashCode = result; return result; } /** * NOTE: This compareTo is based on and made to be compatible with the one * from IndexedFeatureLabel. You must have a DocIDAnnotation, * SentenceIndexAnnotation, and IndexAnnotation for this to make sense and * be guaranteed to work properly. Currently, it won't error out and will * try to return something sensible if these are not defined, but that really * isn't proper usage! * * This compareTo method is based not by value elements like the word(), * but on passage position. It puts NO_WORD elements first, and then orders * by document, sentence, and word index. If these do not differ, it * returns equal. * * @param w The IndexedWord to compare with * @return Whether this is less than w or not in the ordering */ @Override public int compareTo(IndexedWord w) { if (this.equals(IndexedWord.NO_WORD)) { if (w.equals(IndexedWord.NO_WORD)) { return 0; } else { return -1; } } if (w.equals(IndexedWord.NO_WORD)) { return 1; } // Override the default comparator if pseudo-positions are set. // This is needed for splicing trees together awkwardly in OpenIE. 
if (!Double.isNaN(w.pseudoPosition) || !Double.isNaN(this.pseudoPosition)) { double val = this.pseudoPosition() - w.pseudoPosition(); if (val < 0) { return -1; } if (val > 0) { return 1; } else { return 0; } } // Otherwise, compare using the normal doc/sentence/token index hierarchy String docID = this.getString(CoreAnnotations.DocIDAnnotation.class); int docComp = docID.compareTo(w.getString(CoreAnnotations.DocIDAnnotation.class)); if (docComp != 0) return docComp; int sentComp = sentIndex() - w.sentIndex(); if (sentComp != 0) return sentComp; int indexComp = index() - w.index(); if (indexComp != 0) return indexComp; return copyCount() - w.copyCount(); } /** * Returns the value-tag of this label. */ @Override public String toString() { return toString(CoreLabel.OutputFormat.VALUE_TAG); } public String toString(CoreLabel.OutputFormat format) { return label.toString(format) + toPrimes(); } /** * {@inheritDoc} */ @Override public void setFromString(String labelStr) { throw new UnsupportedOperationException("Cannot set from string"); } public static LabelFactory factory() { return new LabelFactory() { @Override public Label newLabel(String labelStr) { CoreLabel coreLabel = new CoreLabel(); coreLabel.setValue(labelStr); return new IndexedWord(coreLabel); } @Override public Label newLabel(String labelStr, int options) { return newLabel(labelStr); } @Override public Label newLabel(Label oldLabel) { return new IndexedWord(oldLabel); } @Override public Label newLabelFromString(String encodedLabelStr) { throw new UnsupportedOperationException("This code branch left blank" + " because we do not understand what this method should do."); } }; } /** * {@inheritDoc} */ @Override public LabelFactory labelFactory() { return IndexedWord.factory(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy