
edu.stanford.nlp.sequences.ObjectBankWrapper Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
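For orientation, here is a minimal sketch of driving these analyses through the standard StanfordCoreNLP pipeline API. The demo class name and input sentence are illustrative, and it assumes the matching CoreNLP models jar is on the classpath:

import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
  public static void main(String[] args) {
    // Standard annotator chain: tokenize, sentence-split, POS-tag, lemmatize, NER.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    CoreDocument doc = new CoreDocument("Stanford University was founded in 1885.");
    pipeline.annotate(doc);

    for (CoreLabel tok : doc.tokens()) {
      // word, part of speech, lemma, and named-entity tag per token
      System.out.printf("%s\t%s\t%s\t%s%n", tok.word(), tok.tag(), tok.lemma(), tok.ner());
    }
  }
}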

package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.Americanize;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;
import java.util.regex.Pattern;


/**
 * This class is used to wrap the ObjectBank used by the sequence
 * models and is where any sort of general processing, like the IOB mapping
 * stuff and wordshape stuff, should go.
 * It checks the SeqClassifierFlags to decide what to do.
 * <p>
 * TODO: We should rearchitect this so that the FeatureFactory-specific
 * stuff is done by a callback to the relevant FeatureFactory.
 *
 * @author Jenny Finkel
 */
public class ObjectBankWrapper<IN extends CoreMap> extends ObjectBank<List<IN>> {

  private static final long serialVersionUID = -3838331732026362075L;

  private final SeqClassifierFlags flags;
  private final ObjectBank<List<IN>> wrapped;
  private final Set<String> knownLCWords;


  public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<List<IN>> wrapped,
                           Set<String> knownLCWords) {
    super(null, null);
    this.flags = flags;
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
  }


  @Override
  public Iterator<List<IN>> iterator() {
    return new WrappedIterator(wrapped.iterator());
  }

  private class WrappedIterator extends AbstractIterator<List<IN>> {

    Iterator<List<IN>> wrappedIter;
    Iterator<List<IN>> spilloverIter;

    public WrappedIterator(Iterator<List<IN>> wrappedIter) {
      this.wrappedIter = wrappedIter;
    }

    @Override
    public boolean hasNext() {
      while ((spilloverIter == null || ! spilloverIter.hasNext()) &&
             wrappedIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return wrappedIter.hasNext() ||
             (spilloverIter != null && spilloverIter.hasNext());
    }

    @Override
    public List<IN> next() {
      // this while loop now is redundant because it should
      // have already been done in "hasNext".
      // I'm keeping it so that the diff is minimal.
      // -pichuan
      while (spilloverIter == null || ! spilloverIter.hasNext()) {
        List<IN> doc = wrappedIter.next();
        List<List<IN>> docs = new ArrayList<>();
        docs.add(doc);
        fixDocLengths(docs);
        spilloverIter = docs.iterator();
      }
      return processDocument(spilloverIter.next());
    }

  } // end class WrappedIterator


  public List<IN> processDocument(List<IN> doc) {
    if (flags.mergeTags) { mergeTags(doc); }
    if (flags.iobTags) { iobTags(doc); }
    doBasicStuff(doc);
    return doc;
  }

  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }

  private final Pattern monthDayPattern = Pattern.compile("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|January|February|March|April|May|June|July|August|September|October|November|December", Pattern.CASE_INSENSITIVE);

  private String fix(String word) {
    if (flags.normalizeTerms || flags.normalizeTimex) {
      // Same case for days/months: map to lowercase
      if (monthDayPattern.matcher(word).matches()) {
        return word.toLowerCase();
      }
    }
    if (flags.normalizeTerms) {
      return Americanize.americanize(word, false);
    }
    return word;
  }

  private void doBasicStuff(List<IN> doc) {
    int position = 0;
    for (IN fl : doc) {
      // position in document
      fl.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(position++));

      // word shape
      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && ( ! flags.useShapeStrings)) {
        // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
        // we should use that FeatureFactory's getWord function
        String word = fl.get(CoreAnnotations.TextAnnotation.class);
        if (flags.wordFunction != null) {
          word = flags.wordFunction.apply(word);
        }
        if ( ! word.isEmpty() && Character.isLowerCase(word.codePointAt(0))) {
          knownLCWords.add(word);
        }
        String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords));
        fl.set(CoreAnnotations.ShapeAnnotation.class, s);
      }

      // normalizing and interning
      // was the following; should presumably now be
      // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
      if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) {
        // for Chinese segmentation, "word" is no use and ignore goldAnswer for memory efficiency.
        fl.set(CoreAnnotations.CharAnnotation.class, intern(fix(fl.get(CoreAnnotations.CharAnnotation.class))));
      } else {
        fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
        // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
        if (fl.get(CoreAnnotations.GoldAnswerAnnotation.class) == null) {
          fl.set(CoreAnnotations.GoldAnswerAnnotation.class, fl.get(CoreAnnotations.AnswerAnnotation.class));
        }
      }
    }
  }

  /**
   * Take a {@link List} of documents (which are themselves {@link List}s
   * of something that extends {@link CoreMap}, CoreLabel by default),
   * and if any are longer than the length specified by flags.maxDocSize,
   * split them up. If maxDocSize is negative, nothing is changed.
   * In practice, documents need to be not too long or else the CRF
   * inference will fail due to numerical problems.
   * This method tries to be smart and split on sentence boundaries,
   * but this is hard-coded to English.
   *
   * @param docs The list of documents whose length might be adjusted.
   */
  private void fixDocLengths(List<List<IN>> docs) {
    final int maxDocSize = flags.maxDocSize;

    WordToSentenceProcessor<IN> wts = new WordToSentenceProcessor<>();
    List<List<IN>> newDocuments = new ArrayList<>();
    for (List<IN> document : docs) {
      if (maxDocSize <= 0 || document.size() <= maxDocSize) {
        if (flags.keepEmptySentences || ! document.isEmpty()) {
          newDocuments.add(document);
        }
        continue;
      }
      List<List<IN>> sentences = wts.process(document);
      List<IN> newDocument = new ArrayList<>();
      for (List<IN> sentence : sentences) {
        if (newDocument.size() + sentence.size() > maxDocSize) {
          if ( ! newDocument.isEmpty()) {
            newDocuments.add(newDocument);
          }
          newDocument = new ArrayList<>();
        }
        newDocument.addAll(sentence);
      }
      if (flags.keepEmptySentences || ! newDocument.isEmpty()) {
        newDocuments.add(newDocument);
      }
    }

    docs.clear();
    docs.addAll(newDocuments);
  }

  private void iobTags(List<IN> doc) {
    String lastTag = "";
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer != null && ! flags.backgroundSymbol.equals(answer)) {
        int index = answer.indexOf('-');
        String prefix;
        String label;
        if (index < 0) {
          prefix = "";
          label = answer;
        } else {
          prefix = answer.substring(0, index);
          label = answer.substring(index + 1);
        }
        if ( ! prefix.equals("B")) {
          if ( ! label.equals(lastTag)) {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "B-" + label);
          } else {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "I-" + label);
          }
        }
        lastTag = label;
      } else {
        lastTag = answer;
      }
    }
  }

  /** Change some form of IOB/IOE encoding via forms like "I-PERS" to
   *  IO encoding as just "PERS".
   *
   *  @param doc The document for which the AnswerAnnotation will be changed (in place)
   */
  private void mergeTags(List<IN> doc) {
    for (IN wi : doc) {
      String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
      if (answer == null) {
        continue;
      }
      if ( ! answer.equals(flags.backgroundSymbol)) {
        int index = answer.indexOf('-');
        if (index >= 0) {
          answer = answer.substring(index + 1);
        }
      }
      wi.set(CoreAnnotations.AnswerAnnotation.class, answer);
    }
  }


  // all the other crap from ObjectBank

  @Override
  public boolean add(List<IN> o) { return wrapped.add(o); }

  @Override
  public boolean addAll(Collection<? extends List<IN>> c) { return wrapped.addAll(c); }

  @Override
  public void clear() { wrapped.clear(); }

  @Override
  public void clearMemory() { wrapped.clearMemory(); }

  public boolean contains(List<IN> o) { return wrapped.contains(o); }

  @Override
  public boolean containsAll(Collection<?> c) { return wrapped.containsAll(c); }

  @Override
  public boolean isEmpty() { return wrapped.isEmpty(); }

  @Override
  public void keepInMemory(boolean keep) { wrapped.keepInMemory(keep); }

  public boolean remove(List<IN> o) { return wrapped.remove(o); }

  @Override
  public boolean removeAll(Collection<?> c) { return wrapped.removeAll(c); }

  @Override
  public boolean retainAll(Collection<?> c) { return wrapped.retainAll(c); }

  @Override
  public int size() { return wrapped.size(); }

  @Override
  public Object[] toArray() { return wrapped.toArray(); }

  public List<IN>[] toArray(List<IN>[] o) { return wrapped.toArray(o); }

} // end class ObjectBankWrapper
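As a rough illustration of how the wrapper behaves, here is a minimal sketch that feeds it one in-memory document. The demo class name is hypothetical, and the anonymous ObjectBank stand-in replaces the ObjectBank a DocumentReaderAndWriter would normally produce; in CoreNLP this wrapper is constructed internally (e.g., by the sequence classifiers) rather than by end users:

import java.util.*;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.sequences.ObjectBankWrapper;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class ObjectBankWrapperDemo {
  public static void main(String[] args) {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.iobTags = true;   // re-derive consistent B-/I- prefixes on AnswerAnnotation
    flags.maxDocSize = -1;  // negative: never split documents (per the fixDocLengths javadoc)

    // One tiny two-token "document": "John Smith", both tagged I-PER.
    CoreLabel t1 = new CoreLabel();
    t1.set(CoreAnnotations.TextAnnotation.class, "John");
    t1.set(CoreAnnotations.AnswerAnnotation.class, "I-PER");
    CoreLabel t2 = new CoreLabel();
    t2.set(CoreAnnotations.TextAnnotation.class, "Smith");
    t2.set(CoreAnnotations.AnswerAnnotation.class, "I-PER");
    final List<List<CoreLabel>> docs = Collections.singletonList(Arrays.asList(t1, t2));

    // In-memory stand-in for the ObjectBank a DocumentReaderAndWriter would produce,
    // using the same super(null, null) + iterator() override trick as the wrapper itself.
    ObjectBank<List<CoreLabel>> raw = new ObjectBank<List<CoreLabel>>(null, null) {
      @Override public Iterator<List<CoreLabel>> iterator() { return docs.iterator(); }
    };

    ObjectBankWrapper<CoreLabel> bank =
        new ObjectBankWrapper<>(flags, raw, new HashSet<String>());
    for (List<CoreLabel> doc : bank) {
      for (CoreLabel tok : doc) {
        // with iobTags on, the first entity token becomes "B-PER", the second stays "I-PER"
        System.out.println(tok.get(CoreAnnotations.TextAnnotation.class) + " " +
                           tok.get(CoreAnnotations.AnswerAnnotation.class));
      }
    }
  }
}

Setting flags.mergeTags instead would run the opposite normalization: mergeTags strips the prefixes, collapsing both tokens to plain "PER" (IO encoding).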
