All downloads are free. The search and download functionalities use the official Maven repository.

edu.stanford.nlp.process.DocumentPreprocessor Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.process;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;

/**
 * Produces a list of sentences from either a plain text or XML document.
 * 

* Tokenization: The default tokenizer is {@link PTBTokenizer}. If null is passed * to setTokenizerFactory, then whitespace tokenization is assumed. *

* Adding a new document type requires two steps: *

    *
  1. Add a new DocType. *
  2. Create an iterator for the new DocType and modify the iterator() * function to return the new iterator. *
*

* NOTE: This implementation should not use external libraries since it * is used in the parser. * * @author Spence Green */ public class DocumentPreprocessor implements Iterable> { public static enum DocType {Plain, XML} public static final String[] DEFAULT_SENTENCE_DELIMS = {".", "?", "!"}; // inputReader is used in a fairly yucky way at the moment to communicate // from a XMLIterator across to a PlainTextIterator. Maybe redo by making // the inner classes static and explicitly passing things around. private Reader inputReader; private final DocType docType; //Configurable options private TokenizerFactory tokenizerFactory = PTBTokenizer.coreLabelFactory(); private String[] sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS; private Function,List> escaper = null; private String sentenceDelimiter = null; /** * Example: if the words are already POS tagged and look like * foo_VB, you want to set the tagDelimiter to "_" */ private String tagDelimiter = null; /** * When doing XML parsing, only accept text in between tags that * match this regular expression. Defaults to everything. */ private String elementDelimiter = ".*"; private static final Pattern wsPattern = Pattern.compile("\\s+"); //From PTB conventions private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"}; private boolean keepEmptySentences = false; /** * Constructs a preprocessor from an existing input stream. 
* * @param input An existing reader */ public DocumentPreprocessor(Reader input) { this(input,DocType.Plain); } public DocumentPreprocessor(Reader input, DocType t) { if (input == null) { throw new IllegalArgumentException("Cannot read from null object!"); } docType = t; inputReader = input; } public DocumentPreprocessor(String docPath) { this(docPath, DocType.Plain, "UTF-8"); } public DocumentPreprocessor(String docPath, DocType t) { this(docPath, t, "UTF-8"); } /** * Constructs a preprocessor from a file at a path, which can be either * a filesystem location, a classpath entry, or a URL. * * @param docPath The path * @param encoding The character encoding used by Readers */ public DocumentPreprocessor(String docPath, DocType t, String encoding) { if (docPath == null) { throw new IllegalArgumentException("Cannot open null document path!"); } docType = t; try { inputReader = IOUtils.readerFromString(docPath, encoding); } catch (IOException ioe) { System.err.printf("%s: Could not open path %s\n", this.getClass().getName(), docPath); throw new RuntimeIOException(ioe); } } /** * Set whether or not the tokenizer keeps empty sentences in * whitespace mode. Useful for programs that want to echo blank * lines. Not relevant for the non-whitespace model. */ public void setKeepEmptySentences(boolean keepEmptySentences) { this.keepEmptySentences = keepEmptySentences; } /** * Sets the end-of-sentence delimiters. *

* For newline tokenization, use the argument {"\n"}. * * @param sentenceFinalPuncWords */ public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) { this.sentenceFinalPuncWords = sentenceFinalPuncWords; } /** * Sets the factory from which to produce a {@link Tokenizer}. The default is * {@link PTBTokenizer}. *

* NOTE: If a null argument is used, then the document is assumed to be tokenized * and DocumentPreprocessor performs no tokenization. * */ public void setTokenizerFactory(TokenizerFactory newTokenizerFactory) { tokenizerFactory = newTokenizerFactory; } /** * Set an escaper. * * @param e The escaper */ public void setEscaper(Function,List> e) { escaper = e; } /** * Make the processor assume that the document is already delimited * by the supplied parameter. * * @param s The sentence delimiter */ public void setSentenceDelimiter(String s) { sentenceDelimiter = s; } /** * Split tags from tokens. The tag will be placed in the TagAnnotation of * the returned label. *

* Note that for strings that contain two or more instances of the tag delimiter, * the last instance is treated as the split point. *

* The tag delimiter should not contain any characters that must be escaped in a Java * regex. * * @param s POS tag delimiter */ public void setTagDelimiter(String s) { tagDelimiter = s; } /** * Only read text from inside these XML elements if in XML mode. * Note: This class implements an approximation to XML via regex. * * Otherwise, text will read from all tokens. */ public void setElementDelimiter(String s) { elementDelimiter = s; } /** * Returns sentences until the document is exhausted. Calls close() if the end of the document * is reached. Otherwise, the user is required to close the stream. */ @Override public Iterator> iterator() { // Add new document types here if (docType == DocType.Plain) { return new PlainTextIterator(); } else if (docType == DocType.XML) { return new XMLIterator(); } else { throw new IllegalStateException("Someone didn't add a handler for a new docType."); } } private class PlainTextIterator implements Iterator> { private final Tokenizer tokenizer; private final Set sentDelims; private final Set delimFollowers; private Function splitTag; private List nextSent = null; private final List nextSentCarryover = new ArrayList(); public PlainTextIterator() { // Establish how to find sentence boundaries boolean eolIsSignificant = false; sentDelims = Generics.newHashSet(); if (sentenceDelimiter == null) { if (sentenceFinalPuncWords != null) { sentDelims.addAll(Arrays.asList(sentenceFinalPuncWords)); } delimFollowers = Generics.newHashSet(Arrays.asList(sentenceFinalFollowers)); } else { sentDelims.add(sentenceDelimiter); delimFollowers = Generics.newHashSet(); eolIsSignificant = wsPattern.matcher(sentenceDelimiter).matches(); if(eolIsSignificant) { // For Stanford English Tokenizer sentDelims.add(PTBLexer.NEWLINE_TOKEN); } } // Setup the tokenizer if (tokenizerFactory == null) { eolIsSignificant = sentDelims.contains(WhitespaceLexer.NEWLINE); tokenizer = WhitespaceTokenizer. 
newWordWhitespaceTokenizer(inputReader, eolIsSignificant); } else { if (eolIsSignificant) { tokenizer = tokenizerFactory.getTokenizer(inputReader, "tokenizeNLs"); } else { tokenizer = tokenizerFactory.getTokenizer(inputReader); } } // If tokens are tagged, then we must split them // Note that if the token contains two or more instances of the delimiter, then the last // instance is regarded as the split point. if (tagDelimiter != null) { splitTag = new Function() { private final String splitRegex = String.format("%s(?!.*%s)", tagDelimiter, tagDelimiter); public String[] apply(String in) { final String[] splits = in.trim().split(splitRegex); if(splits.length == 2) return splits; else { String[] oldStr = {in}; return oldStr; } } }; } } private void primeNext() { nextSent = new ArrayList(nextSentCarryover); nextSentCarryover.clear(); boolean seenBoundary = false; if (!tokenizer.hasNext()) { IOUtils.closeIgnoringExceptions(inputReader); inputReader = null; nextSent = null; return; } do { HasWord token = tokenizer.next(); if (splitTag != null) { String[] toks = splitTag.apply(token.word()); token.setWord(toks[0]); if (token instanceof Label) { ((Label) token).setValue(toks[0]); } if(toks.length == 2 && token instanceof HasTag) { //wsg2011: Some of the underlying tokenizers return old //JavaNLP labels. We could convert to CoreLabel here, but //we choose a conservative implementation.... ((HasTag) token).setTag(toks[1]); } } if (sentDelims.contains(token.word())) { seenBoundary = true; } else if (seenBoundary && !delimFollowers.contains(token.word())) { nextSentCarryover.add(token); break; } if ( ! (wsPattern.matcher(token.word()).matches() || token.word().equals(PTBLexer.NEWLINE_TOKEN))) { nextSent.add(token); } // If there are no words that can follow a sentence delimiter, // then there are two cases. 
In one case is we already have a // sentence, in which case there is no reason to look at the // next token, since that just causes buffering without any // chance of the current sentence being extended, since // delimFollowers = {}. In the other case, we have an empty // sentence, which at this point means the sentence delimiter // was a whitespace token such as \n. We might as well keep // going as if we had never seen anything. if (seenBoundary && delimFollowers.size() == 0) { if (nextSent.size() > 0 || keepEmptySentences) { break; } else { seenBoundary = false; } } } while (tokenizer.hasNext()); if (nextSent.size() == 0 && nextSentCarryover.size() == 0 && !keepEmptySentences) { IOUtils.closeIgnoringExceptions(inputReader); inputReader = null; nextSent = null; } else if (escaper != null) { nextSent = escaper.apply(nextSent); } } public boolean hasNext() { if (nextSent == null) { primeNext(); } return nextSent != null; } public List next() { if (nextSent == null) { primeNext(); } if (nextSent == null) { throw new NoSuchElementException(); } List thisIteration = nextSent; nextSent = null; return thisIteration; } public void remove() { throw new UnsupportedOperationException(); } } private class XMLIterator implements Iterator> { private final XMLBeginEndIterator xmlItr; private final Reader originalDocReader; private PlainTextIterator plainItr; // = null; private List nextSent; // = null; public XMLIterator() { xmlItr = new XMLBeginEndIterator(inputReader, elementDelimiter); originalDocReader = inputReader; primeNext(); } private void primeNext() { // It is necessary to loop because if a document has a pattern // that goes: the xmlItr will return an empty // string, which the plainItr will process to null. If we // didn't loop to find the next tag, the iterator would stop. 
do { if (plainItr != null && plainItr.hasNext()) { nextSent = plainItr.next(); } else if (xmlItr.hasNext()) { String block = xmlItr.next(); inputReader = new BufferedReader(new StringReader(block)); plainItr = new PlainTextIterator(); if (plainItr.hasNext()) { nextSent = plainItr.next(); } else { nextSent = null; } } else { IOUtils.closeIgnoringExceptions(originalDocReader); nextSent = null; break; } } while (nextSent == null); } public boolean hasNext() { return nextSent != null; } public List next() { if (nextSent == null) { throw new NoSuchElementException(); } List thisSentence = nextSent; primeNext(); return thisSentence; } public void remove() { throw new UnsupportedOperationException(); } } // end class XMLIterator /** * This provides a simple test method for DocumentPreprocessor.
* Usage: * java * DocumentPreprocessor filename [-xml tag] [-suppressEscaping] [-noTokenization] *

* A filename is required. The code doesn't run as a filter currently. *

* tag is the element name of the XML from which to extract text. It can * be a regular expression which is called on the element with the * matches() method, such as 'TITLE|P'. * * @param args Command-line arguments */ public static void main(String[] args) throws IOException { if (args.length < 1) { System.err.println("usage: DocumentPreprocessor OPT* filename"); System.err.println(" OPT = -xml TAG|-encoding ENC|-tokenizerOptions opts|-tag delim|..."); return; } String encoding = "utf-8"; boolean printSentenceLengths = false; DocType docType = DocType.Plain; String xmlElementDelimiter = null; TokenizerFactory tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); String sentenceDelimiter = null; String tagDelimiter = null; boolean printOriginalText = false; String[] sentenceDelims = null; int i = 0; for ( ; i < args.length; i++) { if (args[i].isEmpty() || ! args[i].startsWith("-")) { break; } if (args[i].equals("-xml")) { docType = DocType.XML; i++; xmlElementDelimiter = args[i]; } else if (args[i].equals("-encoding") && i+1 < args.length) { i++; encoding = args[i]; } else if (args[i].equals("-printSentenceLengths")) { printSentenceLengths = true; } else if (args[i].equals("-suppressEscaping")) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false"); } else if (args[i].equals("-tokenizerOptions") && i+1 < args.length) { i++; tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), args[i]); } else if (args[i].equals("-noTokenization")) { tf = null; sentenceDelimiter = System.getProperty("line.separator"); } else if (args[i].equals("-whitespaceTokenization")) { tf = null; List whitespaceDelims = new ArrayList(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS)); whitespaceDelims.add(WhitespaceLexer.NEWLINE); sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]); } else if (args[i].equals("-tag")) { i++; tagDelimiter = args[i]; } else if (args[i].equals("-printOriginalText")) { printOriginalText = true; tf = 
PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); } else { System.err.println("Unknown option: " + args[i]); } } int numSents = 0; PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true); for ( ; i < args.length; i++) { DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[i], docType, encoding); if (docType == DocType.XML) { docPreprocessor.setElementDelimiter(xmlElementDelimiter); } docPreprocessor.setTokenizerFactory(tf); if (sentenceDelimiter != null) { docPreprocessor.setSentenceDelimiter(sentenceDelimiter); } if (tagDelimiter != null) { docPreprocessor.setTagDelimiter(args[++i]); } if (sentenceDelims != null) { docPreprocessor.setSentenceFinalPuncWords(sentenceDelims); } for (List sentence : docPreprocessor) { numSents++; if (printSentenceLengths) { System.err.println("Length:\t" + sentence.size()); } boolean printSpace = false; for (HasWord word : sentence) { if (printOriginalText) { CoreLabel cl = (CoreLabel) word; if ( ! printSpace) { pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class)); printSpace = true; } pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class)); pw.print(cl.get(CoreAnnotations.AfterAnnotation.class)); } else { if (printSpace) pw.print(" "); printSpace = true; pw.print(word.word()); } } pw.println(); } } pw.close(); System.err.println("Read in " + numSents + " sentences."); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy