All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.objectbank.ReaderIteratorFactory Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.objectbank;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.AbstractIterator;

import java.io.*;
import java.net.URL;
import java.util.*;

/**
 * A ReaderIteratorFactory provides a means of getting an Iterator
 * which returns java.util.Readers over a Collection of input
 * sources.  Currently supported input sources are: Files, Strings,
 * URLs and Readers.  A ReaderIteratorFactory may take a Collection
 * on construction and new sources may be added either individually
 * (via the add(Object) method) or as a Collection (via the
 * addAll(Collection method).  The implementation automatically
 * determines the type of input and produces a java.util.Reader
 * accordingly.  If you wish to add support for a new kind of input,
 * refer the the setNextObject() method of the nested class
 * ReaderIterator.
 * 

* The Readers returned by this class are not closed by the class when you * move to the next element (nor at any other time). So, if you want the * files closed, then the caller needs to close them. The caller can only * do this if they pass in Readers. Otherwise, this class should probably * close them but currently doesn't. *

* TODO: Have this class close the files that it opens. * * @author Jenny Finkel * @version 1.0 */ //TODO: does this always store the same kind of thing in a given instance, //or do you want to allow having some Files, some Strings, etc.? public class ReaderIteratorFactory implements Iterable { /** * Constructs a ReaderIteratorFactory from the input sources * contained in the Collection. The Collection should contain * Objects of type File, String, URL and/or Reader. See class * description for details. * * @param c Collection of input sources. */ public ReaderIteratorFactory(Collection c) { this(); this.c.addAll(c); } public ReaderIteratorFactory(Collection c, String encoding){ this(); this.enc = encoding; this.c.addAll(c); } /** * Convenience constructor to construct a ReaderIteratorFactory from a single * input source. The Object should be of type File, String, URL and Reader. See class * description for details. * * @param o an input source that can be converted into a Reader */ public ReaderIteratorFactory(Object o) { this(Collections.singleton(o)); } public ReaderIteratorFactory(Object o, String encoding) { this(Collections.singleton(o), encoding); } public ReaderIteratorFactory() { c = new ArrayList<>(); } /** * The underlying Collection of input sources. Currently supported * input sources are: Files, Strings, URLs and Readers. The * implementation automatically determines the type of input and * produces a java.util.Reader accordingly. */ protected Collection c; /** * The encoding for file input. This is defaulted to "utf-8" * only applies when c is of type File . */ protected String enc = "UTF-8"; /** * Returns an Iterator over the input sources in the underlying Collection. * * @return an Iterator over the input sources in the underlying Collection. */ @Override public Iterator iterator() { return new ReaderIterator(); } /** * Adds an Object to the underlying Collection of input sources. * * @param o Input source to be added to the underlying Collection. 
*/ public boolean add(Object o) { return this.c.add(o); } /** * Removes an Object from the underlying Collection of input sources. * * @param o Input source to be removed from the underlying Collection. */ public boolean remove(Object o) { return this.c.remove(o); } /** * Adds all Objects in Collection c to the underlying Collection of * input sources. * * @param c Collection of input sources to be added to the underlying Collection. */ public boolean addAll(Collection c) { return this.c.addAll(c); } /** * Removes all Objects in Collection c from the underlying Collection of * input sources. * * @param c Collection of input sources to be removed from the underlying Collection. */ public boolean removeAll(Collection c) { return this.c.removeAll(c); } /** * Removes all Objects from the underlying Collection of input sources * except those in Collection c * * @param c Collection of input sources to be retained in the underlying Collection. */ public boolean retainAll(Collection c) { return this.c.retainAll(c); } /** * Iterator which contains BufferedReaders. */ class ReaderIterator extends AbstractIterator { private Iterator iter; private Reader nextObject; /** * Sole constructor. */ public ReaderIterator() { iter = c.iterator(); setNextObject(); } /** * sets nextObject to a BufferedReader for the next input source, * or null of there is no next input source. 
*/ private void setNextObject() { if (!iter.hasNext()) { nextObject = null; iter = null; return; } Object o = iter.next(); try { if (o instanceof File) { File file = (File) o; if (file.isDirectory()) { ArrayList l = new ArrayList<>(); l.addAll(Arrays.asList(file.listFiles())); while (iter.hasNext()) { l.add(iter.next()); } iter = l.iterator(); file = (File) iter.next(); } nextObject = IOUtils.readerFromFile(file, enc); } else if (o instanceof String) { // File file = new File((String)o); // if (file.exists()) { // if (file.isDirectory()) { // ArrayList l = new ArrayList(); // l.addAll(Arrays.asList(file.listFiles())); // while (iter.hasNext()) { // l.add(iter.next()); // } // iter = l.iterator(); // file = (File) iter.next(); // } // if (((String)o).endsWith(".gz")) { // BufferedReader tmp = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), enc)); // nextObject = tmp; // } else { // nextObject = new BufferedReader(new EncodingFileReader(file, enc)); // } // } else { nextObject = new BufferedReader(new StringReader((String) o)); // } } else if (o instanceof URL) { // todo: add encoding specification to this as well? -akleeman nextObject = new BufferedReader(new InputStreamReader(((URL) o).openStream())); } else if (o instanceof Reader) { nextObject = new BufferedReader((Reader) o); } else { throw new RuntimeException("don't know how to get Reader from class " + o.getClass() + " of object " + o); } } catch (IOException e) { throw new RuntimeException(e); } } /** * @return true if there is another (valid) input source to read from */ @Override public boolean hasNext() { return nextObject != null; } /** * Returns nextObject and then sets nextObject to the next input source. * * @return BufferedReader for next input source. */ @Override public Reader next() { if (nextObject == null) { throw new NoSuchElementException(); } Reader tmp = nextObject; setNextObject(); return tmp; } } }