edu.stanford.nlp.objectbank.XMLBeginEndIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.objectbank;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.function.Function;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.XMLUtils;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
* A class which iterates over Strings occurring between the begin and end of
* a selected tag or tags. The element is specified by a regexp, matched
* against the name of the element (i.e., excluding the angle bracket
* characters) using {@code matches()}).
* The class ignores all other characters in the input Reader.
* There are a few different ways to modify the output of the
* XMLBeginEndIterator. One way is to ask it to keep internal tags;
* if {@code keepInternalTags} is set, then
* {@literal AB } will be printed as {@literal AB}.
*
* Another is to tell it to keep delimiting tags; in the above example,
* {@literal } will be kept as well.
*
* Finally, you can ask it to keep track of the nesting depth; the
* ordinary behavior of this iterator is to close all tags with just
* one close tag. This is incorrect XML behavior, but is kept in case
* any code relies on it. If {@code countDepth} is set, though,
* the iterator keeps track of how much it has nested.
*
* @author Teg Grenager ([email protected])
*/
public class XMLBeginEndIterator extends AbstractIterator {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(XMLBeginEndIterator.class);
private final Pattern tagNamePattern;
private final BufferedReader inputReader;
private final Function op;
private final boolean keepInternalTags;
private final boolean keepDelimitingTags;
private final boolean countDepth;
private E nextToken; // stores the read-ahead next token to return
@SuppressWarnings({"unchecked"}) // Can't seem to do IdentityFunction without warning!
public XMLBeginEndIterator(Reader in, String tagNameRegexp) {
this(in, tagNameRegexp, new IdentityFunction(), false);
}
@SuppressWarnings({"unchecked"})
public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags) {
this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags);
}
public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op, boolean keepInternalTags) {
this(in, tagNameRegexp, op, keepInternalTags, false);
}
@SuppressWarnings({"unchecked"})
public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags, boolean keepDelimitingTags) {
this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags, keepDelimitingTags);
}
@SuppressWarnings({"unchecked"})
public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags, boolean keepDelimitingTags, boolean countDepth) {
this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags, keepDelimitingTags, countDepth);
}
public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
this(in, tagNameRegexp, op, keepInternalTags, keepDelimitingTags, false);
}
public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op,
boolean keepInternalTags, boolean keepDelimitingTags, boolean countDepth) {
this.tagNamePattern = Pattern.compile(tagNameRegexp);
this.op = op;
this.keepInternalTags = keepInternalTags;
this.keepDelimitingTags = keepDelimitingTags;
this.countDepth = countDepth;
this.inputReader = new BufferedReader(in);
setNext();
}
private void setNext() {
String s = getNext();
nextToken = parseString(s);
}
// returns null if there is no next object
private String getNext() {
StringBuilder result = new StringBuilder();
try {
XMLUtils.XMLTag tag;
do {
// String text =
XMLUtils.readUntilTag(inputReader);
// there may or may not be text before the next tag, but we discard it
// System.out.println("outside text: " + text );
tag = XMLUtils.readAndParseTag(inputReader);
// System.out.println("outside tag: " + tag);
if (tag == null) {
return null; // couldn't find any more tags, so no more elements
}
} while (!tagNamePattern.matcher(tag.name).matches() ||
tag.isEndTag || tag.isSingleTag);
if (keepDelimitingTags) {
result.append(tag.toString());
}
int depth = 1;
while (true) {
String text = XMLUtils.readUntilTag(inputReader);
if (text != null) {
// if the text isn't null, we append it
// System.out.println("inside text: " + text );
result.append(text);
}
String tagString = XMLUtils.readTag(inputReader);
tag = XMLUtils.parseTag(tagString);
if (tag == null) {
return null; // unexpected end of this element, so no more elements
}
if (tagNamePattern.matcher(tag.name).matches() && tag.isEndTag) {
if ((countDepth && depth == 1) || !countDepth) {
if (keepDelimitingTags) {
result.append(tagString);
}
// this is our end tag so we stop
break;
} else {
--depth;
if (keepInternalTags) {
result.append(tagString);
}
}
} else if (tagNamePattern.matcher(tag.name).matches() && !tag.isEndTag &&
!tag.isSingleTag && countDepth) {
++depth;
if (keepInternalTags) {
result.append(tagString);
}
} else {
// not our end tag, so we optionally append it and keep going
if (keepInternalTags) {
result.append(tagString);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
return result.toString();
}
protected E parseString(String s) {
return op.apply(s);
}
@Override
public boolean hasNext() {
return nextToken != null;
}
@Override
public E next() {
if (nextToken == null) {
throw new NoSuchElementException();
}
E token = nextToken;
setNext();
return token;
}
/* ---
// Omit methods that made this class a Tokenizer.
// Just have it an Iterator as the name suggests.
// That's all that was used, and this simplifies
// inter-package dependencies.
public E peek() {
return nextToken;
}
* Returns pieces of text in element as a List of tokens.
*
* @return A list of all tokens remaining in the underlying Reader
*
public List tokenize() {
// System.out.println("tokenize called");
List result = new ArrayList();
while (hasNext()) {
result.add(next());
}
return result;
}
--- */
/**
* Returns a factory that vends BeginEndIterators that reads the contents of
* the given Reader, extracts text between the specified Strings, then
* returns the result.
*
* @param tag The tag the XMLBeginEndIterator will match on
* @return The IteratorFromReaderFactory
*/
public static IteratorFromReaderFactory getFactory(String tag) {
return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, new IdentityFunction<>(), false, false);
}
public static IteratorFromReaderFactory getFactory(String tag, boolean keepInternalTags, boolean keepDelimitingTags) {
return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, new IdentityFunction<>(), keepInternalTags, keepDelimitingTags);
}
public static IteratorFromReaderFactory getFactory(String tag, Function op) {
return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, op, false, false);
}
public static IteratorFromReaderFactory getFactory(String tag, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, op, keepInternalTags, keepDelimitingTags);
}
static class XMLBeginEndIteratorFactory implements IteratorFromReaderFactory {
private final String tag;
private final Function op;
private final boolean keepInternalTags;
private final boolean keepDelimitingTags;
public XMLBeginEndIteratorFactory(String tag, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
this.tag = tag;
this.op = op;
this.keepInternalTags = keepInternalTags;
this.keepDelimitingTags = keepDelimitingTags;
}
@Override
public Iterator getIterator(Reader r) {
return new XMLBeginEndIterator<>(r, tag, op, keepInternalTags, keepDelimitingTags);
}
}
public static void main(String[] args) throws IOException {
if (args.length < 3) {
log.info("usage: XMLBeginEndIterator file element keepInternalBoolean");
return;
}
Reader in = new FileReader(args[0]);
Iterator iter = new XMLBeginEndIterator<>(in, args[1], args[2].equalsIgnoreCase("true"));
while (iter.hasNext()) {
String s = iter.next();
System.out.println("*************************************************");
System.out.println(s);
}
in.close();
}
} // end class XMLBeginEndIterator