All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.objectbank.XMLBeginEndIterator Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.objectbank; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.function.Function;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.XMLUtils;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * A class which iterates over Strings occurring between the begin and end of
 * a selected tag or tags. The element is specified by a regexp, matched
 * against the name of the element (i.e., excluding the angle bracket
 * characters) using {@code matches()}).
 * The class ignores all other characters in the input Reader.
 * There are a few different ways to modify the output of the
 * XMLBeginEndIterator.  One way is to ask it to keep internal tags;
 * if {@code keepInternalTags} is set, then
 * {@literal AB} will be printed as {@literal AB}.
 *
 * Another is to tell it to keep delimiting tags; in the above example,
 * {@literal } will be kept as well.
 *
 * Finally, you can ask it to keep track of the nesting depth; the
 * ordinary behavior of this iterator is to close all tags with just
 * one close tag.  This is incorrect XML behavior, but is kept in case
 * any code relies on it.  If {@code countDepth} is set, though,
 * the iterator keeps track of how much it has nested.
 *
 * @author Teg Grenager ([email protected])
 */
public class XMLBeginEndIterator extends AbstractIterator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(XMLBeginEndIterator.class);

  private final Pattern tagNamePattern;
  private final BufferedReader inputReader;
  private final Function op;
  private final boolean keepInternalTags;
  private final boolean keepDelimitingTags;
  private final boolean countDepth;
  private E nextToken; // stores the read-ahead next token to return

  @SuppressWarnings({"unchecked"}) // Can't seem to do IdentityFunction without warning!
  public XMLBeginEndIterator(Reader in, String tagNameRegexp) {
    this(in, tagNameRegexp, new IdentityFunction(), false);
  }

  @SuppressWarnings({"unchecked"})
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags) {
    this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags);
  }

  public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op, boolean keepInternalTags) {
    this(in, tagNameRegexp, op, keepInternalTags, false);
  }

  @SuppressWarnings({"unchecked"})
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags, boolean keepDelimitingTags) {
    this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags, keepDelimitingTags);
  }

  @SuppressWarnings({"unchecked"})
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags, boolean keepDelimitingTags, boolean countDepth) {
    this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags, keepDelimitingTags, countDepth);
  }

  public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
    this(in, tagNameRegexp, op, keepInternalTags, keepDelimitingTags, false);
  }

  public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function op,
                             boolean keepInternalTags, boolean keepDelimitingTags, boolean countDepth) {
    this.tagNamePattern = Pattern.compile(tagNameRegexp);
    this.op = op;
    this.keepInternalTags = keepInternalTags;
    this.keepDelimitingTags = keepDelimitingTags;
    this.countDepth = countDepth;
    this.inputReader = new BufferedReader(in);
    setNext();
  }

  private void setNext() {
    String s = getNext();
    nextToken = parseString(s);
  }

  // returns null if there is no next object
  private String getNext() {
    StringBuilder result = new StringBuilder();
    try {
      XMLUtils.XMLTag tag;
      do {
        // String text =
        XMLUtils.readUntilTag(inputReader);
        // there may or may not be text before the next tag, but we discard it
        //        System.out.println("outside text: " + text );
        tag = XMLUtils.readAndParseTag(inputReader);
        //        System.out.println("outside tag: " + tag);
        if (tag == null) {
          return null; // couldn't find any more tags, so no more elements
        }
      } while (!tagNamePattern.matcher(tag.name).matches() ||
               tag.isEndTag || tag.isSingleTag);
      if (keepDelimitingTags) {
        result.append(tag.toString());
      }
      int depth = 1;
      while (true) {
        String text = XMLUtils.readUntilTag(inputReader);
        if (text != null) {
          // if the text isn't null, we append it
          //        System.out.println("inside text: " + text );
          result.append(text);
        }
        String tagString = XMLUtils.readTag(inputReader);
        tag = XMLUtils.parseTag(tagString);
        if (tag == null) {
          return null; // unexpected end of this element, so no more elements
        }
        if (tagNamePattern.matcher(tag.name).matches() && tag.isEndTag) {
          if ((countDepth && depth == 1) || !countDepth) {
            if (keepDelimitingTags) {
              result.append(tagString);
            }
            // this is our end tag so we stop
            break;
          } else {
            --depth;
            if (keepInternalTags) {
              result.append(tagString);
            }
          }
        } else if (tagNamePattern.matcher(tag.name).matches() && !tag.isEndTag &&
                   !tag.isSingleTag && countDepth) {
          ++depth;
          if (keepInternalTags) {
            result.append(tagString);
          }
        } else {
          // not our end tag, so we optionally append it and keep going
          if (keepInternalTags) {
            result.append(tagString);
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return result.toString();
  }

  protected E parseString(String s) {
    return op.apply(s);
  }

  @Override
  public boolean hasNext() {
    return nextToken != null;
  }

  @Override
  public E next() {
    if (nextToken == null) {
      throw new NoSuchElementException();
    }
    E token = nextToken;
    setNext();
    return token;
  }

  /* ---

  // Omit methods that made this class a Tokenizer.
  // Just have it an Iterator as the name suggests.
  // That's all that was used, and this simplifies
  // inter-package dependencies.

  public E peek() {
    return nextToken;
  }

   * Returns pieces of text in element as a List of tokens.
   *
   * @return A list of all tokens remaining in the underlying Reader
   *
  public List tokenize() {
    // System.out.println("tokenize called");
    List result = new ArrayList();
    while (hasNext()) {
      result.add(next());
    }
    return result;
  }

  --- */

  /**
   * Returns a factory that vends BeginEndIterators that reads the contents of
   * the given Reader, extracts text between the specified Strings, then
   * returns the result.
   *
   * @param tag The tag the XMLBeginEndIterator will match on
   * @return The IteratorFromReaderFactory
   */
  public static IteratorFromReaderFactory getFactory(String tag) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, new IdentityFunction<>(), false, false);
  }

  public static IteratorFromReaderFactory getFactory(String tag, boolean keepInternalTags, boolean keepDelimitingTags) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, new IdentityFunction<>(), keepInternalTags, keepDelimitingTags);
  }

  public static  IteratorFromReaderFactory getFactory(String tag, Function op) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, op, false, false);
  }

  public static  IteratorFromReaderFactory getFactory(String tag, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<>(tag, op, keepInternalTags, keepDelimitingTags);
  }

  static class XMLBeginEndIteratorFactory implements IteratorFromReaderFactory {

    private final String tag;
    private final Function op;
    private final boolean keepInternalTags;
    private final boolean keepDelimitingTags;

    public XMLBeginEndIteratorFactory(String tag, Function op, boolean keepInternalTags, boolean keepDelimitingTags) {
      this.tag = tag;
      this.op = op;
      this.keepInternalTags = keepInternalTags;
      this.keepDelimitingTags = keepDelimitingTags;
    }

    @Override
    public Iterator getIterator(Reader r) {
      return new XMLBeginEndIterator<>(r, tag, op, keepInternalTags, keepDelimitingTags);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length < 3) {
      log.info("usage: XMLBeginEndIterator file element keepInternalBoolean");
      return;
    }
    Reader in = new FileReader(args[0]);
    Iterator iter = new XMLBeginEndIterator<>(in, args[1], args[2].equalsIgnoreCase("true"));
    while (iter.hasNext()) {
      String s = iter.next();
      System.out.println("*************************************************");
      System.out.println(s);
    }
    in.close();
  }

} // end class XMLBeginEndIterator





© 2015 - 2024 Weber Informatics LLC | Privacy Policy