All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.util.StringParsingTask Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.util;


/**
 * An instantiation of this abstract class parses a {@code String} and
 * returns an object of type {@code E}.  It's called a
 * {@code StringParsingTask} (rather than {@code StringParser})
 * because a new instance is constructed for each {@code String} to be
 * parsed.  We do this to be thread-safe: methods in
 * {@code StringParsingTask} share state information (e.g. current
 * string index) via instance variables.
 *
 * <p>Cursor model: {@link #read()} always advances {@link #index} by one,
 * even when it runs off the end of the string, and {@link #unread()} always
 * moves it back by one.  A {@code read()}/{@code unread()} pair (and hence
 * {@link #peek()}) therefore leaves the cursor exactly where it was, at end
 * of input as well as in the middle of the string.
 *
 * @param <E> the type of object produced by {@link #parse()}
 * @author Bill MacCartney
 */
public abstract class StringParsingTask<E> {

  // Parse state.  Each instance is bound to the one string it was
  // constructed from, and the reading methods below communicate through
  // these fields.

  /** The string being parsed. */
  protected String s;

  /** Position of the next character that {@link #read()} will return. */
  protected int index = 0;

  /**
   * True once a read has been attempted past the end of the string.  Nothing
   * in this class resets it.
   */
  protected boolean isEOF = false;

  /**
   * Constructs a new {@code StringParsingTask} from the specified
   * {@code String}.  Derived class constructors should be sure to
   * call {@code super(s)}!
   *
   * @param s the string to parse
   */
  public StringParsingTask(String s) {
    this.s = s;
    index = 0;
  }

  /**
   * Parses the {@code String} associated with this
   * {@code StringParsingTask} and returns an object of type
   * {@code E}.
   *
   * @return the result of parsing the string
   */
  public abstract E parse();


  // ---------------------------------------------------------------------

  /**
   * Skips leading whitespace, then reads characters until
   * {@link #isWhiteSpace(char) isWhiteSpace(ch)} or
   * {@link #isPunct(char) isPunct(ch)} or {@link #isEOF()}.  You may need
   * to override the definition of {@link #isPunct(char) isPunct(ch)} to
   * get this to work right.  The terminating character is pushed back so
   * that the next read sees it.
   *
   * @return the interned name; empty if no name character was available
   */
  protected String readName() {
    readWhiteSpace();
    StringBuilder sb = new StringBuilder();
    char ch = read();
    while (!isWhiteSpace(ch) && !isPunct(ch) && !isEOF) {
      sb.append(ch);
      ch = read();
    }
    unread();                         // give back the terminator
    return sb.toString().intern();
  }

  /**
   * Skips leading whitespace, then reads the longest run of characters that
   * forms a Java identifier according to
   * {@link Character#isJavaIdentifierStart(char)} and
   * {@link Character#isJavaIdentifierPart(char)}.  The first character that
   * does not belong to the identifier is pushed back.
   *
   * @return the interned identifier; empty if the next character cannot
   *         start an identifier
   */
  protected String readJavaIdentifier() {
    readWhiteSpace();
    StringBuilder sb = new StringBuilder();
    char ch = read();
    if (Character.isJavaIdentifierStart(ch) && !isEOF) {
      sb.append(ch);
      ch = read();
      while (Character.isJavaIdentifierPart(ch) && !isEOF) {
        sb.append(ch);
        ch = read();
      }
    }
    unread();                         // give back the terminator
    return sb.toString().intern();
  }

  // .....................................................................

  /**
   * Skips whitespace and consumes one character, which must satisfy
   * {@link #isLeftParen(char)}.
   *
   * @throws ParserException if the next non-whitespace character is not a
   *         left paren (this includes end of input)
   */
  protected void readLeftParen() {
    readWhiteSpace();
    char ch = read();
    if (!isLeftParen(ch)) {
      throw new ParserException("Expected left paren!");
    }
  }

  /**
   * Skips whitespace and consumes one character, which must satisfy
   * {@link #isRightParen(char)}.
   *
   * @throws ParserException if the next non-whitespace character is not a
   *         right paren (this includes end of input)
   */
  protected void readRightParen() {
    readWhiteSpace();
    char ch = read();
    if (!isRightParen(ch)) {
      throw new ParserException("Expected right paren!");
    }
  }

  /**
   * Skips whitespace and then consumes a dot if one is next.  A missing dot
   * is not an error: the cursor is simply left after the whitespace.
   */
  protected void readDot() {
    readWhiteSpace();
    if (isDot(peek())) {
      read();
    }
  }

  /**
   * Advances the cursor past any run of whitespace, stopping at the first
   * non-whitespace character or at end of input.
   */
  protected void readWhiteSpace() {
    char ch = read();
    // The EOF check is required because read() reports end of input as ' ',
    // which is itself whitespace.
    while (isWhiteSpace(ch) && !isEOF()) {
      ch = read();
    }
    unread();
  }

  // .....................................................................

  /**
   * Returns the character at the cursor and advances the cursor by one.
   * When the cursor lies outside the string, sets the EOF flag and returns
   * a space as an arbitrary filler; callers must consult {@link #isEOF()} to
   * tell that filler apart from a real space.
   *
   * <p>The cursor is advanced in the out-of-range case too.  Otherwise the
   * {@link #unread()} that callers issue after reading a terminator would
   * step back over a real character whenever the terminator was end of
   * input.
   *
   * @return the character read, or {@code ' '} at end of input
   */
  protected char read() {
    if (index >= s.length() || index < 0) {
      isEOF = true;
      index++;                        // keep read()/unread() symmetric
      return ' ';                     // arbitrary
    }
    return s.charAt(index++);
  }

  /**
   * Moves the cursor back by one position, undoing the most recent
   * {@link #read()}.  The EOF flag is not cleared.
   */
  protected void unread() {
    index--;
  }

  /**
   * Returns the character that the next {@link #read()} would return without
   * moving the cursor.  Peeking at end of input sets the EOF flag.
   *
   * @return the next character, or {@code ' '} at end of input
   */
  protected char peek() {
    char ch = read();
    unread();
    return ch;
  }


  // -----------------------------------------------------------------------

  /**
   * Reports whether a read has been attempted past the end of the string.
   *
   * @return the current value of the EOF flag
   */
  protected boolean isEOF() {
    return isEOF;
  }

  /**
   * Tests whether {@code ch} is a space, tab, form feed, carriage return or
   * newline.
   */
  protected boolean isWhiteSpace(char ch) {
    return (ch == ' ' || ch == '\t' || ch == '\f' || ch == '\r' || ch == '\n');
  }

  /**
   * Tests whether {@code ch} terminates a name.  By default only the two
   * parenthesis characters do; subclasses may override to add more.
   */
  protected boolean isPunct(char ch) {
    return isLeftParen(ch) || isRightParen(ch);
  }

  /** Tests whether {@code ch} is {@code '('}. */
  protected boolean isLeftParen(char ch) {
    return ch == '(';
  }

  /** Tests whether {@code ch} is {@code ')'}. */
  protected boolean isRightParen(char ch) {
    return ch == ')';
  }

  /** Tests whether {@code ch} is {@code '.'}. */
  protected boolean isDot(char ch) {
    return ch == '.';
  }


  // exception class -------------------------------------------------------

  /**
   * Unchecked exception thrown when the input does not match what the
   * parser expected.
   */
  public static class ParserException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /** Wraps an underlying exception as the cause. */
    public ParserException(Exception e)    { super(e); }

    /** Creates an exception carrying the given message. */
    public ParserException(String message) { super(message); }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy