edu.stanford.nlp.process.PTBEscapingProcessor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.process;


import java.util.function.Function;


import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;

import java.io.File;
import java.net.URL;
import java.util.*;


/**
 * Produces a new Document of Words in which special characters of the PTB
 * have been properly escaped.
 *
 * @author Teg Grenager ([email protected])
 * @author Sarah Spikes ([email protected]) (Templatization)
 *
 * @param  The type of the labels
 * @param  The type of the features
 */
public class PTBEscapingProcessor extends AbstractListProcessor
  implements Function, List> {

  private static final char[] EMPTY_CHAR_ARRAY = new char[0];

  private static final char[] SUBST_CHARS = {'(', ')', '[', ']', '{', '}'};
  private static final String[] REPLACE_SUBSTS = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};

  private final char[] substChars;
  private final String[] replaceSubsts;

  // starting about 2013, we no longer escape  * and /. We de-escape them when reading Treebank3
  private final char[] escapeChars; // was  {'/', '*'};
  private final String[] replaceEscapes; // was = {"\\/", "\\*"};

  private final boolean fixQuotes;


  public PTBEscapingProcessor() {
    this(true);
  }

  public PTBEscapingProcessor(boolean fixQuotes) {
    this(EMPTY_CHAR_ARRAY, StringUtils.EMPTY_STRING_ARRAY, SUBST_CHARS, REPLACE_SUBSTS, fixQuotes);
  }

  public PTBEscapingProcessor(char[] escapeChars, String[] replaceEscapes, char[] substChars, String[] replaceSubsts, boolean fixQuotes) {
    this.escapeChars = escapeChars;
    this.replaceEscapes = replaceEscapes;
    this.substChars = substChars;
    this.replaceSubsts = replaceSubsts;
    this.fixQuotes = fixQuotes;
  }


  /*
  public Document processDocument(Document input) {
    Document result = input.blankDocument();
    result.addAll(process((List)input));
    return result;
  }
  */


  /** Escape a List of HasWords.  Implements the
   *  Function<List<HasWord>, List<HasWord>> interface.
   */
  @Override
  public List apply(List hasWordsList) {
    return process(hasWordsList);
  }

  public static String unprocess(String s) {
    for (int i = 0; i < REPLACE_SUBSTS.length; i++) {
      s = s.replaceAll(REPLACE_SUBSTS[i], String.valueOf(SUBST_CHARS[i]));
    }
    // at present doesn't deal with * / stuff ... never did
    return s;
  }

  /**
   * @param input must be a List of objects of type HasWord
   */
  @Override
  public List process(List input) {
    List output = new ArrayList<>();
    for (IN h : input) {
      String s = h.word();
      h.setWord(escapeString(s));
      output.add(h);
    }
    if (fixQuotes) {
      return fixQuotes(output);
    }
    return output;
  }


  private static List fixQuotes(List input) {
    int inputSize = input.size();
    LinkedList result = new LinkedList<>();
    if (inputSize == 0) {
      return result;
    }
    boolean begin;
    // see if there is a quote at the end
    if (input.get(inputSize - 1).word().equals("\"")) {
      // alternate from the end
      begin = false;
      for (int i = inputSize - 1; i >= 0; i--) {
        HasWord hw = input.get(i);
        String tok = hw.word();
        if (tok.equals("\"")) {
          if (begin) {
            hw.setWord("``");
            begin = false;
          } else {
            hw.setWord("\'\'");
            begin = true;
          }
        } // otherwise leave it alone
        result.addFirst(hw);
      } // end loop
    } else {
      // alternate from the beginning
      begin = true;
      for (HasWord hw : input) {
        String tok = hw.word();
        if (tok.equals("\"")) {
          if (begin) {
            hw.setWord("``");
            begin = false;
          } else {
            hw.setWord("\'\'");
            begin = true;
          }
        } // otherwise leave it alone
        result.addLast(hw);
      } // end loop
    }
    return result;
  }


  public String escapeString(String s) {
    StringBuilder buff = new StringBuilder();
    for (int i = 0; i < s.length(); i++) {
      char curChar = s.charAt(i);
      // run through all the chars we need to replace
      boolean found = false;
      for (int k = 0; k < substChars.length; k++) {
        if (curChar == substChars[k]) {
          buff.append(replaceSubsts[k]);
          found = true;
          break;
        }
      }
      if (found) {
        continue;
      }
      // don't do it if escape is already there usually
      if (curChar == '\\') {
        // add this and the next one unless bracket
        buff.append(curChar);
        if (maybeAppendOneMore(i + 1, s, buff)) {
          i++;
        }
        found = true;
      }
      if (found) {
        continue;
      }
      // run through all the chars we need to escape
      for (int k = 0; k < escapeChars.length; k++) {
        if (curChar == escapeChars[k]) {
          buff.append(replaceEscapes[k]);
          found = true;
          break;
        }
      }
      if (found) {
        continue;
      }

      // append the old char no matter what
      buff.append(curChar);
    }
    return buff.toString();
  }

  private boolean maybeAppendOneMore(int pos, String s, StringBuilder buff) {
    if (pos >= s.length()) {
      return false;
    }
    char candidate = s.charAt(pos);
    boolean found = false;
    for (char ch : substChars) {
      if (candidate == ch) {
        found = true;
        break;
      }
    }
    if (found) {
      return false;
    }
    buff.append(candidate);
    return true;
  }

  /**
   * This will do the escaping on an input file. Input file should already be tokenized,
   * with tokens separated by whitespace. 

   * Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
   *
   * @param args Command line argument: a file or URL
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
      return;
    }
    String filename = args[0];
    try {
      Document d; // initialized below
      if (filename.startsWith("http://")) {
        Document dpre = new BasicDocument(WhitespaceTokenizer.factory()).init(new URL(filename));
        DocumentProcessor notags = new StripTagsProcessor<>();
        d = notags.processDocument(dpre);
      } else {
        d = new BasicDocument(WhitespaceTokenizer.factory()).init(new File(filename));
      }
      DocumentProcessor proc = new PTBEscapingProcessor<>();
      Document newD = proc.processDocument(d);
      for (HasWord word : newD) {
        System.out.println(word);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}