edu.stanford.nlp.process.PTBEscapingProcessor Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.process;


import java.util.function.Function;


import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;

import java.io.File;
import java.net.URL;
import java.util.*;


/**
 * Produces a new Document of Words in which special characters of the PTB
 * have been properly escaped.
 *
 * @author Teg Grenager ([email protected])
 * @author Sarah Spikes ([email protected]) (Templatization)
 *
 * @param  The type of the labels
 * @param  The type of the features
 */
public class PTBEscapingProcessor extends AbstractListProcessor
  implements Function, List> {

  private static final char[] EMPTY_CHAR_ARRAY = new char[0];

  private static final char[] SUBST_CHARS = {'(', ')', '[', ']', '{', '}'};
  private static final String[] REPLACE_SUBSTS = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};

  private final char[] substChars;
  private final String[] replaceSubsts;

  // starting about 2013, we no longer escape  * and /. We de-escape them when reading Treebank3
  private final char[] escapeChars; // was  {'/', '*'};
  private final String[] replaceEscapes; // was = {"\\/", "\\*"};

  private final boolean fixQuotes;


  public PTBEscapingProcessor() {
    this(true);
  }

  public PTBEscapingProcessor(boolean fixQuotes) {
    this(EMPTY_CHAR_ARRAY, StringUtils.EMPTY_STRING_ARRAY, SUBST_CHARS, REPLACE_SUBSTS, fixQuotes);
  }

  public PTBEscapingProcessor(char[] escapeChars, String[] replaceEscapes, char[] substChars, String[] replaceSubsts, boolean fixQuotes) {
    this.escapeChars = escapeChars;
    this.replaceEscapes = replaceEscapes;
    this.substChars = substChars;
    this.replaceSubsts = replaceSubsts;
    this.fixQuotes = fixQuotes;
  }


  /*
  public Document processDocument(Document input) {
    Document result = input.blankDocument();
    result.addAll(process((List)input));
    return result;
  }
  */


  /** Escape a List of HasWords.  Implements the
   *  Function<List<HasWord>, List<HasWord>> interface.
   */
  @Override
  public List apply(List hasWordsList) {
    return process(hasWordsList);
  }

  public static String unprocess(String s) {
    for (int i = 0; i < REPLACE_SUBSTS.length; i++) {
      s = s.replaceAll(REPLACE_SUBSTS[i], String.valueOf(SUBST_CHARS[i]));
    }
    // at present doesn't deal with * / stuff ... never did
    return s;
  }

  /**
   * @param input must be a List of objects of type HasWord
   */
  @Override
  public List process(List input) {
    List output = new ArrayList<>();
    for (IN h : input) {
      String s = h.word();
      h.setWord(escapeString(s));
      output.add(h);
    }
    if (fixQuotes) {
      return fixQuotes(output);
    }
    return output;
  }


  private static List fixQuotes(List input) {
    int inputSize = input.size();
    LinkedList result = new LinkedList<>();
    if (inputSize == 0) {
      return result;
    }
    boolean begin;
    // see if there is a quote at the end
    if (input.get(inputSize - 1).word().equals("\"")) {
      // alternate from the end
      begin = false;
      for (int i = inputSize - 1; i >= 0; i--) {
        HasWord hw = input.get(i);
        String tok = hw.word();
        if (tok.equals("\"")) {
          if (begin) {
            hw.setWord("``");
            begin = false;
          } else {
            hw.setWord("\'\'");
            begin = true;
          }
        } // otherwise leave it alone
        result.addFirst(hw);
      } // end loop
    } else {
      // alternate from the beginning
      begin = true;
      for (HasWord hw : input) {
        String tok = hw.word();
        if (tok.equals("\"")) {
          if (begin) {
            hw.setWord("``");
            begin = false;
          } else {
            hw.setWord("\'\'");
            begin = true;
          }
        } // otherwise leave it alone
        result.addLast(hw);
      } // end loop
    }
    return result;
  }


  public String escapeString(String s) {
    StringBuilder buff = new StringBuilder();
    for (int i = 0; i < s.length(); i++) {
      char curChar = s.charAt(i);
      // run through all the chars we need to replace
      boolean found = false;
      for (int k = 0; k < substChars.length; k++) {
        if (curChar == substChars[k]) {
          buff.append(replaceSubsts[k]);
          found = true;
          break;
        }
      }
      if (found) {
        continue;
      }
      // don't do it if escape is already there usually
      if (curChar == '\\') {
        // add this and the next one unless bracket
        buff.append(curChar);
        if (maybeAppendOneMore(i + 1, s, buff)) {
          i++;
        }
        found = true;
      }
      if (found) {
        continue;
      }
      // run through all the chars we need to escape
      for (int k = 0; k < escapeChars.length; k++) {
        if (curChar == escapeChars[k]) {
          buff.append(replaceEscapes[k]);
          found = true;
          break;
        }
      }
      if (found) {
        continue;
      }

      // append the old char no matter what
      buff.append(curChar);
    }
    return buff.toString();
  }

  private boolean maybeAppendOneMore(int pos, String s, StringBuilder buff) {
    if (pos >= s.length()) {
      return false;
    }
    char candidate = s.charAt(pos);
    boolean found = false;
    for (char ch : substChars) {
      if (candidate == ch) {
        found = true;
        break;
      }
    }
    if (found) {
      return false;
    }
    buff.append(candidate);
    return true;
  }

  /**
   * This will do the escaping on an input file. Input file should already be tokenized,
   * with tokens separated by whitespace. 

   * Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
   *
   * @param args Command line argument: a file or URL
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
      return;
    }
    String filename = args[0];
    try {
      Document d; // initialized below
      if (filename.startsWith("http://")) {
        Document dpre = new BasicDocument(WhitespaceTokenizer.factory()).init(new URL(filename));
        DocumentProcessor notags = new StripTagsProcessor<>();
        d = notags.processDocument(dpre);
      } else {
        d = new BasicDocument(WhitespaceTokenizer.factory()).init(new File(filename));
      }
      DocumentProcessor proc = new PTBEscapingProcessor<>();
      Document newD = proc.processDocument(d);
      for (HasWord word : newD) {
        System.out.println(word);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}