edu.stanford.nlp.process.PTBEscapingProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.process;
import java.util.function.Function;
import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;
import java.io.File;
import java.net.URL;
import java.util.*;
/**
* Produces a new Document of Words in which special characters of the PTB
* have been properly escaped.
*
* @author Teg Grenager ([email protected])
* @author Sarah Spikes ([email protected]) (Templatization)
*
* @param The type of the labels
* @param The type of the features
*/
public class PTBEscapingProcessor extends AbstractListProcessor
implements Function, List> {
private static final char[] EMPTY_CHAR_ARRAY = new char[0];
private static final char[] SUBST_CHARS = {'(', ')', '[', ']', '{', '}'};
private static final String[] REPLACE_SUBSTS = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};
private final char[] substChars;
private final String[] replaceSubsts;
// starting about 2013, we no longer escape * and /. We de-escape them when reading Treebank3
private final char[] escapeChars; // was {'/', '*'};
private final String[] replaceEscapes; // was = {"\\/", "\\*"};
private final boolean fixQuotes;
public PTBEscapingProcessor() {
this(true);
}
public PTBEscapingProcessor(boolean fixQuotes) {
this(EMPTY_CHAR_ARRAY, StringUtils.EMPTY_STRING_ARRAY, SUBST_CHARS, REPLACE_SUBSTS, fixQuotes);
}
public PTBEscapingProcessor(char[] escapeChars, String[] replaceEscapes, char[] substChars, String[] replaceSubsts, boolean fixQuotes) {
this.escapeChars = escapeChars;
this.replaceEscapes = replaceEscapes;
this.substChars = substChars;
this.replaceSubsts = replaceSubsts;
this.fixQuotes = fixQuotes;
}
/*
public Document processDocument(Document input) {
Document result = input.blankDocument();
result.addAll(process((List)input));
return result;
}
*/
/** Escape a List of HasWords. Implements the
* Function<List<HasWord>, List<HasWord>> interface.
*/
@Override
public List apply(List hasWordsList) {
return process(hasWordsList);
}
public static String unprocess(String s) {
for (int i = 0; i < REPLACE_SUBSTS.length; i++) {
s = s.replaceAll(REPLACE_SUBSTS[i], String.valueOf(SUBST_CHARS[i]));
}
// at present doesn't deal with * / stuff ... never did
return s;
}
/**
* @param input must be a List of objects of type HasWord
*/
@Override
public List process(List input) {
List output = new ArrayList<>();
for (IN h : input) {
String s = h.word();
h.setWord(escapeString(s));
output.add(h);
}
if (fixQuotes) {
return fixQuotes(output);
}
return output;
}
private static List fixQuotes(List input) {
int inputSize = input.size();
LinkedList result = new LinkedList<>();
if (inputSize == 0) {
return result;
}
boolean begin;
// see if there is a quote at the end
if (input.get(inputSize - 1).word().equals("\"")) {
// alternate from the end
begin = false;
for (int i = inputSize - 1; i >= 0; i--) {
HasWord hw = input.get(i);
String tok = hw.word();
if (tok.equals("\"")) {
if (begin) {
hw.setWord("``");
begin = false;
} else {
hw.setWord("\'\'");
begin = true;
}
} // otherwise leave it alone
result.addFirst(hw);
} // end loop
} else {
// alternate from the beginning
begin = true;
for (HasWord hw : input) {
String tok = hw.word();
if (tok.equals("\"")) {
if (begin) {
hw.setWord("``");
begin = false;
} else {
hw.setWord("\'\'");
begin = true;
}
} // otherwise leave it alone
result.addLast(hw);
} // end loop
}
return result;
}
public String escapeString(String s) {
StringBuilder buff = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
char curChar = s.charAt(i);
// run through all the chars we need to replace
boolean found = false;
for (int k = 0; k < substChars.length; k++) {
if (curChar == substChars[k]) {
buff.append(replaceSubsts[k]);
found = true;
break;
}
}
if (found) {
continue;
}
// don't do it if escape is already there usually
if (curChar == '\\') {
// add this and the next one unless bracket
buff.append(curChar);
if (maybeAppendOneMore(i + 1, s, buff)) {
i++;
}
found = true;
}
if (found) {
continue;
}
// run through all the chars we need to escape
for (int k = 0; k < escapeChars.length; k++) {
if (curChar == escapeChars[k]) {
buff.append(replaceEscapes[k]);
found = true;
break;
}
}
if (found) {
continue;
}
// append the old char no matter what
buff.append(curChar);
}
return buff.toString();
}
private boolean maybeAppendOneMore(int pos, String s, StringBuilder buff) {
if (pos >= s.length()) {
return false;
}
char candidate = s.charAt(pos);
boolean found = false;
for (char ch : substChars) {
if (candidate == ch) {
found = true;
break;
}
}
if (found) {
return false;
}
buff.append(candidate);
return true;
}
/**
* This will do the escaping on an input file. Input file should already be tokenized,
* with tokens separated by whitespace.
* Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
*
* @param args Command line argument: a file or URL
*/
public static void main(String[] args) {
if (args.length != 1) {
System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
return;
}
String filename = args[0];
try {
Document d; // initialized below
if (filename.startsWith("http://")) {
Document dpre = new BasicDocument(WhitespaceTokenizer.factory()).init(new URL(filename));
DocumentProcessor notags = new StripTagsProcessor<>();
d = notags.processDocument(dpre);
} else {
d = new BasicDocument(WhitespaceTokenizer.factory()).init(new File(filename));
}
DocumentProcessor proc = new PTBEscapingProcessor<>();
Document newD = proc.processDocument(d);
for (HasWord word : newD) {
System.out.println(word);
}
} catch (Exception e) {
e.printStackTrace();
}
}
}