edu.stanford.nlp.process.PTBEscapingProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.process;
import java.util.function.Function;
import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;
import java.io.File;
import java.net.URL;
import java.util.*;
/**
* Produces a new Document of Words in which special characters of the PTB
* have been properly escaped.
*
* @author Teg Grenager ([email protected])
* @author Sarah Spikes ([email protected]) (Templatization)
*
* @param The type of the labels
* @param The type of the features
*/
public class PTBEscapingProcessor extends AbstractListProcessor
implements Function, List> {
private static final char[] EMPTY_CHAR_ARRAY = new char[0];
private static final char[] SUBST_CHARS = {'(', ')', '[', ']', '{', '}'};
private static final String[] REPLACE_SUBSTS = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};
private final char[] substChars;
private final String[] replaceSubsts;
// starting about 2013, we no longer escape * and /. We de-escape them when reading Treebank3
private final char[] escapeChars; // was {'/', '*'};
private final String[] replaceEscapes; // was = {"\\/", "\\*"};
private final boolean fixQuotes;
public PTBEscapingProcessor() {
this(true);
}
public PTBEscapingProcessor(boolean fixQuotes) {
this(EMPTY_CHAR_ARRAY, StringUtils.EMPTY_STRING_ARRAY, SUBST_CHARS, REPLACE_SUBSTS, fixQuotes);
}
public PTBEscapingProcessor(char[] escapeChars, String[] replaceEscapes, char[] substChars, String[] replaceSubsts, boolean fixQuotes) {
this.escapeChars = escapeChars;
this.replaceEscapes = replaceEscapes;
this.substChars = substChars;
this.replaceSubsts = replaceSubsts;
this.fixQuotes = fixQuotes;
}
/*
public Document processDocument(Document input) {
Document result = input.blankDocument();
result.addAll(process((List)input));
return result;
}
*/
/** Escape a List of HasWords. Implements the
* Function<List<HasWord>, List<HasWord>> interface.
*/
@Override
public List apply(List hasWordsList) {
return process(hasWordsList);
}
public static String unprocess(String s) {
for (int i = 0; i < REPLACE_SUBSTS.length; i++) {
s = s.replaceAll(REPLACE_SUBSTS[i], String.valueOf(SUBST_CHARS[i]));
}
// at present doesn't deal with * / stuff ... never did
return s;
}
/**
* @param input must be a List of objects of type HasWord
*/
@Override
public List process(List extends IN> input) {
List output = new ArrayList<>();
for (IN h : input) {
String s = h.word();
h.setWord(escapeString(s));
output.add(h);
}
if (fixQuotes) {
return fixQuotes(output);
}
return output;
}
private static List fixQuotes(List input) {
int inputSize = input.size();
LinkedList result = new LinkedList<>();
if (inputSize == 0) {
return result;
}
boolean begin;
// see if there is a quote at the end
if (input.get(inputSize - 1).word().equals("\"")) {
// alternate from the end
begin = false;
for (int i = inputSize - 1; i >= 0; i--) {
HasWord hw = input.get(i);
String tok = hw.word();
if (tok.equals("\"")) {
if (begin) {
hw.setWord("``");
begin = false;
} else {
hw.setWord("\'\'");
begin = true;
}
} // otherwise leave it alone
result.addFirst(hw);
} // end loop
} else {
// alternate from the beginning
begin = true;
for (HasWord hw : input) {
String tok = hw.word();
if (tok.equals("\"")) {
if (begin) {
hw.setWord("``");
begin = false;
} else {
hw.setWord("\'\'");
begin = true;
}
} // otherwise leave it alone
result.addLast(hw);
} // end loop
}
return result;
}
public String escapeString(String s) {
StringBuilder buff = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
char curChar = s.charAt(i);
// run through all the chars we need to replace
boolean found = false;
for (int k = 0; k < substChars.length; k++) {
if (curChar == substChars[k]) {
buff.append(replaceSubsts[k]);
found = true;
break;
}
}
if (found) {
continue;
}
// don't do it if escape is already there usually
if (curChar == '\\') {
// add this and the next one unless bracket
buff.append(curChar);
if (maybeAppendOneMore(i + 1, s, buff)) {
i++;
}
found = true;
}
if (found) {
continue;
}
// run through all the chars we need to escape
for (int k = 0; k < escapeChars.length; k++) {
if (curChar == escapeChars[k]) {
buff.append(replaceEscapes[k]);
found = true;
break;
}
}
if (found) {
continue;
}
// append the old char no matter what
buff.append(curChar);
}
return buff.toString();
}
private boolean maybeAppendOneMore(int pos, String s, StringBuilder buff) {
if (pos >= s.length()) {
return false;
}
char candidate = s.charAt(pos);
boolean found = false;
for (char ch : substChars) {
if (candidate == ch) {
found = true;
break;
}
}
if (found) {
return false;
}
buff.append(candidate);
return true;
}
/**
* This will do the escaping on an input file. Input file should already be tokenized,
* with tokens separated by whitespace.
* Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
*
* @param args Command line argument: a file or URL
*/
public static void main(String[] args) {
if (args.length != 1) {
System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
return;
}
String filename = args[0];
try {
Document d; // initialized below
if (filename.startsWith("http://")) {
Document dpre = new BasicDocument(WhitespaceTokenizer.factory()).init(new URL(filename));
DocumentProcessor notags = new StripTagsProcessor<>();
d = notags.processDocument(dpre);
} else {
d = new BasicDocument(WhitespaceTokenizer.factory()).init(new File(filename));
}
DocumentProcessor proc = new PTBEscapingProcessor<>();
Document newD = proc.processDocument(d);
for (HasWord word : newD) {
System.out.println(word);
}
} catch (Exception e) {
e.printStackTrace();
}
}
}