semRewrite.substitutor.MUC Maven / Gradle / Ivy
Natural language processing toolbox using the Sigma knowledge engineering system.
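For reference, the artifact can be declared as a Maven dependency along these lines (the groupId and version shown are illustrative; take the exact coordinates from the repository's version list):

<dependency>
    <groupId>com.articulate.sigma</groupId>
    <artifactId>sigma-nlp</artifactId>
    <version>1.0</version>
</dependency>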
package semRewrite.substitutor;
import com.google.common.collect.Lists;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationSerializer;
import edu.stanford.nlp.pipeline.GenericAnnotationSerializer;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//import edu.stanford.nlp.dcoref.CorefChain;
//import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefSystem;
//import edu.stanford.nlp.hcoref.data.CorefChain;
//import edu.stanford.nlp.hcoref.data.CorefChain.CorefMention;
//import edu.stanford.nlp.hcoref.data.Document;
/*
* Copyright 2014-2015 IPsoft
*
* Author: Adam Pease [email protected]
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Processing to handle the MUC-6 dataset for coreference and compare
* it to Stanford's CoreNLP coreference results
*/
public class MUC {
private int totalStanford = 0;
private int totalMUC = 0;
private int falsePositive = 0;
private int falseNegative = 0;
public static Annotation document2 = null;
public static HashSet<Coref> stanfordCorefs = new HashSet<>();
private HashMap<String,Integer> missedRefs = new HashMap<>();
/****************************************************************
*/
public class Coref {
int ID;
String token;
int ref = -1;
HashMap<String,String> info = new HashMap<>();
int sentNum;
int firstToken;
int lastToken;
int chainID;
public String toString() {
return Integer.toString(ID) + ":" +
token + ":" +
info + ":" +
Integer.toString(sentNum) + ":" +
Integer.toString(firstToken) + ":" +
Integer.toString(lastToken);
}
}
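// Illustrative rendering (field values assumed): a Coref with ID=12,
// token "the company", info {REF=4, TYPE=IDENT}, in sentence 3 spanning
// tokens 5-6 prints as "12:the company:{REF=4, TYPE=IDENT}:3:5:6"
// (HashMap key order in the info segment may vary).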
/****************************************************************
*/
public static List<Coref> buildCorefList(String input) {
ArrayList<Coref> corefs = new ArrayList<>();
return corefs;
}
/****************************************************************
*/
public static String first100(StringBuffer input) {
if (input.length() > 100)
return input.toString().substring(0,100) + "...";
else
return input.toString();
}
/****************************************************************
* convenience method to convert a set of corefs into a map that
* can then be input to @see printCorefList()
*/
public static HashMap<Integer,Coref> toMap(HashSet<Coref> cs) {
HashMap<Integer,Coref> sorted = new HashMap<>();
for (Coref c : cs) {
sorted.put(c.ID, c);
}
return sorted;
}
/****************************************************************
*/
public static void printCorefList(HashMap<Integer,Coref> cs) {
TreeMap<Integer,Coref> corefs = new TreeMap<>();
corefs.putAll(cs);
for (Integer i : corefs.keySet()) {
Coref c = corefs.get(i);
System.out.println(c);
}
}
/****************************************************************
*/
public void printStanfordCorefList(Map<Integer,CorefChain> graph) {
for (CorefChain cc : graph.values()) {
List<CorefChain.CorefMention> mentions = cc.getMentionsInTextualOrder();
if (mentions.size() > 1) {
for (CorefChain.CorefMention ment : mentions) {
Coref c = new Coref();
c.ID = ment.mentionID;
c.token = ment.mentionSpan;
HashMap<String,String> info = new HashMap<>();
c.sentNum = ment.sentNum;
c.firstToken = ment.headIndex;
int lastToken;
//System.out.println(ment.sentNum + " : " + ment.headIndex + " : " + ment.mentionSpan);
System.out.println(ment.sentNum + " : " + ment.startIndex + " : " + ment.mentionSpan);
}
System.out.println();
}
}
}
/****************************************************************
*/
public static String listToString(List<String> input) {
StringBuffer sb = new StringBuffer();
for (String s : input)
sb.append(s + " ");
return sb.toString();
}
/****************************************************************
* Use the Stanford sentence tokenizer to convert the input to a list
* of Strings with one sentence per string
*/
public static List<String> toSentences(String input) {
List<String> results = new ArrayList<>();
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit");
props.setProperty("tokenize.options", "ptb3Escaping=false");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation document1 = new Annotation(input);
pipeline.annotate(document1);
List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
//System.out.println(sentence);
results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
}
return results;
}
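// Usage sketch (input and output are illustrative):
// toSentences("Bob likes to eat. He is big.")
// returns ["Bob likes to eat.", "He is big."]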
/****************************************************************
* Convert Stanford corefs into MUC-style coreference chains.
*/
public HashMap<Integer,HashSet<Coref>> stanfordToCoref(Annotation document) {
HashMap<Integer,HashSet<Coref>> result = new HashMap<>();
int ID = 0;
Map<Integer,CorefChain> graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
for (CorefChain cc : graph.values()) {
List<CorefChain.CorefMention> mentions = cc.getMentionsInTextualOrder();
if (mentions.size() > 1) {
HashSet<Coref> newchain = new HashSet<>();
for (CorefChain.CorefMention ment : mentions) {
Coref c = new Coref();
c.ID = ment.mentionID;
c.token = ment.mentionSpan;
HashMap<String,String> info = new HashMap<>();
c.sentNum = ment.sentNum;
//c.firstToken = ment.headIndex;
c.firstToken = ment.startIndex;
int lastToken;
newchain.add(c);
}
result.put(ID,newchain);
ID++;
}
}
return result;
}
/****************************************************************
* @return a Stanford pipeline
*/
public StanfordCoreNLP initPipeline() {
Properties props = new Properties();
//props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, entitymentions, parse, depparse, hcoref");
props.setProperty("tokenize.options", "ptb3Escaping=false");
//String[] configFileProp = {"-props","/home/apease/Programs/stanford-corenlp-full-2015-04-20/CoreNLP/build/resources/main/edu/stanford/nlp/hcoref/properties/coref-default-dep.properties"};
String[] configFileProp = {"-props",System.getenv("COREF")};
props.putAll(StringUtils.argsToPropertiesWithResolve(configFileProp));
System.out.println("MUC.toCoref(): before initialized pipeline");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
System.out.println("MUC.toCoref(): initialized pipeline");
return pipeline;
}
/****************************************************************
* @return a list of sentences with tokens
*/
public ArrayList<ArrayList<String>> toCoref(String input) {
//System.out.println("INFO in MUC.toCoref(): " + input);
List<Coref> corefs = buildCorefList(input);
ArrayList<ArrayList<String>> results = new ArrayList<>();
StanfordCoreNLP pipeline = initPipeline();
document2 = new Annotation(input);
System.out.println("MUC.toCoref(): after annotation");
try {
pipeline.annotate(document2);
//HybridCorefAnnotator hcoref = new HybridCorefAnnotator(props);
//hcoref.annotate(document);
}
catch (Exception e) {
System.out.println("input: " + input);
System.out.println(e.getMessage());
e.printStackTrace();
}
List<CoreMap> sentences = document2.get(CoreAnnotations.SentencesAnnotation.class);
//SentenceUtil.printCorefChain(document);
System.out.println("Stanford corefs: ");
Map<Integer,CorefChain> graph = document2.get(CorefCoreAnnotations.CorefChainAnnotation.class);
printStanfordCorefList(graph);
for (CoreMap sentence : sentences) {
//System.out.println(sentence);
ArrayList<String> tokenList = new ArrayList<>();
//results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (CoreLabel t : tokens) {
String t2 = t.toString();
if (t2.startsWith("-LRB-"))
t2 = t2.replace("-LRB-", "(");
if (t2.startsWith("-RRB-"))
t2 = t2.replace("-RRB-", ")");
if (t2.startsWith("``"))
t2 = t2.replace("``", "\"");
if (t2.startsWith("''"))
t2 = t2.replace("''", "\"");
// -LCB-, -RCB-, ???
System.out.print(t2 + " ");
tokenList.add(t2);
}
results.add(tokenList);
System.out.println();
}
return results;
}
/****************************************************************
*/
public static List<String> getDocuments(String filename) {
List<String> lines = new ArrayList<>();
System.out.println("INFO in MUC.getDocuments(): Reading files");
LineNumberReader lr = null;
try {
String line;
StringBuffer doc = new StringBuffer();
File nounFile = new File(filename);
if (!nounFile.exists()) {
System.out.println("Error in MUC.getDocuments(): The file does not exist: " + filename);
return lines;
}
long t1 = System.currentTimeMillis();
FileReader r = new FileReader(nounFile);
lr = new LineNumberReader(r);
while ((line = lr.readLine()) != null) {
if (lr.getLineNumber() % 1000 == 0)
System.out.print('.');
line = line.trim();
line = line.replace("", "");
line = line.replace(" ", ".");
line = line.replace("", "");
line = line.replace(" ", ".");
line = line.replace("", "");
line = line.replace("", "");
line = line.replace("----", ".");
line = line.replaceAll("^\\@", "");
if (line.contains("")) {
lines.add(doc.toString());
doc = new StringBuffer();
}
else if (line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("
") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains("") ||
line.contains(" ") ||
line.contains("") ||
line.contains("") ||
line.contains(" ")) {
}
else
doc.append(line + " ");
}
}
catch (Exception ex) {
ex.printStackTrace();
}
finally {
try {
if (lr != null) {
lr.close();
}
}
catch (Exception ex) {
}
}
return lines;
}
/****************************************************************
*/
public static List<String> cleanSGML(String filename) {
List<String> lines = new ArrayList<>();
System.out.println("INFO in MUC.cleanSGML(): Reading files");
LineNumberReader lr = null;
try {
String line;
File nounFile = new File(filename);
if (!nounFile.exists()) {
System.out.println("Error in MUC.cleanSGML(): The file does not exist: " + filename);
return lines;
}
long t1 = System.currentTimeMillis();
FileReader r = new FileReader(nounFile);
lr = new LineNumberReader(r);
while ((line = lr.readLine()) != null) {
if (lr.getLineNumber() % 1000 == 0)
System.out.print('.');
line = line.trim();
line = line.replaceAll("<[^>]+>", "");
line = line.replaceAll("<[^>]+$", "");
line = line.replaceAll("^[^>]+>", "");
lines.add(line);
}
}
catch (Exception ex) {
ex.printStackTrace();
}
finally {
try {
if (lr != null) {
lr.close();
}
}
catch (Exception ex) {
}
}
return lines;
}
/****************************************************************
* Modify @param sb to remove the characters in @param token from
* its starting characters.
* @return true if the token was found
*/
private static boolean removeToken(StringBuffer sb, String token) {
//System.out.println("removeToken() remove '" + token + "'");
//System.out.println("removeToken() before: " + first100(sb));
if (sb == null || sb.length() < 1) {
System.out.println("Error in removeToken() - null string with token: " + token);
return false;
}
while (sb.length() > 0 && Character.isWhitespace(sb.charAt(0)))
sb.deleteCharAt(0);
if (sb.toString().startsWith(token)) {
sb.delete(0, token.length());
}
else {
System.out.println("Error in removeToken() - no match for '" + token + "' in " + first100(sb));
return false;
}
//System.out.println("after: " + first100(sb));
return true;
}
/***************************************************************
*/
private static void expandCurrentToken(String token,
Stack<Integer> currentCoref,
HashMap<Integer,String> corefTokens) {
Integer id = currentCoref.peek();
String tok = corefTokens.get(id);
if (tok.isEmpty())
corefTokens.put(id, token);
else
corefTokens.put(id, tok + " " + token);
}
/****************************************************************
*/
private static void leadingTrim(StringBuffer sb) {
if (sb == null || sb.length() < 1)
return;
while (sb.length() > 0 && Character.isWhitespace(sb.toString().charAt(0)))
sb.deleteCharAt(0);
}
/****************************************************************
*/
private static String processOneParamString(String paramlist, Coref c) {
//System.out.println("processOneParamString(): " + paramlist);
int space = paramlist.indexOf(' ');
int equals = paramlist.indexOf('=');
int quote1 = paramlist.indexOf('"');
int offset = quote1;
int index = quote1 + 1;
while (index < paramlist.length() &&
(paramlist.charAt(index) != '"' || (index > 0 && paramlist.charAt(index - 1) == '\\'))) {
index++;
}
int quote2 = paramlist.indexOf('"', index);
String key = paramlist.substring(space + 1, equals);
String value = paramlist.substring(quote1 + 1, quote2);
//System.out.println(value);
if (key.equals("REF"))
c.ref = Integer.parseInt(value);
c.info.put(key, value);
paramlist = paramlist.substring(quote2 + 1);
//System.out.println("result of processOneParamString(): " + c);
return paramlist;
}
/****************************************************************
*/
private static void processParams(Coref c, HashMap<Integer,String> corefParams) {
String paramlist = corefParams.get(c.ID);
//System.out.println("processParams(): " + paramlist);
while (paramlist.indexOf('"') > -1) {
paramlist = processOneParamString(paramlist, c);
}
}
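// Worked example (tag text assumed): for a stored parameter string
// <COREF ID="2" TYPE="IDENT" REF="1">, each call to
// processOneParamString() consumes one key="value" pair left to right,
// leaving c.info = {ID=2, TYPE=IDENT, REF=1} and c.ref = 1.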
/***************************************************************
* @return the String content of a COREF tag.
* Destructively modifies sb to remove the tag
*/
private static String getTag(StringBuffer sb, Matcher m) {
if (sb.indexOf("<") > -1)
sb.delete(0, sb.indexOf("<"));
String tag = sb.toString().substring(0, sb.indexOf(">") + 1);
sb.delete(0, sb.indexOf(">") + 1);
return tag;
}
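// Example (buffer contents assumed): with sb holding
// <COREF ID="2">He ran ..., getTag() returns <COREF ID="2">
// and leaves sb holding He ran ...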
/****************************************************************
* Build chains of coreferences based on their pairwise references
*/
private static HashMap<Integer,HashSet<Coref>> buildChains(HashMap<Integer,Coref> corefs) {
HashMap<Integer,Integer> chainMap = new HashMap<>(); // coref id to chain id
HashMap<Integer,HashSet<Coref>> chains = new HashMap<>(); // chain id to members
int chainNum = 0;
boolean first = true;
Coref firstC = null;
for (Integer i : corefs.keySet()) {
Coref c = corefs.get(i);
if (first) {
firstC = c;
first = false;
}
if (!chainMap.keySet().contains(c.ID)) {
if (c.ref == -1) {
chainMap.put(c.ID, chainNum);
HashSet<Coref> chain = new HashSet<>();
chain.add(c);
c.chainID = chainNum;
chains.put(chainNum, chain);
chainNum++;
}
else if (!chainMap.keySet().contains(c.ref)) {
chainMap.put(c.ID, chainNum);
chainMap.put(c.ref, chainNum);
HashSet<Coref> chain = new HashSet<>();
c.chainID = chainNum;
chain.add(c);
Coref cref = corefs.get(c.ref);
if (cref != null)
chain.add(cref);
else
System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref);
chains.put(chainNum, chain);
chainNum++;
}
else {
int ref = chainMap.get(c.ref);
chains.get(ref).add(c);
chainMap.put(c.ID, ref);
}
}
else {
if (c.ref == -1) {
} // no reference so do nothing
else if (!chainMap.keySet().contains(c.ref)) {
chainMap.put(c.ID, chainNum);
chainMap.put(c.ref, chainNum);
HashSet<Coref> chain = new HashSet<>();
c.chainID = chainNum;
chain.add(c);
Coref cref = corefs.get(c.ref);
if (cref != null)
chain.add(cref);
else
System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref);
chains.put(chainNum, chain);
chainNum++;
}
else {
int ref = chainMap.get(c.ref);
chains.get(ref).add(c);
}
}
}
return chains;
}
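// Chaining sketch (IDs assumed; insertion-order iteration shown for
// illustration): given corefs 1 (ref=-1), 2 (ref=1) and 3 (ref=2),
// coref 1 starts chain 0, coref 2 joins it via chainMap(1)=0, and
// coref 3 joins it via chainMap(2)=0, yielding one chain {1, 2, 3}.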
/****************************************************************
* Strip a Stanford token number suffix from the token
*/
private static String stripTokenNum(String t) {
if (t.lastIndexOf("-") < 0)
return t;
return t.substring(0,t.lastIndexOf("-"));
}
/****************************************************************
* Get the Stanford token number suffix from the token
*/
private static String getTokenNum(String t) {
if (t.lastIndexOf("-") < 0)
return t;
return t.substring(t.lastIndexOf("-") + 1);
}
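// Example: for the Stanford-style token "company-7",
// stripTokenNum() returns "company" and getTokenNum() returns "7".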
/****************************************************************
* Trim punctuation
*/
private static String trimPunc(String t) {
boolean changed = true;
while (changed && t.length() > 0) {
if (t.charAt(t.length() -1) == ' ') {
t = t.substring(0,t.length() -1);
changed = true;
}
else if (t.charAt(t.length() -1) == ',') {
t = t.substring(0,t.length() -1);
changed = true;
}
else if (t.endsWith(" 's")) {
t = t.substring(0,t.length() - 3) + t.substring(t.length() -2,t.length());
changed = true;
}
else if (t.endsWith(" .")) {
t = t.substring(0,t.length() - 2);
changed = true;
}
else
changed = false;
}
return t;
}
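// Example: trimPunc("the company 's,") first drops the trailing comma,
// then closes up " 's" to give "the company's".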
/****************************************************************
*/
private static void printChains(HashMap<Integer,HashSet<Coref>> corefs) {
for (Integer i : corefs.keySet()) {
HashSet<Coref> cs = corefs.get(i);
printCorefList(toMap(cs));
System.out.println();
}
}
/****************************************************************
*/
private static boolean find(Coref c, HashMap<Integer,HashSet<Coref>> chains) {
for (int i : chains.keySet()) {
HashSet<Coref> chain = chains.get(i);
for (Coref c2 : chain) {
if (c2.sentNum == c.sentNum && c2.firstToken == c.firstToken)
return true;
if (c2.info.containsKey("MIN") &&
(trimPunc(c2.info.get("MIN")).equals(trimPunc(c.token)) ||
(trimPunc(c.token)).contains(trimPunc(c2.info.get("MIN")))) )
return true;
if (c.info.containsKey("MIN") &&
(trimPunc(c.info.get("MIN")).equals(trimPunc(c2.token)) ||
(trimPunc(c2.token)).contains(trimPunc(c.info.get("MIN")))) )
return true;
}
}
return false;
}
/****************************************************************
* A kludge to handle the fact that MUC sometimes splits tokens
* that are hyphenated. So we pre-split all hyphenated tokens
* into several tokens that share the same token number.
*/
private static ArrayList<String> splitTokens(ArrayList<String> tokens) {
ArrayList<String> result = new ArrayList<>();
for (String t : tokens) {
if (stripTokenNum(t).indexOf('-') > -1) {
String num = getTokenNum(t);
String[] split = t.split("-");
for (int i = 0; i < split.length - 1 ; i++)
result.add(split[i] + "-" + num);
}
else {
result.add(t);
}
}
return result;
}
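// Example: splitTokens(["third-quarter-12", "profit-13"]) returns
// ["third-12", "quarter-12", "profit-13"]: the hyphenated token is
// split into pieces that all share token number 12.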
/****************************************************************
*/
private static TreeMap<Integer,ArrayList<String>> sortTotals(HashMap<String,Integer> missed) {
TreeMap<Integer,ArrayList<String>> commonMissed = new TreeMap<>();
for (String s : missed.keySet()) {
Integer i = missed.get(s);
if (commonMissed.containsKey(i)) {
ArrayList<String> al = commonMissed.get(i);
al.add(s);
}
else {
ArrayList<String> al = new ArrayList<>();
al.add(s);
commonMissed.put(i, al);
}
}
return commonMissed;
}
/****************************************************************
*/
private static void printTopN(TreeMap<Integer,ArrayList<String>> map, int n) {
int index = 0;
Iterator<Integer> it = map.keySet().iterator();
while (index < n && index < map.keySet().size()) {
Integer key = it.next();
ArrayList<String> al = map.get(key);
System.out.println(key + " : " + al);
index++;
}
}
/** ***************************************************************
* Compare Stanford and MUC coreference chains. For each token in
* the MUC chains, record whether it appears in the Stanford output,
* which chain ID it belongs to, and which Stanford token it
* corresponds to. Use that map to score the tokens that are not
* found (errors of omission, or false negatives). Mark the tokens
* found in both MUC and Stanford in a separate map, and use it to
* build a third map of tokens that are in Stanford but not in MUC
* (false positives).
* @param chains are the MUC chains
* @param document contains the Stanford chains
*/
public void compareChains(HashMap<Integer,HashSet<Coref>> chains,
Annotation document) {
HashMap<Integer,Coref> stanfordNotMUC = new HashMap<>();
HashMap<Integer,Coref> MUCNotStanford = new HashMap<>();
int thisStanford = 0;
HashMap<Integer,HashSet<Coref>> stanfordChains = stanfordToCoref(document);
for (int i : stanfordChains.keySet()) {
HashSet<Coref> chain = stanfordChains.get(i); // bug fix: was chains.get(i), which compared MUC corefs against themselves
if (chain != null) {
for (Coref c : chain) {
totalStanford++;
thisStanford++;
boolean found = find(c, chains);
if (!found)
stanfordNotMUC.put(c.ID, c);
}
}
}
System.out.println("Stanford not MUC: " + (stanfordNotMUC.keySet().size() + "/" + thisStanford));
falsePositive = falsePositive + stanfordNotMUC.keySet().size();
printCorefList(stanfordNotMUC);
int thisMUC = 0;
for (int i : chains.keySet()) {
HashSet<Coref> chain = chains.get(i);
for (Coref c : chain) {
totalMUC++;
thisMUC++;
boolean found = find(c, stanfordChains);
if (!found) {
MUCNotStanford.put(c.ID, c);
if (!missedRefs.containsKey(c.token))
missedRefs.put(c.token,0);
else {
Integer counter = missedRefs.get(c.token) + 1;
missedRefs.put(c.token, counter);
}
}
}
}
System.out.println("MUC not Stanford : " + (MUCNotStanford.keySet().size() + "/" + thisMUC));
falseNegative = falseNegative + MUCNotStanford.keySet().size();
printCorefList(MUCNotStanford);
}
/** ***************************************************************
* Pick tokens off the input sentence string, capturing corefXML
* when present and aligning the corefXML with token numbers
*/
public void makeCorefList(String sentsDirty,
ArrayList<ArrayList<String>> tokenized) {
StringBuffer sb = new StringBuffer(sentsDirty);
HashMap<Integer,Coref> corefs = new HashMap<>();
HashMap<Integer,String> corefTokens = new HashMap<>();
HashMap<Integer,String> corefParams = new HashMap<>();
HashMap<Integer,Integer> references = new HashMap<>();
Stack<Integer> currentCoref = new Stack<>();
Pattern p1 = Pattern.compile("^\\s*(<COREF[^>]+>)"); // COREF tag patterns reconstructed; the literals were lost in extraction
Pattern p2 = Pattern.compile("^\\s*(</COREF> )");
int sentNum = 0;
int level = 0;
int tokenNum = 0;
int firstToken = 0;
boolean openTag = false;
boolean skipping = false;
String tag = "";
while (sb.length() > 0) {
if (sentNum > tokenized.size() - 1) {
System.out.println("Error in MUC.makeCorefList(): no tokenized sentence for: " + sb);
break;
}
ArrayList<String> tokens = tokenized.get(sentNum);
tokens = splitTokens(tokens);
//System.out.println("Num tokens: " + tokens.size());
sentNum++;
String lastToken = "";
for (String t : tokens) {
String tokenNumStr = getTokenNum(t);
tokenNum = Integer.parseInt(tokenNumStr);
String token = stripTokenNum(t);
boolean tokenMatches = false;
skipping = false;
while (!tokenMatches && !skipping) {
//System.out.println("Token: " + token + " Last token: " + lastToken);
//System.out.println("sb: " + first100(sb));
Matcher m1 = p1.matcher(sb.toString());
Matcher m2 = p2.matcher(sb.toString());
if (token.length() > 0 && sb.length() > 0 &&
token.charAt(0) != '-' && sb.charAt(0) == '-')
sb.delete(0, 1);
else if (token.lastIndexOf('-') == 0)
tokenMatches = true;
else if (token.length() > 0 && sb.length() > 0 && token.charAt(0) == sb.charAt(0) &&
token.charAt(0) == '\'' && sb.charAt(0) == '\'' ) {
sb.delete(0, 1);
token = token.substring(1);
System.out.println("altered Token: " + token);
System.out.println("altered sb: " + first100(sb));
}
// Stanford can insert an extra period if the last token in a sentence is an abbreviation
else if (m1.find()) {
tag = getTag(sb, m1);
level++;
int quoteIndex = tag.indexOf("\"");
String id = tag.substring(quoteIndex + 1, tag.indexOf("\"", quoteIndex + 1));
currentCoref.push(Integer.parseInt(id));
corefTokens.put(Integer.parseInt(id), "");
corefParams.put(Integer.parseInt(id), tag);
int refIndex = tag.indexOf("REF=");
if (refIndex > -1) {
int refQuoteIndex = tag.indexOf("\"", refIndex + 1);
String ref = tag.substring(refQuoteIndex + 1, tag.indexOf("\"", refQuoteIndex + 1));
references.put(Integer.parseInt(id), Integer.parseInt(ref));
}
openTag = true;
}
else if (m2.find()) {
if (sb.indexOf("<") > -1)
sb.delete(0, sb.indexOf("<"));
sb.delete(0, sb.indexOf(">") + 1);
if (currentCoref.size() < 1) {
System.out.println("Error in MUC.makeCorefList(): no open tag for close tag\n" + first100(sb));
return;
}
Integer cid = currentCoref.pop();
Coref c = new Coref();
c.ID = cid;
c.token = trimPunc(corefTokens.get(cid));
c.firstToken = firstToken;
c.lastToken = tokenNum;
processParams(c, corefParams);
c.sentNum = sentNum;
corefs.put(c.ID, c);
level--;
}
else if (stripTokenNum(t).equals(".") && stripTokenNum(lastToken).endsWith(".") &&
!sb.toString().matches("^\\s*\\..*")) {
System.out.println("makeCorefList() Skipping token removal: " + t);
System.out.println(first100(sb));
skipping = true;
continue;
}
else {
if (openTag)
firstToken = tokenNum;
lastToken = token;
leadingTrim(sb);
tokenMatches = removeToken(sb, token);
if (level > 0)
expandCurrentToken(token, currentCoref, corefTokens);
openTag = false;
}
}
}
}
HashMap<Integer,HashSet<Coref>> chains = buildChains(corefs);
printChains(chains);
compareChains(chains, document2);
}
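// Alignment sketch (markup assumed): for the input fragment
// <COREF ID="2" REF="1">He</COREF> ran with tokens ["He-1", "ran-2"],
// the open tag is consumed and its ID pushed, "He" is removed from sb
// and recorded as the text of coref 2, and the close tag pops the
// stack and builds a Coref spanning token 1 with ref = 1.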
/** ***************************************************************
*/
public static void testParamString() {
MUC muc = new MUC();
String paramstring = "";
Coref c = muc.new Coref();
paramstring = processOneParamString(paramstring,c);
paramstring = processOneParamString(paramstring,c);
paramstring = processOneParamString(paramstring,c);
paramstring = processOneParamString(paramstring,c);
}
/** ***************************************************************
*/
public static void testRemoveToken() {
MUC muc = new MUC();
String token = "Corp.";
StringBuffer sb = new StringBuffer("Corp. He also served for 10 years as chairman and chief executive of Paramount Pictures Corp., a unit of Paramount Communications Inc. Arrow Investments Inc., a corporation controlled by Mr. Diller , in December agreed to purchase $25 million of QVC stock in a privately negotiated transaction.");
Coref c = muc.new Coref();
removeToken(sb,token);
System.out.println("MUC.testRemoveToken: " + sb);
}
/** ***************************************************************
*/
public static void testWhitespace() {
String paramstring = " . By Patrick M. Reilly sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
}
/** ***************************************************************
*/
public void testParallelPipeline() {
try (BufferedReader br = new BufferedReader(new InputStreamReader(System.in))) {
Properties preprocessprops = new Properties();
preprocessprops.setProperty("annotators", "tokenize, ssplit,pos,lemma, ner, parse");
preprocessprops.setProperty("tokenize.options", "ptb3Escaping=false");
StanfordCoreNLP preprocesspipeline = new StanfordCoreNLP(preprocessprops);
Properties corefprops = new Properties();
corefprops.setProperty("annotators", "dcoref");
//corefprops.setProperty("annotators", "hcoref");
corefprops.setProperty("tokenize.options", "ptb3Escaping=false");
corefprops.setProperty("enforceRequirements","false");
StanfordCoreNLP corefpipeline = new StanfordCoreNLP(corefprops);
List<CoreMap> coreMaps = Lists.newArrayList();
String input;
while ((input = br.readLine()) != null) {
Annotation document = new Annotation(input);
preprocesspipeline.annotate(document);
List<CoreMap> newcoreMaps = document.get(CoreAnnotations.SentencesAnnotation.class);
coreMaps.addAll(newcoreMaps);
System.out.println("Stanford corefs: ");
Annotation wholeDocument = new Annotation(coreMaps);
corefpipeline.annotate(wholeDocument);
Map<Integer,CorefChain> graph = wholeDocument.get(CorefCoreAnnotations.CorefChainAnnotation.class);
printStanfordCorefList(graph);
}
}
catch (Exception e) {
e.printStackTrace();
}
}
/** ***************************************************************
*/
public void testMUC() {
//List lines = cleanSGML("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
//List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
List lines = getDocuments(System.getenv("MUCCORPUS") + File.separator + "formal-tst.CO.key.cleanup.09jul96");
//List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/Wash.txt");
//List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/891101-0056.co.v0.sgm" + "");
for (String s : lines) {
String cleanedInput = s.replaceAll("<COREF[^>]+>", ""); // COREF tag literals reconstructed; lost in extraction
cleanedInput = cleanedInput.replace("</COREF> ", "");
List sentsClean = toSentences(cleanedInput);
List sentsDirty = toSentences(s);
System.out.println("\n\nMUC markup: " + sentsDirty);
String allClean = listToString(sentsClean);
ArrayList<ArrayList<String>> tokenized = toCoref(allClean);
makeCorefList(s, tokenized);
}
System.out.println("False positive rate: " + (falsePositive + "/" + totalStanford));
System.out.println("False negative rate: " + (falseNegative + "/" + totalMUC));
System.out.println("Most common missed corefs: ");
printTopN(sortTotals(missedRefs), 20);
}
/** ***************************************************************
*/
public static void main(String[] args) {
//testWhitespace();
MUC muc = new MUC();
muc.initPipeline();
//muc.toCoref("Bob likes to eat. He is big");
muc.testMUC();
}
}