semRewrite.substitutor.MUC Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sigma-nlp Show documentation
Natural language processing toolbox using Sigma knowledge engineering system.
There is a newer version: 1.1
package semRewrite.substitutor;

import com.google.common.collect.Lists;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationSerializer;
import edu.stanford.nlp.pipeline.GenericAnnotationSerializer;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import edu.stanford.nlp.dcoref.CorefChain;
//import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefSystem;
//import edu.stanford.nlp.hcoref.data.CorefChain;
//import edu.stanford.nlp.hcoref.data.CorefChain.CorefMention;
//import edu.stanford.nlp.hcoref.data.Document;

/*
 * Copyright 2014-2015 IPsoft
 *
 * Author: Adam Pease [email protected]
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program ; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA  02111-1307 USA
 *
 * Processing to handle the MUC-6 dataset for coreference and compare
 * it to Stanford's CoreNLP coreference results
 */
public class MUC {

    private int totalStanford = 0;
    private int totalMUC = 0;
    private int falsePositive = 0;
    private int falseNegative = 0;
    public static Annotation document2 = null;
    public static HashSet stanfordCorefs = new HashSet<>();
    private HashMap missedRefs = new HashMap<>();

    /****************************************************************
     */
    public class Coref {

        int ID;
        String token;
        int ref = -1;
        HashMap info = new HashMap<>();
        int sentNum;
        int firstToken;
        int lastToken;
        int chainID;

        public String toString() {
            return Integer.toString(ID) + ":" +
                    token + ":" +
                    info + ":" +
                    Integer.toString(sentNum) + ":" +
                    Integer.toString(firstToken) + ":" +
                    Integer.toString(lastToken);
        }
    }

    /****************************************************************
     */
    public static List buildCorefList(String input) {

        ArrayList corefs = new ArrayList();
        return corefs;
    }

    /****************************************************************
     */
    public static String first100(StringBuffer input) {

        if (input.length() > 100)
            return input.toString().substring(0,100) + "...";
        else
            return input.toString();
    }

    /****************************************************************
     * convenience method to convert a set of corefs into a map that
     * can then be input to @see printCorefList()
     */
    public static HashMap toMap(HashSet cs) {

        HashMap sorted = new HashMap<>();
        for (Coref c : cs) {
            sorted.put(c.ID, c);
        }
        return sorted;
    }

    /****************************************************************
     */
    public static void printCorefList(HashMap cs) {

        TreeMap corefs = new TreeMap<>();
        corefs.putAll(cs);
        for (Integer i : corefs.keySet()) {
            Coref c = corefs.get(i);
            System.out.println(c);
        }
    }

    /****************************************************************
     */
    public void printStanfordCorefList(Map graph) {

        for (CorefChain cc : graph.values()) {
            List mentions = cc.getMentionsInTextualOrder();
            if (mentions.size() > 1) {
                for (CorefChain.CorefMention ment : mentions) {
                    Coref c = new Coref();
                    c.ID = ment.mentionID;
                    c.token = ment.mentionSpan;
                    HashMap info = new HashMap<>();
                    c.sentNum = ment.sentNum;
                    c.firstToken = ment.headIndex;
                    int lastToken;
                    //System.out.println(ment.sentNum + " : " + ment.headIndex + " : " + ment.mentionSpan);
                    System.out.println(ment.sentNum + " : " + ment.startIndex + " : " + ment.mentionSpan);
                }
                System.out.println();
            }
        }
    }

    /****************************************************************
     */
    public static String listToString(List input) {

        StringBuffer sb = new StringBuffer();
        for (String s : input)
            sb.append(s + " ");
        return sb.toString();
    }

    /****************************************************************
     * Use the Stanford sentence tokenizer to convert the input to a list
     * of Strings with one sentence per string
     */
    public static List toSentences(String input) {

        List results = new ArrayList();
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        props.setProperty("tokenize.options", "ptb3Escaping=false");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document1 = new Annotation(input);
        pipeline.annotate(document1);
        List sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            //System.out.println(sentence);
            results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
        }
        return results;
    }

    /****************************************************************
     * Convert Stanford corefs into MUC-style coreference chains.
     */
    public HashMap> stanfordToCoref(Annotation document) {

        HashMap> result = new HashMap<>();
        int ID = 0;
        Map graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        for (CorefChain cc : graph.values()) {
            List mentions = cc.getMentionsInTextualOrder();
            if (mentions.size() > 1) {
                HashSet newchain = new HashSet<>();
                for (CorefChain.CorefMention ment : mentions) {
                    Coref c = new Coref();
                    c.ID = ment.mentionID;
                    c.token = ment.mentionSpan;
                    HashMap info = new HashMap<>();
                    c.sentNum = ment.sentNum;
                    //c.firstToken = ment.headIndex;
                    c.firstToken = ment.startIndex;
                    int lastToken;
                    newchain.add(c);
                }
                result.put(ID,newchain);
                ID++;
            }
        }
        return result;
    }

    /****************************************************************
     * @return a Stanford pipeline
     */
    public StanfordCoreNLP initPipeline() {

        Properties props = new Properties();
        //props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, entitymentions, parse, depparse, hcoref");
        props.setProperty("tokenize.options", "ptb3Escaping=false");
        //String[] configFileProp = {"-props","/home/apease/Programs/stanford-corenlp-full-2015-04-20/CoreNLP/build/resources/main/edu/stanford/nlp/hcoref/properties/coref-default-dep.properties"};
        String[] configFileProp = {"-props",System.getenv("COREF")};
        props.putAll(StringUtils.argsToPropertiesWithResolve(configFileProp));
        System.out.println("MUC.toCoref(): before initialized pipeline");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        System.out.println("MUC.toCoref(): initialized pipeline");
        return pipeline;
    }

    /****************************************************************
     * @return a list of sentences with tokens
     */
    public ArrayList> toCoref(String input) {

        //System.out.println("INFO in MUC.toCoref(): " + input);
        //System.out.println("INFO in MUC.toCoref(): " + input);
        List corefs = buildCorefList(input);
        ArrayList> results = new ArrayList>();
        StanfordCoreNLP pipeline = initPipeline();
        document2 = new Annotation(input);
        System.out.println("MUC.toCoref(): after annotation");
        try {
            pipeline.annotate(document2);
            //HybridCorefAnnotator hcoref = new HybridCorefAnnotator(props);
            //hcoref.annotate(document);
        }
        catch (Exception e) {
            System.out.println("input: " + input);
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
        List sentences = document2.get(CoreAnnotations.SentencesAnnotation.class);
        //SentenceUtil.printCorefChain(document);
        System.out.println("Stanford corefs: ");
        Map graph = document2.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        printStanfordCorefList(graph);

        for (CoreMap sentence : sentences) {
            //System.out.println(sentence);
            ArrayList tokenList = new ArrayList<>();
            //results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
            List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            for (CoreLabel t : tokens) {
                String t2 = t.toString();
                if (t2.startsWith("-LRB-"))
                    t2 = t2.replace("-LRB-", "(");
                if (t2.startsWith("-RRB-"))
                    t2 = t2.replace("-RRB-", ")");
                if (t2.startsWith("``"))
                    t2 = t2.replace("``", "\"");
                if (t2.startsWith("''"))
                    t2 = t2.replace("''", "\"");
                // -LCB-,  -RCB-, ???
                System.out.print(t2 + " ");
                tokenList.add(t2);
            }
            results.add(tokenList);
            System.out.println();
        }
        return results;
    }

    /****************************************************************
     */
    public static List getDocuments(String filename) {

        List lines = new ArrayList();
        System.out.println("INFO in MUC.cleanSGML(): Reading files");
        LineNumberReader lr = null;
        try {
            String line;
            StringBuffer doc = new StringBuffer();
            File nounFile = new File(filename);
            if (nounFile == null) {
                System.out.println("Error in MUC.cleanSGML(): The file does not exist ");
                return lines;
            }
            long t1 = System.currentTimeMillis();
            FileReader r = new FileReader(nounFile);
            lr = new LineNumberReader(r);
            while ((line = lr.readLine()) != null) {
                if (lr.getLineNumber() % 1000 == 0)
                    System.out.print('.');
                line = line.trim();
                line = line.replace("", "");
                line = line.replace("", ".");
                line = line.replace("", "");
                line = line.replace("", ".");
                line = line.replace("", "");
                line = line.replace("", "");
                line = line.replace("----", ".");
                line = line.replaceAll("^\\@", "");
                if (line.contains("")) {
                    lines.add(doc.toString());
                    doc = new StringBuffer();
                }
                else if (line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("")) {
                }
                else
                    doc.append(line + " ");
            }
        }
        catch (Exception ex) {
            ex.printStackTrace();
        }
        finally {
            try {
                if (lr != null) {
                    lr.close();
                }
            }
            catch (Exception ex) {
            }
        }
        return lines;
    }

    /****************************************************************
     */
    public static List cleanSGML(String filename) {

        List lines = new ArrayList();
        System.out.println("INFO in MUC.cleanSGML(): Reading files");
        LineNumberReader lr = null;
        try {
            String line;
            File nounFile = new File(filename);
            if (nounFile == null) {
                System.out.println("Error in MUC.cleanSGML(): The file does not exist ");
                return lines;
            }
            long t1 = System.currentTimeMillis();
            FileReader r = new FileReader(nounFile);
            lr = new LineNumberReader(r);
            while ((line = lr.readLine()) != null) {
                if (lr.getLineNumber() % 1000 == 0)
                    System.out.print('.');
                line = line.trim();
                line = line.replaceAll("<[^>]+>", "");
                line = line.replaceAll("<[^>]+$", "");
                line = line.replaceAll("^[^>]+>", "");
                lines.add(line);
            }
        }
        catch (Exception ex) {
            ex.printStackTrace();
        }
        finally {
            try {
                if (lr != null) {
                    lr.close();
                }
            }
            catch (Exception ex) {
            }
        }
        return lines;
    }

    /****************************************************************
     * Modify @param sb to remove the characters in @param token from
     * its starting characters.
     * @return true if the token was found
     */
    private static boolean removeToken(StringBuffer sb, String token) {

        //System.out.println("removeToken() remove '" + token + "'");
        //System.out.println("removeToken() before: " + first100(sb));
        if (sb == null || sb.length() < 1) {
            System.out.println("Error in removeToken() - null string with token: " + token);
            return false;
        }
        while (Character.isWhitespace(sb.toString().charAt(0)))
            sb.deleteCharAt(0);
        if (sb.toString().startsWith(token)) {
            sb.delete(0, token.length());
        }
        else {
            System.out.println("Error in removeToken() - no match for '" + token + "' in " + first100(sb));
            return false;
        }
        //System.out.println("after: " + first100(sb));
        return true;
    }

    /***************************************************************
     */
    private static void expandCurrentToken(String token,
                                           Stack currentCoref,
                                           HashMap corefTokens) {

        Integer id = currentCoref.peek();
        String tok = corefTokens.get(id);
        if (tok.isEmpty())
            corefTokens.put(id, token);
        else
            corefTokens.put(id, tok + " " + token);
    }

    /****************************************************************
     */
    private static void leadingTrim(StringBuffer sb) {

        if (sb == null || sb.length() > 1)
            return;
        while (sb.length() > 0 && Character.isWhitespace(sb.toString().charAt(0)))
            sb.deleteCharAt(0);
    }

    /****************************************************************
     */
    private static String processOneParamString(String paramlist, Coref c) {

        //System.out.println("processOneParamString(): " + paramlist);
        int space = paramlist.indexOf(' ');
        int equals = paramlist.indexOf('=');
        int quote1 = paramlist.indexOf('"');
        int offset = quote1;
        int index = quote1 + 1;
        while (index < paramlist.length() &&
                (paramlist.charAt(index) != '"' || (index > 0 && paramlist.charAt(index - 1) == '\\'))) {
            index++;
        }
        int quote2 = paramlist.indexOf('"', index);
        String key = paramlist.substring(space + 1, equals);

        String value = paramlist.substring(quote1 + 1, quote2);
        //System.out.println(value);
        if (key.equals("REF"))
            c.ref = Integer.parseInt(value);
        c.info.put(key, value);
        paramlist = paramlist.substring(quote2 + 1);
        //System.out.println("result of processOneParamString(): " + c);
        return paramlist;
    }

    /****************************************************************
     */
    private static void processParams(Coref c, HashMap corefParams) {

        String paramlist = corefParams.get(c.ID);
        //System.out.println("processParams(): " + paramlist);
        while (paramlist.indexOf('"') > -1) {
            paramlist = processOneParamString(paramlist, c);
        }
    }

    /***************************************************************
     * @return the String content of a COREF tag.
     * Destructively modifies sb to remove the tag
     */
    private static String getTag(StringBuffer sb, Matcher m) {

        if (sb.indexOf("<") > -1)
            sb.delete(0, sb.indexOf("<"));
        String tag = sb.toString().substring(0, sb.indexOf(">") + 1);
        sb.delete(0, sb.indexOf(">") + 1);
        return tag;
    }

    /****************************************************************
     * Build chains of coreferences based on their pairwise references
     */
    private static HashMap> buildChains(HashMap corefs) {

        HashMap chainMap = new HashMap<>(); // coref id to chain id
        HashMap> chains = new HashMap<>(); // chain id to members
        int chainNum = 0;
        boolean first = true;
        Coref firstC = null;
        for (Integer i : corefs.keySet()) {
            Coref c = corefs.get(i);
            if (first) {
                firstC = c;
                first = false;
            }
            if (!chainMap.keySet().contains(c.ID)) {
                if (c.ref == -1) {
                    chainMap.put(c.ID, chainNum);
                    HashSet chain = new HashSet<>();
                    chain.add(c);
                    c.chainID = chainNum;
                    chains.put(chainNum, chain);
                    chainNum++;
                }
                else if (!chainMap.keySet().contains(c.ref)) {
                    chainMap.put(c.ID, chainNum);
                    chainMap.put(c.ref, chainNum);
                    HashSet chain = new HashSet<>();
                    c.chainID = chainNum;
                    chain.add(c);
                    Coref cref = corefs.get(c.ref);
                    if (cref != null)
                        chain.add(cref);
                    else
                        System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref);
                    chains.put(chainNum, chain);
                    chainNum++;
                }
                else {
                    int ref = chainMap.get(c.ref);
                    chains.get(ref).add(c);
                    chainMap.put(c.ID, ref);
                }
            }
            else {
                if (c.ref == -1) {
                } // no reference so do nothing
                else if (!chainMap.keySet().contains(c.ref)) {
                    chainMap.put(c.ID, chainNum);
                    chainMap.put(c.ref, chainNum);
                    HashSet chain = new HashSet<>();
                    c.chainID = chainNum;
                    chain.add(c);
                    Coref cref = corefs.get(c.ref);
                    if (cref != null)
                        chain.add(cref);
                    else
                        System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref);
                    chain.add(cref);
                    chains.put(chainNum, chain);
                    chainNum++;
                }
                else {
                    int ref = chainMap.get(c.ref);
                    chains.get(ref).add(c);
                }
            }
        }
        return chains;
    }

    /****************************************************************
     * Strip a Stanford token number suffix from the token
     */
    private static String stripTokenNum(String t) {

        if (t.lastIndexOf("-") < 0)
            return t;
        return t.substring(0,t.lastIndexOf("-"));
    }

    /****************************************************************
     * Get the Stanford token number suffix from the token
     */
    private static String getTokenNum(String t) {

        if (t.lastIndexOf("-") < 0)
            return t;
        return t.substring(t.lastIndexOf("-") + 1);
    }

    /****************************************************************
     * Trim punctuation
     */
    private static String trimPunc(String t) {

        boolean changed = true;
        while (changed) {
            if (t.charAt(t.length() -1) == ' ') {
                t = t.substring(0,t.length() -1);
                changed = true;
            }
            else if (t.charAt(t.length() -1) == ',') {
                t = t.substring(0,t.length() -1);
                changed = true;
            }
            else if (t.endsWith(" 's")) {
                t = t.substring(0,t.length() - 3) + t.substring(t.length() -2,t.length());
                changed = true;
            }
            else if (t.endsWith("  .")) {
                t = t.substring(0,t.length() - 2);
                changed = true;
            }
            else
                changed = false;
        }
        return t;
    }

    /****************************************************************
     */
    private static void printChains(HashMap> corefs) {

        for (Integer i : corefs.keySet()) {
            HashSet cs = corefs.get(i);
            printCorefList(toMap(cs));
            System.out.println();
        }
    }

    /****************************************************************
     */
    private static boolean find(Coref c, HashMap> chains) {

        for (int i : chains.keySet()) {
            HashSet chain = chains.get(i);
            for (Coref c2 : chain) {
                if (c2.sentNum == c.sentNum && c2.firstToken == c.firstToken)
                    return true;
                if (c2.info.containsKey("MIN") &&
                        (trimPunc(c2.info.get("MIN")).equals(trimPunc(c.token)) ||
                                (trimPunc(c.token)).contains(trimPunc(c2.info.get("MIN")))) )
                    return true;
                if (c.info.containsKey("MIN") &&
                        (trimPunc(c.info.get("MIN")).equals(trimPunc(c2.token)) ||
                                (trimPunc(c2.token)).contains(trimPunc(c.info.get("MIN")))) )
                    return true;
            }
        }
        return false;
    }

    /****************************************************************
     * A kludge to handle the fact that MUC sometimes splits tokens
     * that are hyphenated.  So we pre-split all hyphenated tokens
     * into several tokens that share the same token number.
     */
    private static ArrayList splitTokens(ArrayList tokens) {

        ArrayList result = new ArrayList();
        for (String t : tokens) {
            if (stripTokenNum(t).indexOf('-') > -1) {
                String num = getTokenNum(t);
                String[] split = t.split("-");
                for (int i = 0; i < split.length - 1 ; i++)
                    result.add(split[i] + "-" + num);
            }
            else {
                result.add(t);
            }
        }
        return result;
    }

    /****************************************************************
     */
    private static TreeMap> sortTotals(HashMap missed) {

        TreeMap> commonMissed = new TreeMap<>();
        for (String s : missed.keySet()) {
            Integer i = missed.get(s);
            if (commonMissed.containsKey(i)) {
                ArrayList al = commonMissed.get(i);
                al.add(s);
            }
            else {
                ArrayList al = new ArrayList<>();
                al.add(s);
                commonMissed.put(i, al);
            }
        }
        return commonMissed;
    }

    /****************************************************************
     */
    private static void printTopN(TreeMap> map, int n) {

        int index = 0;
        Iterator it = map.keySet().iterator();
        while (index < n && index < map.keySet().size()) {
            Integer key = it.next();
            ArrayList al = map.get(key);
            System.out.println(key + " : " + al);
            index++;
        }
    }

    /** ***************************************************************
     * Compare Stanford and MUC coreference chains.  Create a map for
     * each token in MUC whether it is in Stanford and in which chain
     * ID and which Stanford token it corresponds to. Use that map to
     * score which tokens are not found (errors of omission, or false
     * negatives).  Mark the tokens that are found in both MUC and
     * Stanford in a separate map. Use that map to create a third map
     * of tokens that are in Stanford but not in MUC.
     * @param chains are the MUC chains
     * @param document contains the Stanford chains
     */
    public void compareChains(HashMap> chains,
                              Annotation document) {

        HashMap stanfordNotMUC = new HashMap<>();
        HashMap MUCNotStanford = new HashMap<>();

        int thisStanford = 0;
        HashMap> stanfordChains = stanfordToCoref(document);
        for (int i : stanfordChains.keySet()) {
            HashSet chain = chains.get(i);
            if (chain != null) {
                for (Coref c : chain) {
                    totalStanford++;
                    thisStanford++;
                    boolean found = find(c, chains);
                    if (!found)
                        stanfordNotMUC.put(c.ID, c);
                }
            }
        }
        System.out.println("Stanford not MUC: " + (stanfordNotMUC.keySet().size() + "/" + thisStanford));
        falsePositive = falsePositive + stanfordNotMUC.keySet().size();
        printCorefList(stanfordNotMUC);

        int thisMUC = 0;
        for (int i : chains.keySet()) {
            HashSet chain = chains.get(i);
            for (Coref c : chain) {
                totalMUC++;
                thisMUC++;
                boolean found = find(c, stanfordChains);
                if (!found) {
                    MUCNotStanford.put(c.ID, c);
                    if (!missedRefs.containsKey(c.token))
                        missedRefs.put(c.token,0);
                    else {
                        Integer counter = missedRefs.get(c.token) + 1;
                        missedRefs.put(c.token, counter);
                    }
                }
            }
        }
        System.out.println("MUC not Stanford  : " + (MUCNotStanford.keySet().size() + "/" + thisMUC));
        falseNegative = falseNegative + MUCNotStanford.keySet().size();
        printCorefList(MUCNotStanford);
    }

    /** ***************************************************************
     * Pick tokens off the input sentence string, capturing corefXML
     * when present and aligning the corefXML with token numbers
     */
    public void makeCorefList(String sentsDirty,
                              ArrayList> tokenized) {

        StringBuffer sb = new StringBuffer(sentsDirty);
        HashMap corefs = new HashMap<>();
        HashMap corefTokens = new HashMap<>();
        HashMap corefParams = new HashMap<>();
        HashMap references = new HashMap<>();
        Stack currentCoref = new Stack<>();
        Pattern p1 = Pattern.compile("^\\s*(]+>)");
        Pattern p2 = Pattern.compile("^\\s*()");
        int sentNum = 0;
        int level = 0;
        int tokenNum = 0;
        int firstToken = 0;
        boolean openTag = false;
        boolean skipping = false;
        String tag = "";
        while (sb.length() > 0) {
            if (sentNum > tokenized.size() - 1) {
                System.out.println("Error in MUC.makeCorefList(): no tokenized sentence for: " + sb);
                break;
            }
            ArrayList tokens = tokenized.get(sentNum);
            tokens = splitTokens(tokens);
            //System.out.println("Num tokens: " + tokens.size());
            sentNum++;
            String lastToken = "";
            for (String t : tokens) {
                String tokenNumStr = getTokenNum(t);
                tokenNum = Integer.parseInt(tokenNumStr);
                String token = stripTokenNum(t);
                boolean tokenMatches = false;
                skipping = false;
                while (!tokenMatches && !skipping) {
                    //System.out.println("Token: " + token + " Last token: " + lastToken);
                    //System.out.println("sb: " + first100(sb));
                    Matcher m1 = p1.matcher(sb.toString());
                    Matcher m2 = p2.matcher(sb.toString());
                    if (token.length() > 0 && sb.length() > 0 &&
                            token.charAt(0) != '-' && sb.charAt(0) == '-')
                        sb.delete(0, 1);
                    else if (token.lastIndexOf('-') == 0)
                        tokenMatches = true;
                    else if (token.length() > 0 && sb.length() > 0 && token.charAt(0) == sb.charAt(0) &&
                            token.charAt(0) == '\'' && sb.charAt(0) == '\'' ) {
                        sb.delete(0, 1);
                        token = token.substring(1);
                        System.out.println("altered Token: " + token);
                        System.out.println("altered sb: " + first100(sb));
                    }
                    // Stanford can insert an extra period if the last token in a sentence is an abbreviation
                    else if (m1.find()) {
                        tag = getTag(sb, m1);
                        level++;
                        int quoteIndex = tag.indexOf("\"");
                        String id = tag.substring(quoteIndex + 1, tag.indexOf("\"", quoteIndex + 1));
                        currentCoref.push(Integer.parseInt(id));
                        corefTokens.put(Integer.parseInt(id), "");
                        corefParams.put(Integer.parseInt(id), tag);
                        int refIndex = tag.indexOf("REF=");
                        if (refIndex > -1) {
                            int refQuoteIndex = tag.indexOf("\"", refIndex + 1);
                            String ref = tag.substring(refQuoteIndex + 1, tag.indexOf("\"", refQuoteIndex + 1));
                            references.put(Integer.parseInt(id), Integer.parseInt(ref));
                        }
                        openTag = true;
                    }
                    else if (m2.find()) {
                        if (sb.indexOf("<") > -1)
                            sb.delete(0, sb.indexOf("<"));
                        sb.delete(0, sb.indexOf(">") + 1);
                        if (currentCoref.size() < 1) {
                            System.out.println("Error in MUC.makeCorefList(): no open tag for close tag\n" + first100(sb));
                            return;
                        }
                        Integer cid = currentCoref.pop();
                        Coref c = new Coref();
                        c.ID = cid;
                        c.token = trimPunc(corefTokens.get(cid));
                        c.firstToken = firstToken;
                        c.lastToken = tokenNum;
                        processParams(c, corefParams);
                        c.sentNum = sentNum;
                        corefs.put(c.ID, c);
                        level--;
                    }
                    else if (stripTokenNum(t).equals(".") && stripTokenNum(lastToken).endsWith(".") &&
                            !sb.toString().matches("^\\s*\\..*")) {
                        System.out.println("makeCorefList() Skipping token removal: " + t);
                        System.out.println(first100(sb));
                        skipping = true;
                        continue;
                    }
                    else {
                        if (openTag)
                            firstToken = tokenNum;
                        lastToken = token;
                        leadingTrim(sb);
                        tokenMatches = removeToken(sb, token);
                        if (level > 0)
                            expandCurrentToken(token, currentCoref, corefTokens);
                        openTag = false;
                    }
                }
            }
        }
        HashMap> chains = buildChains(corefs);
        printChains(chains);
        compareChains(chains, document2);
    }

    /** ***************************************************************
     */
    public static void testParamString() {

        MUC muc = new MUC();
        String paramstring = "";
        Coref c = muc.new Coref();
        paramstring = processOneParamString(paramstring,c);
        paramstring = processOneParamString(paramstring,c);
        paramstring = processOneParamString(paramstring,c);
        paramstring = processOneParamString(paramstring,c);
    }

    /** ***************************************************************
     */
    public static void testRemoveToken() {

        MUC muc = new MUC();
        String token = "Corp.";
        StringBuffer sb = new StringBuffer("Corp. He also served for 10 years as chairman and chief executive of Paramount Pictures Corp., a unit of Paramount Communications Inc. Arrow Investments Inc., a corporation controlled by Mr. Diller, in December agreed to purchase $25 million of QVC stock in a privately negotiated transaction.");
        Coref c = muc.new Coref();
        removeToken(sb,token);
        System.out.println("MUC.testRemoveToken: " + sb);
    }

    /** ***************************************************************
     */
    public static void testWhitespace() {

        String paramstring = "    .   By Patrick M. Reilly    sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    }

    /** ***************************************************************
     */
    public void testParallelPipeline() {

        try (BufferedReader br = new BufferedReader(new InputStreamReader(System.in))) {
            Properties preprocessprops = new Properties();
            preprocessprops.setProperty("annotators", "tokenize, ssplit,pos,lemma, ner, parse");
            preprocessprops.setProperty("tokenize.options", "ptb3Escaping=false");
            StanfordCoreNLP preprocesspipeline = new StanfordCoreNLP(preprocessprops);

            Properties corefprops = new Properties();
            corefprops.setProperty("annotators", "dcoref");
            //corefprops.setProperty("annotators", "hcoref");
            corefprops.setProperty("tokenize.options", "ptb3Escaping=false");
            corefprops.setProperty("enforceRequirements","false");
            StanfordCoreNLP corefpipeline = new StanfordCoreNLP(corefprops);

            List coreMaps= Lists.newArrayList();
            String input;
            while ((input = br.readLine()) != null) {
                Annotation document = new Annotation(input);
                preprocesspipeline.annotate(document);
                List newcoreMaps = document.get(CoreAnnotations.SentencesAnnotation.class);
                coreMaps.addAll(newcoreMaps);
                System.out.println("Stanford corefs: ");
                Annotation wholeDocument=new Annotation(coreMaps);
                corefpipeline.annotate(wholeDocument);

                Map graph = wholeDocument.get(CorefCoreAnnotations.CorefChainAnnotation.class);
                printStanfordCorefList(graph);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     */
    public void testMUC() {

        //List lines = cleanSGML("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
        //List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
        List lines = getDocuments(System.getenv("MUCCORPUS") + File.separator + "formal-tst.CO.key.cleanup.09jul96");
        //List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/Wash.txt");
        //List lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/891101-0056.co.v0.sgm" + "");
        for (String s : lines) {
            String cleanedInput = s.replaceAll("]+>", "");
            cleanedInput = cleanedInput.replace("","");
            List sentsClean = toSentences(cleanedInput);
            List sentsDirty = toSentences(s);
            System.out.println("\n\nMUC markup: " + sentsDirty);
            String allClean = listToString(sentsClean);
            ArrayList> tokenized = toCoref(listToString(sentsClean));
            makeCorefList(s, tokenized);
        }
        System.out.println("False positive rate: " + (falsePositive + "/" + totalStanford));
        System.out.println("False negative rate: " + (falseNegative + "/" + totalMUC));
        System.out.println("Most common missed corefs: ");
        printTopN(sortTotals(missedRefs), 20);
    }

    /** ***************************************************************
     */
    public static void main(String[] args) {

        //testWhitespace();
        MUC muc = new MUC();
        muc.initPipeline();
        //muc.toCoref("Bob likes to eat.  He is big");
        muc.testMUC();
    }
}