package semRewrite.substitutor;

import com.google.common.collect.Lists;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationSerializer;
import edu.stanford.nlp.pipeline.GenericAnnotationSerializer;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import edu.stanford.nlp.dcoref.CorefChain;
//import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
//import edu.stanford.nlp.hcoref.CorefSystem;
//import edu.stanford.nlp.hcoref.data.CorefChain;
//import edu.stanford.nlp.hcoref.data.CorefChain.CorefMention;
//import edu.stanford.nlp.hcoref.data.Document;

/*
 * Copyright 2014-2015 IPsoft
 *
 * Author: Adam Pease [email protected]
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program ; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA  02111-1307 USA
 *
 * Processing to handle the MUC-6 dataset for coreference and compare
 * it to Stanford's CoreNLP coreference results
 */
public class MUC {

    private int totalStanford = 0;
    private int totalMUC = 0;
    private int falsePositive = 0;
    private int falseNegative = 0;
    public static Annotation document2 = null;
    public static HashSet<Coref> stanfordCorefs = new HashSet<>();
    private HashMap<String, Integer> missedRefs = new HashMap<>();

    /****************************************************************
     */
    public class Coref {

        int ID;
        String token;
        int ref = -1;
        HashMap<String, String> info = new HashMap<>();
        int sentNum;
        int firstToken;
        int lastToken;
        int chainID;

        public String toString() {
            return Integer.toString(ID) + ":" +
                    token + ":" +
                    info + ":" +
                    Integer.toString(sentNum) + ":" +
                    Integer.toString(firstToken) + ":" +
                    Integer.toString(lastToken);
        }
    }

    /****************************************************************
     */
    public static List<Coref> buildCorefList(String input) {

        ArrayList<Coref> corefs = new ArrayList<>();
        return corefs;
    }
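    // NOTE: buildCorefList() is currently a stub; callers receive an empty list.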

    /****************************************************************
     */
    public static String first100(StringBuffer input) {

        if (input.length() > 100)
            return input.toString().substring(0,100) + "...";
        else
            return input.toString();
    }

    /****************************************************************
     * convenience method to convert a set of corefs into a map that
     * can then be input to @see printCorefList()
     */
    public static HashMap<Integer, Coref> toMap(HashSet<Coref> cs) {

        HashMap<Integer, Coref> sorted = new HashMap<>();
        for (Coref c : cs) {
            sorted.put(c.ID, c);
        }
        return sorted;
    }
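    // Typical use (illustrative): printCorefList(toMap(chain)) prints one
    // chain's mentions ordered by coref ID, as printChains() does below.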

    /****************************************************************
     */
    public static void printCorefList(HashMap<Integer, Coref> cs) {

        TreeMap<Integer, Coref> corefs = new TreeMap<>();
        corefs.putAll(cs);
        for (Integer i : corefs.keySet()) {
            Coref c = corefs.get(i);
            System.out.println(c);
        }
    }

    /****************************************************************
     */
    public void printStanfordCorefList(Map<Integer, CorefChain> graph) {

        for (CorefChain cc : graph.values()) {
            List<CorefChain.CorefMention> mentions = cc.getMentionsInTextualOrder();
            if (mentions.size() > 1) {
                for (CorefChain.CorefMention ment : mentions) {
                    Coref c = new Coref();
                    c.ID = ment.mentionID;
                    c.token = ment.mentionSpan;
                    HashMap<String, String> info = new HashMap<>();
                    c.sentNum = ment.sentNum;
                    c.firstToken = ment.headIndex;
                    int lastToken;
                    //System.out.println(ment.sentNum + " : " + ment.headIndex + " : " + ment.mentionSpan);
                    System.out.println(ment.sentNum + " : " + ment.startIndex + " : " + ment.mentionSpan);
                }
                System.out.println();
            }
        }
    }

    /****************************************************************
     */
    public static String listToString(List<String> input) {

        StringBuffer sb = new StringBuffer();
        for (String s : input)
            sb.append(s + " ");
        return sb.toString();
    }
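    // Note the trailing space: listToString(Arrays.asList("a", "b")) returns "a b ".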

    /****************************************************************
     * Use the Stanford sentence tokenizer to convert the input to a list
     * of Strings with one sentence per string
     */
    public static List<String> toSentences(String input) {

        List<String> results = new ArrayList<>();
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        props.setProperty("tokenize.options", "ptb3Escaping=false");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document1 = new Annotation(input);
        pipeline.annotate(document1);
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            //System.out.println(sentence);
            results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
        }
        return results;
    }
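    // Example (illustrative): toSentences("Bob runs. He is fast.") returns
    // ["Bob runs.", "He is fast."]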

    /****************************************************************
     * Convert Stanford corefs into MUC-style coreference chains.
     */
    public HashMap<Integer, HashSet<Coref>> stanfordToCoref(Annotation document) {

        HashMap<Integer, HashSet<Coref>> result = new HashMap<>();
        int ID = 0;
        Map<Integer, CorefChain> graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        for (CorefChain cc : graph.values()) {
            List<CorefChain.CorefMention> mentions = cc.getMentionsInTextualOrder();
            if (mentions.size() > 1) {
                HashSet<Coref> newchain = new HashSet<>();
                for (CorefChain.CorefMention ment : mentions) {
                    Coref c = new Coref();
                    c.ID = ment.mentionID;
                    c.token = ment.mentionSpan;
                    HashMap<String, String> info = new HashMap<>();
                    c.sentNum = ment.sentNum;
                    //c.firstToken = ment.headIndex;
                    c.firstToken = ment.startIndex;
                    int lastToken;
                    newchain.add(c);
                }
                result.put(ID,newchain);
                ID++;
            }
        }
        return result;
    }
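    // The result maps a chain number to that chain's mentions, e.g.
    // {0=[12:Mr. Smith:...], 1=[...]} (illustrative); single-mention
    // chains are dropped by the size() > 1 test above.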

    /****************************************************************
     * @return a Stanford pipeline
     */
    public StanfordCoreNLP initPipeline() {

        Properties props = new Properties();
        //props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, entitymentions, parse, depparse, hcoref");
        props.setProperty("tokenize.options", "ptb3Escaping=false");
        //String[] configFileProp = {"-props","/home/apease/Programs/stanford-corenlp-full-2015-04-20/CoreNLP/build/resources/main/edu/stanford/nlp/hcoref/properties/coref-default-dep.properties"};
        String[] configFileProp = {"-props",System.getenv("COREF")};
        props.putAll(StringUtils.argsToPropertiesWithResolve(configFileProp));
        System.out.println("MUC.toCoref(): before initialized pipeline");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        System.out.println("MUC.toCoref(): initialized pipeline");
        return pipeline;
    }
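    // Note: the COREF environment variable must point to a Stanford hcoref
    // .properties configuration file (see the commented-out example path above).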

    /****************************************************************
     * @return a list of sentences with tokens
     */
    public ArrayList<ArrayList<String>> toCoref(String input) {

        //System.out.println("INFO in MUC.toCoref(): " + input);
        //System.out.println("INFO in MUC.toCoref(): " + input);
        List corefs = buildCorefList(input);
        ArrayList> results = new ArrayList>();
        StanfordCoreNLP pipeline = initPipeline();
        document2 = new Annotation(input);
        System.out.println("MUC.toCoref(): after annotation");
        try {
            pipeline.annotate(document2);
            //HybridCorefAnnotator hcoref = new HybridCorefAnnotator(props);
            //hcoref.annotate(document);
        }
        catch (Exception e) {
            System.out.println("input: " + input);
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
        List<CoreMap> sentences = document2.get(CoreAnnotations.SentencesAnnotation.class);
        //SentenceUtil.printCorefChain(document);
        System.out.println("Stanford corefs: ");
        Map<Integer, CorefChain> graph = document2.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        printStanfordCorefList(graph);

        for (CoreMap sentence : sentences) {
            //System.out.println(sentence);
            ArrayList<String> tokenList = new ArrayList<>();
            //results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            for (CoreLabel t : tokens) {
                String t2 = t.toString();
                if (t2.startsWith("-LRB-"))
                    t2 = t2.replace("-LRB-", "(");
                if (t2.startsWith("-RRB-"))
                    t2 = t2.replace("-RRB-", ")");
                if (t2.startsWith("``"))
                    t2 = t2.replace("``", "\"");
                if (t2.startsWith("''"))
                    t2 = t2.replace("''", "\"");
                // -LCB-,  -RCB-, ???
                System.out.print(t2 + " ");
                tokenList.add(t2);
            }
            results.add(tokenList);
            System.out.println();
        }
        return results;
    }
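    // Example (illustrative): toCoref("Bob likes pie.") returns
    // [["Bob-1", "likes-2", "pie-3", ".-4"]]; CoreLabel.toString() appends a
    // token index, which getTokenNum()/stripTokenNum() parse later.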

    /****************************************************************
     */
    public static List<String> getDocuments(String filename) {

        List<String> lines = new ArrayList<>();
        System.out.println("INFO in MUC.getDocuments(): Reading files");
        LineNumberReader lr = null;
        try {
            String line;
            StringBuffer doc = new StringBuffer();
            File nounFile = new File(filename);
            if (!nounFile.exists()) {
                System.out.println("Error in MUC.getDocuments(): The file does not exist: " + filename);
                return lines;
            }
            long t1 = System.currentTimeMillis();
            FileReader r = new FileReader(nounFile);
            lr = new LineNumberReader(r);
            while ((line = lr.readLine()) != null) {
                if (lr.getLineNumber() % 1000 == 0)
                    System.out.print('.');
                line = line.trim();
                line = line.replace("", "");
                line = line.replace("", ".");
                line = line.replace("", "");
                line = line.replace("", ".");
                line = line.replace("", "");
                line = line.replace("", "");
                line = line.replace("----", ".");
                line = line.replaceAll("^\\@", "");
                if (line.contains("")) {
                    lines.add(doc.toString());
                    doc = new StringBuffer();
                }
                else if (line.contains("") ||
                        line.contains("") ||
                        line.contains("") ||
                        line.contains("

") || line.contains("

") || line.contains("
") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("") || line.contains("")) { } else doc.append(line + " "); } } catch (Exception ex) { ex.printStackTrace(); } finally { try { if (lr != null) { lr.close(); } } catch (Exception ex) { } } return lines; } /**************************************************************** */ public static List cleanSGML(String filename) { List lines = new ArrayList(); System.out.println("INFO in MUC.cleanSGML(): Reading files"); LineNumberReader lr = null; try { String line; File nounFile = new File(filename); if (nounFile == null) { System.out.println("Error in MUC.cleanSGML(): The file does not exist "); return lines; } long t1 = System.currentTimeMillis(); FileReader r = new FileReader(nounFile); lr = new LineNumberReader(r); while ((line = lr.readLine()) != null) { if (lr.getLineNumber() % 1000 == 0) System.out.print('.'); line = line.trim(); line = line.replaceAll("<[^>]+>", ""); line = line.replaceAll("<[^>]+$", ""); line = line.replaceAll("^[^>]+>", ""); lines.add(line); } } catch (Exception ex) { ex.printStackTrace(); } finally { try { if (lr != null) { lr.close(); } } catch (Exception ex) { } } return lines; } /**************************************************************** * Modify @param sb to remove the characters in @param token from * its starting characters. * @return true if the token was found */ private static boolean removeToken(StringBuffer sb, String token) { //System.out.println("removeToken() remove '" + token + "'"); //System.out.println("removeToken() before: " + first100(sb)); if (sb == null || sb.length() < 1) { System.out.println("Error in removeToken() - null string with token: " + token); return false; } while (Character.isWhitespace(sb.toString().charAt(0))) sb.deleteCharAt(0); if (sb.toString().startsWith(token)) { sb.delete(0, token.length()); } else { System.out.println("Error in removeToken() - no match for '" + token + "' in " + first100(sb)); return false; } //System.out.println("after: " + first100(sb)); return true; } /*************************************************************** */ private static void expandCurrentToken(String token, Stack currentCoref, HashMap corefTokens) { Integer id = currentCoref.peek(); String tok = corefTokens.get(id); if (tok.isEmpty()) corefTokens.put(id, token); else corefTokens.put(id, tok + " " + token); } /**************************************************************** */ private static void leadingTrim(StringBuffer sb) { if (sb == null || sb.length() > 1) return; while (sb.length() > 0 && Character.isWhitespace(sb.toString().charAt(0))) sb.deleteCharAt(0); } /**************************************************************** */ private static String processOneParamString(String paramlist, Coref c) { //System.out.println("processOneParamString(): " + paramlist); int space = paramlist.indexOf(' '); int equals = paramlist.indexOf('='); int quote1 = paramlist.indexOf('"'); int offset = quote1; int index = quote1 + 1; while (index < paramlist.length() && (paramlist.charAt(index) != '"' || (index > 0 && paramlist.charAt(index - 1) == '\\'))) { index++; } int quote2 = paramlist.indexOf('"', index); String key = paramlist.substring(space + 1, equals); String value = paramlist.substring(quote1 + 1, quote2); //System.out.println(value); if (key.equals("REF")) c.ref = Integer.parseInt(value); c.info.put(key, value); paramlist = paramlist.substring(quote2 + 1); 
//System.out.println("result of processOneParamString(): " + c); return paramlist; } /**************************************************************** */ private static void processParams(Coref c, HashMap corefParams) { String paramlist = corefParams.get(c.ID); //System.out.println("processParams(): " + paramlist); while (paramlist.indexOf('"') > -1) { paramlist = processOneParamString(paramlist, c); } } /*************************************************************** * @return the String content of a COREF tag. * Destructively modifies sb to remove the tag */ private static String getTag(StringBuffer sb, Matcher m) { if (sb.indexOf("<") > -1) sb.delete(0, sb.indexOf("<")); String tag = sb.toString().substring(0, sb.indexOf(">") + 1); sb.delete(0, sb.indexOf(">") + 1); return tag; } /**************************************************************** * Build chains of coreferences based on their pairwise references */ private static HashMap> buildChains(HashMap corefs) { HashMap chainMap = new HashMap<>(); // coref id to chain id HashMap> chains = new HashMap<>(); // chain id to members int chainNum = 0; boolean first = true; Coref firstC = null; for (Integer i : corefs.keySet()) { Coref c = corefs.get(i); if (first) { firstC = c; first = false; } if (!chainMap.keySet().contains(c.ID)) { if (c.ref == -1) { chainMap.put(c.ID, chainNum); HashSet chain = new HashSet<>(); chain.add(c); c.chainID = chainNum; chains.put(chainNum, chain); chainNum++; } else if (!chainMap.keySet().contains(c.ref)) { chainMap.put(c.ID, chainNum); chainMap.put(c.ref, chainNum); HashSet chain = new HashSet<>(); c.chainID = chainNum; chain.add(c); Coref cref = corefs.get(c.ref); if (cref != null) chain.add(cref); else System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref); chains.put(chainNum, chain); chainNum++; } else { int ref = chainMap.get(c.ref); chains.get(ref).add(c); chainMap.put(c.ID, ref); } } else { if (c.ref == -1) { } // no reference so do nothing else if (!chainMap.keySet().contains(c.ref)) { chainMap.put(c.ID, chainNum); chainMap.put(c.ref, chainNum); HashSet chain = new HashSet<>(); c.chainID = chainNum; chain.add(c); Coref cref = corefs.get(c.ref); if (cref != null) chain.add(cref); else System.out.println("Error in MUC.buildChains(): No coref for id: " + c.ref); chain.add(cref); chains.put(chainNum, chain); chainNum++; } else { int ref = chainMap.get(c.ref); chains.get(ref).add(c); } } } return chains; } /**************************************************************** * Strip a Stanford token number suffix from the token */ private static String stripTokenNum(String t) { if (t.lastIndexOf("-") < 0) return t; return t.substring(0,t.lastIndexOf("-")); } /**************************************************************** * Get the Stanford token number suffix from the token */ private static String getTokenNum(String t) { if (t.lastIndexOf("-") < 0) return t; return t.substring(t.lastIndexOf("-") + 1); } /**************************************************************** * Trim punctuation */ private static String trimPunc(String t) { boolean changed = true; while (changed) { if (t.charAt(t.length() -1) == ' ') { t = t.substring(0,t.length() -1); changed = true; } else if (t.charAt(t.length() -1) == ',') { t = t.substring(0,t.length() -1); changed = true; } else if (t.endsWith(" 's")) { t = t.substring(0,t.length() - 3) + t.substring(t.length() -2,t.length()); changed = true; } else if (t.endsWith(" .")) { t = t.substring(0,t.length() - 2); changed = true; } else changed = false; } 

    /****************************************************************
     */
    private static void printChains(HashMap<Integer, HashSet<Coref>> corefs) {

        for (Integer i : corefs.keySet()) {
            HashSet<Coref> cs = corefs.get(i);
            printCorefList(toMap(cs));
            System.out.println();
        }
    }

    /****************************************************************
     * @return true if @param c matches a mention in @param chains,
     * either by position or by MIN string overlap
     */
    private static boolean find(Coref c, HashMap<Integer, HashSet<Coref>> chains) {

        for (int i : chains.keySet()) {
            HashSet<Coref> chain = chains.get(i);
            for (Coref c2 : chain) {
                if (c2.sentNum == c.sentNum && c2.firstToken == c.firstToken)
                    return true;
                if (c2.info.containsKey("MIN") &&
                        (trimPunc(c2.info.get("MIN")).equals(trimPunc(c.token)) ||
                         trimPunc(c.token).contains(trimPunc(c2.info.get("MIN")))))
                    return true;
                if (c.info.containsKey("MIN") &&
                        (trimPunc(c.info.get("MIN")).equals(trimPunc(c2.token)) ||
                         trimPunc(c2.token).contains(trimPunc(c.info.get("MIN")))))
                    return true;
            }
        }
        return false;
    }

    /****************************************************************
     * A kludge to handle the fact that MUC sometimes splits tokens
     * that are hyphenated. So we pre-split all hyphenated tokens
     * into several tokens that share the same token number.
     */
    private static ArrayList<String> splitTokens(ArrayList<String> tokens) {

        ArrayList<String> result = new ArrayList<>();
        for (String t : tokens) {
            if (stripTokenNum(t).indexOf('-') > -1) {
                String num = getTokenNum(t);
                String[] split = t.split("-");
                for (int i = 0; i < split.length - 1; i++)
                    result.add(split[i] + "-" + num);
            }
            else {
                result.add(t);
            }
        }
        return result;
    }

    /****************************************************************
     * Invert the missed-coref counts: group tokens by how often they
     * were missed.
     */
    private static TreeMap<Integer, ArrayList<String>> sortTotals(HashMap<String, Integer> missed) {

        TreeMap<Integer, ArrayList<String>> commonMissed = new TreeMap<>();
        for (String s : missed.keySet()) {
            Integer i = missed.get(s);
            if (commonMissed.containsKey(i)) {
                ArrayList<String> al = commonMissed.get(i);
                al.add(s);
            }
            else {
                ArrayList<String> al = new ArrayList<>();
                al.add(s);
                commonMissed.put(i, al);
            }
        }
        return commonMissed;
    }

    /****************************************************************
     */
    private static void printTopN(TreeMap<Integer, ArrayList<String>> map, int n) {

        int index = 0;
        Iterator<Integer> it = map.keySet().iterator();
        while (index < n && index < map.keySet().size()) {
            Integer key = it.next();
            ArrayList<String> al = map.get(key);
            System.out.println(key + " : " + al);
            index++;
        }
    }
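
    // Example (illustrative): with missedRefs = {"he"=3, "the company"=1},
    // sortTotals() returns {1=["the company"], 3=["he"]}.  Note printTopN()
    // iterates the TreeMap in ascending key order, so the smallest counts
    // print first.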

    /** ***************************************************************
     * Compare Stanford and MUC coreference chains. Create a map recording,
     * for each MUC token, whether it appears in the Stanford output, in
     * which chain, and which Stanford token it corresponds to. Use that
     * map to score which tokens are not found (errors of omission, or
     * false negatives). Mark the tokens that are found in both MUC and
     * Stanford in a separate map. Use that map to create a third map
     * of tokens that are in Stanford but not in MUC.
     * @param chains are the MUC chains
     * @param document contains the Stanford chains
     */
    public void compareChains(HashMap<Integer, HashSet<Coref>> chains, Annotation document) {

        HashMap<Integer, Coref> stanfordNotMUC = new HashMap<>();
        HashMap<Integer, Coref> MUCNotStanford = new HashMap<>();
        int thisStanford = 0;
        HashMap<Integer, HashSet<Coref>> stanfordChains = stanfordToCoref(document);
        // for each Stanford chain, count mentions with no MUC counterpart
        for (int i : stanfordChains.keySet()) {
            HashSet<Coref> chain = stanfordChains.get(i);
            if (chain != null) {
                for (Coref c : chain) {
                    totalStanford++;
                    thisStanford++;
                    boolean found = find(c, chains);
                    if (!found)
                        stanfordNotMUC.put(c.ID, c);
                }
            }
        }
        System.out.println("Stanford not MUC: " + stanfordNotMUC.keySet().size() + "/" + thisStanford);
        falsePositive = falsePositive + stanfordNotMUC.keySet().size();
        printCorefList(stanfordNotMUC);
        int thisMUC = 0;
        // for each MUC chain, count mentions with no Stanford counterpart
        for (int i : chains.keySet()) {
            HashSet<Coref> chain = chains.get(i);
            for (Coref c : chain) {
                totalMUC++;
                thisMUC++;
                boolean found = find(c, stanfordChains);
                if (!found) {
                    MUCNotStanford.put(c.ID, c);
                    if (!missedRefs.containsKey(c.token))
                        missedRefs.put(c.token, 0);
                    else {
                        Integer counter = missedRefs.get(c.token) + 1;
                        missedRefs.put(c.token, counter);
                    }
                }
            }
        }
        System.out.println("MUC not Stanford : " + MUCNotStanford.keySet().size() + "/" + thisMUC);
        falseNegative = falseNegative + MUCNotStanford.keySet().size();
        printCorefList(MUCNotStanford);
    }
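
    // compareChains() accumulates falsePositive (Stanford mentions with no
    // counterpart in the MUC chains) and falseNegative (MUC mentions with no
    // counterpart in the Stanford chains); testMUC() reports both totals.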

    /** ***************************************************************
     * Pick tokens off the input sentence string, capturing COREF
     * markup when present and aligning it with token numbers
     */
    public void makeCorefList(String sentsDirty, ArrayList<ArrayList<String>> tokenized) {

        StringBuffer sb = new StringBuffer(sentsDirty);
        HashMap<Integer, Coref> corefs = new HashMap<>();
        HashMap<Integer, String> corefTokens = new HashMap<>();
        HashMap<Integer, String> corefParams = new HashMap<>();
        HashMap<Integer, Integer> references = new HashMap<>();
        Stack<Integer> currentCoref = new Stack<>();
        // the tag patterns were lost in the HTML rendering of this listing;
        // reconstructed to match MUC-6 COREF open and close tags
        Pattern p1 = Pattern.compile("^\\s*(<COREF[^>]+>)");
        Pattern p2 = Pattern.compile("^\\s*(</COREF>)");
        int sentNum = 0;
        int level = 0;
        int tokenNum = 0;
        int firstToken = 0;
        boolean openTag = false;
        boolean skipping = false;
        String tag = "";
        while (sb.length() > 0) {
            if (sentNum > tokenized.size() - 1) {
                System.out.println("Error in MUC.makeCorefList(): no tokenized sentence for: " + sb);
                break;
            }
            ArrayList<String> tokens = tokenized.get(sentNum);
            tokens = splitTokens(tokens);
            //System.out.println("Num tokens: " + tokens.size());
            sentNum++;
            String lastToken = "";
            for (String t : tokens) {
                String tokenNumStr = getTokenNum(t);
                tokenNum = Integer.parseInt(tokenNumStr);
                String token = stripTokenNum(t);
                boolean tokenMatches = false;
                skipping = false;
                while (!tokenMatches && !skipping) {
                    //System.out.println("Token: " + token + " Last token: " + lastToken);
                    //System.out.println("sb: " + first100(sb));
                    Matcher m1 = p1.matcher(sb.toString());
                    Matcher m2 = p2.matcher(sb.toString());
                    if (token.length() > 0 && sb.length() > 0 &&
                            token.charAt(0) != '-' && sb.charAt(0) == '-')
                        sb.delete(0, 1);
                    else if (token.lastIndexOf('-') == 0)
                        tokenMatches = true;
                    else if (token.length() > 0 && sb.length() > 0 &&
                            token.charAt(0) == sb.charAt(0) &&
                            token.charAt(0) == '\'' && sb.charAt(0) == '\'') {
                        sb.delete(0, 1);
                        token = token.substring(1);
                        System.out.println("altered Token: " + token);
                        System.out.println("altered sb: " + first100(sb));
                    }
                    else if (m1.find()) {
                        // an opening COREF tag: record its ID and parameters
                        tag = getTag(sb, m1);
                        level++;
                        int quoteIndex = tag.indexOf("\"");
                        String id = tag.substring(quoteIndex + 1, tag.indexOf("\"", quoteIndex + 1));
                        currentCoref.push(Integer.parseInt(id));
                        corefTokens.put(Integer.parseInt(id), "");
                        corefParams.put(Integer.parseInt(id), tag);
                        int refIndex = tag.indexOf("REF=");
                        if (refIndex > -1) {
                            int refQuoteIndex = tag.indexOf("\"", refIndex + 1);
                            String ref = tag.substring(refQuoteIndex + 1, tag.indexOf("\"", refQuoteIndex + 1));
                            references.put(Integer.parseInt(id), Integer.parseInt(ref));
                        }
                        openTag = true;
                    }
                    else if (m2.find()) {
                        // a closing COREF tag: build the completed Coref
                        if (sb.indexOf("<") > -1)
                            sb.delete(0, sb.indexOf("<"));
                        sb.delete(0, sb.indexOf(">") + 1);
                        if (currentCoref.size() < 1) {
                            System.out.println("Error in MUC.makeCorefList(): no open tag for close tag\n" + first100(sb));
                            return;
                        }
                        Integer cid = currentCoref.pop();
                        Coref c = new Coref();
                        c.ID = cid;
                        c.token = trimPunc(corefTokens.get(cid));
                        c.firstToken = firstToken;
                        c.lastToken = tokenNum;
                        processParams(c, corefParams);
                        c.sentNum = sentNum;
                        corefs.put(c.ID, c);
                        level--;
                    }
                    // Stanford can insert an extra period if the last token
                    // in a sentence is an abbreviation
                    else if (stripTokenNum(t).equals(".") && stripTokenNum(lastToken).endsWith(".") &&
                            !sb.toString().matches("^\\s*\\..*")) {
                        System.out.println("makeCorefList() Skipping token removal: " + t);
                        System.out.println(first100(sb));
                        skipping = true;
                        continue;
                    }
                    else {
                        if (openTag)
                            firstToken = tokenNum;
                        lastToken = token;
                        leadingTrim(sb);
                        tokenMatches = removeToken(sb, token);
                        if (level > 0)
                            expandCurrentToken(token, currentCoref, corefTokens);
                        openTag = false;
                    }
                }
            }
        }
        HashMap<Integer, HashSet<Coref>> chains = buildChains(corefs);
        printChains(chains);
        compareChains(chains, document2);
    }

    /** ***************************************************************
     */
    public static void testParamString() {

        MUC muc = new MUC();
        // the original parameter string literal was lost in the HTML
        // rendering of this listing; this is a representative MUC-6 COREF tag
        String paramstring = "<COREF ID=\"2\" TYPE=\"IDENT\" REF=\"1\" MIN=\"chairman\">";
        Coref c = muc.new Coref();
        paramstring = processOneParamString(paramstring, c);
        paramstring = processOneParamString(paramstring, c);
        paramstring = processOneParamString(paramstring, c);
        paramstring = processOneParamString(paramstring, c);
    }

    /** ***************************************************************
     */
    public static void testRemoveToken() {

        MUC muc = new MUC();
        String token = "Corp.";
        StringBuffer sb = new StringBuffer("Corp. He also served for 10 years as chairman and chief executive of Paramount Pictures Corp., a unit of Paramount Communications Inc. Arrow Investments Inc., a corporation controlled by Mr. Diller, in December agreed to purchase $25 million of QVC stock in a privately negotiated transaction.");
        Coref c = muc.new Coref();
        removeToken(sb, token);
        System.out.println("MUC.testRemoveToken: " + sb);
    }

    /** ***************************************************************
     */
    public static void testWhitespace() {

        // most of this method's body was lost in the HTML rendering of this
        // listing; only the test string fragment and the final sentence fetch
        // survived, so the tokenization steps below are a minimal
        // reconstruction
        String paramstring = " . By Patrick M.\nReilly";
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        props.setProperty("tokenize.options", "ptb3Escaping=false");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document1 = new Annotation(paramstring);
        pipeline.annotate(document1);
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println(sentences);
    }

    /** ***************************************************************
     */
    public void testParallelPipeline() {

        try (BufferedReader br = new BufferedReader(new InputStreamReader(System.in))) {
            Properties preprocessprops = new Properties();
            preprocessprops.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
            preprocessprops.setProperty("tokenize.options", "ptb3Escaping=false");
            StanfordCoreNLP preprocesspipeline = new StanfordCoreNLP(preprocessprops);
            Properties corefprops = new Properties();
            corefprops.setProperty("annotators", "dcoref");
            //corefprops.setProperty("annotators", "hcoref");
            corefprops.setProperty("tokenize.options", "ptb3Escaping=false");
            corefprops.setProperty("enforceRequirements", "false");
            StanfordCoreNLP corefpipeline = new StanfordCoreNLP(corefprops);
            List<CoreMap> coreMaps = Lists.newArrayList();
            String input;
            while ((input = br.readLine()) != null) {
                Annotation document = new Annotation(input);
                preprocesspipeline.annotate(document);
                List<CoreMap> newcoreMaps = document.get(CoreAnnotations.SentencesAnnotation.class);
                coreMaps.addAll(newcoreMaps);
                System.out.println("Stanford corefs: ");
                Annotation wholeDocument = new Annotation(coreMaps);
                corefpipeline.annotate(wholeDocument);
                Map<Integer, CorefChain> graph = wholeDocument.get(CorefCoreAnnotations.CorefChainAnnotation.class);
                printStanfordCorefList(graph);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     */
    public void testMUC() {

        //List<String> lines = cleanSGML("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
        //List<String> lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/formal-tst.CO.key.cleanup.09jul96");
        List<String> lines = getDocuments(System.getenv("MUCCORPUS") + File.separator + "formal-tst.CO.key.cleanup.09jul96");
        //List<String> lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/Wash.txt");
        //List<String> lines = getDocuments("/home/apease/IPsoft/corpora/muc6/data/keys/891101-0056.co.v0.sgm");
        for (String s : lines) {
            // the COREF regex literals were lost in the HTML rendering of
            // this listing; reconstructed to strip MUC-6 COREF markup
            String cleanedInput = s.replaceAll("<COREF[^>]+>", "");
            cleanedInput = cleanedInput.replace("</COREF>", "");
            List<String> sentsClean = toSentences(cleanedInput);
            List<String> sentsDirty = toSentences(s);
            System.out.println("\n\nMUC markup: " + sentsDirty);
            String allClean = listToString(sentsClean);
            ArrayList<ArrayList<String>> tokenized = toCoref(listToString(sentsClean));
            makeCorefList(s, tokenized);
        }
        System.out.println("False positive rate: " + falsePositive + "/" + totalStanford);
        System.out.println("False negative rate: " + falseNegative + "/" + totalMUC);
        System.out.println("Most common missed corefs: ");
        printTopN(sortTotals(missedRefs), 20);
    }

    /** ***************************************************************
     */
    public static void main(String[] args) {

        //testWhitespace();
        MUC muc = new MUC();
        muc.initPipeline();
        //muc.toCoref("Bob likes to eat. He is big");
        muc.testMUC();
    }
}



