edu.stanford.nlp.AdeptKBPAnnotator (from the stanford-adept artifact, available via Maven / Gradle / Ivy)
The project file to build and run Stanford applications using Maven.
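To pull this artifact into a project, a dependency declaration along these lines should work. This is a sketch: the groupId is inferred from the package name and the version is a placeholder; neither is quoted from this page.
<dependency>
  <groupId>edu.stanford.nlp</groupId>
  <artifactId>stanford-adept</artifactId>
  <version><!-- a published version --></version>
</dependency>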
package edu.stanford.nlp;
import edu.stanford.nlp.classify.Classifier;
import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.coref.data.WordLists;
import edu.stanford.nlp.ie.*;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.simple.Document;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
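// A minimal usage sketch. The "customAnnotatorClass.kbp" registration is the generic
// CoreNLP hook for plugging in a custom annotator and is an assumption here; this file
// itself only demonstrates the pipeline wiring shown in main() below.
//
//   Properties props = new Properties();
//   props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner,parse,mention,coref,kbp");
//   props.setProperty("customAnnotatorClass.kbp", "edu.stanford.nlp.AdeptKBPAnnotator");
//   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
//   Annotation ann = new Annotation("Chris Manning is a professor at Stanford University.");
//   pipeline.annotate(ann);
//   for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
//     sentence.get(CoreAnnotations.KBPTriplesAnnotation.class).forEach(System.out::println);
//   }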
/**
* An annotator which takes sentences as input and produces KBP relation annotations.
*
* @author Gabor Angeli
*/
@SuppressWarnings("FieldCanBeLocal")
public class AdeptKBPAnnotator implements Annotator {
private static final String NOT_PROVIDED = "none";
private Properties kbpProperties;
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(AdeptKBPAnnotator.class);
//@ArgumentParser.Option(name="kbp.language", gloss="language for kbp")
//private String language = "english";
@ArgumentParser.Option(name="kbp.model", gloss="The path to the model, set to \"none\" for no model")
private String model = DefaultPaths.DEFAULT_KBP_CLASSIFIER;
@ArgumentParser.Option(name="kbp.semgrex", gloss="Semgrex patterns directory, set to \"none\" to not use semgrex")
private String semgrexdir = DefaultPaths.DEFAULT_KBP_SEMGREX_DIR;
@ArgumentParser.Option(name="kbp.tokensregex", gloss="Tokensregex patterns directory, set to \"none\" to not use tokensregex")
private String tokensregexdir = DefaultPaths.DEFAULT_KBP_TOKENSREGEX_DIR;
@ArgumentParser.Option(name="kbp.verbose", gloss="Print out KBP logging info")
private boolean VERBOSE = false;
// @ArgumentParser.Option(name="regexner.cased", gloss="The tokensregexner cased path")
// private String regexnerCasedPath = DefaultPaths.DEFAULT_KBP_REGEXNER_CASED;
//
// @ArgumentParser.Option(name="regexner.caseless", gloss="The tokensregexner caseless path")
// private String regexnerCaselessPath = DefaultPaths.DEFAULT_KBP_REGEXNER_CASELESS;
/**
* The extractor implementation.
*/
public final KBPRelationExtractor extractor;
/**
* A serializer to convert to the Simple CoreNLP representation.
*/
private final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(false);
/*
* A TokensRegexNER annotator for the special KBP NER types (case-sensitive).
*/
//private final TokensRegexNERAnnotator casedNER;
/*
* A TokensRegexNER annotator for the special KBP NER types (case insensitive).
*/
//private final TokensRegexNERAnnotator caselessNER;
/** maximum length sentence to run on **/
private final int maxLength;
/** pattern matchers for processing coref mentions **/
TokenSequencePattern titlePersonPattern =
TokenSequencePattern.compile("[pos:JJ & ner:O]? [ner: TITLE]+ ([ner: PERSON]+)");
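// Illustrative match (assuming upstream NER tags "Foreign Secretary" as TITLE and
// "Catherine Ashton" as PERSON): on the mention "Foreign Secretary Catherine Ashton"
// the pattern matches the whole span, and capture group 1 holds the PERSON tokens
// "Catherine Ashton".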
/** map for converting KBP relation names to latest names **/
private HashMap<String, String> relationNameConversionMap;
/**
* Create a new KBP annotator from the given properties.
*
* @param name The annotator name, used as the property prefix when parsing options.
* @param props The properties to use when creating this extractor.
*/
public AdeptKBPAnnotator(String name, Properties props) {
// Parse standard properties
ArgumentParser.fillOptions(this, name, props);
//Locale kbpLanguage =
//(language.toLowerCase().equals("zh") || language.toLowerCase().equals("chinese")) ?
//Locale.CHINESE : Locale.ENGLISH ;
kbpProperties = props;
try {
ArrayList<KBPRelationExtractor> extractors = new ArrayList<>();
// add tokensregex rules
if (!tokensregexdir.equals(NOT_PROVIDED))
extractors.add(new KBPTokensregexExtractor(tokensregexdir, VERBOSE));
// add semgrex rules
if (!semgrexdir.equals(NOT_PROVIDED))
extractors.add(new KBPSemgrexExtractor(semgrexdir,VERBOSE));
// attempt to add statistical model
if (!model.equals(NOT_PROVIDED)) {
log.info("Loading KBP classifier from: " + model);
Object object = IOUtils.readObjectFromURLOrClasspathOrFileSystem(model);
KBPRelationExtractor statisticalExtractor;
if (object instanceof LinearClassifier) {
//noinspection unchecked
statisticalExtractor = new KBPStatisticalExtractor((Classifier<String, String>) object);
} else if (object instanceof KBPStatisticalExtractor) {
statisticalExtractor = (KBPStatisticalExtractor) object;
} else {
throw new ClassCastException(object.getClass() + " cannot be cast into a " + KBPStatisticalExtractor.class);
}
extractors.add(statisticalExtractor);
}
// build extractor
this.extractor =
new KBPEnsembleExtractor(extractors.toArray(
new KBPRelationExtractor[extractors.size()]));
// set maximum length of sentence to operate on
maxLength = Integer.parseInt(props.getProperty("kbp.maxlen", "-1"));
} catch (IOException | ClassNotFoundException e) {
throw new RuntimeIOException(e);
}
// set up map for converting between older and new KBP relation names
relationNameConversionMap = new HashMap<>();
relationNameConversionMap.put("org:dissolved", "org:date_dissolved");
relationNameConversionMap.put("org:founded", "org:date_founded");
relationNameConversionMap.put("org:number_of_employees/members", "org:number_of_employees_members");
relationNameConversionMap.put("org:political/religious_affiliation", "org:political_religious_affiliation");
relationNameConversionMap.put("org:top_members/employees", "org:top_members_employees");
relationNameConversionMap.put("per:member_of", "per:employee_or_member_of");
relationNameConversionMap.put("per:employee_of", "per:employee_or_member_of");
relationNameConversionMap.put("per:stateorprovinces_of_residence", "per:statesorprovinces_of_residence");
}
/** @see AdeptKBPAnnotator#AdeptKBPAnnotator(String, Properties) */
@SuppressWarnings("unused")
public AdeptKBPAnnotator(Properties properties) {
this(STANFORD_KBP, properties);
}
/**
* Returns whether the given token counts as a valid pronominal mention for KBP.
* This method (at present) works for either Chinese or English.
*
* @param word The token to classify.
* @return true if this token is a pronoun that KBP should recognize.
*/
private static boolean kbpIsPronominalMention(CoreLabel word) {
return WordLists.isKbpPronominalMention(word.word());
}
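// e.g., this should hold for tokens such as "she", "he", or "it", assuming they appear
// in the KBP pronominal-mention list maintained by WordLists.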
/**
* Annotate all the pronominal mentions in the document.
* @param ann The document.
* @return The list of pronominal mentions in the document.
*/
private static List<CoreMap> annotatePronominalMentions(Annotation ann) {
List<CoreMap> pronouns = new ArrayList<>();
List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
for (int sentenceIndex = 0; sentenceIndex < sentences.size(); sentenceIndex++) {
CoreMap sentence = sentences.get(sentenceIndex);
Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
if (annoTokenBegin == null) {
annoTokenBegin = 0;
}
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (int tokenIndex = 0; tokenIndex < tokens.size(); tokenIndex++) {
CoreLabel token = tokens.get(tokenIndex);
if (kbpIsPronominalMention(token)) {
CoreMap pronoun = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1,
annoTokenBegin, null, CoreAnnotations.TextAnnotation.class, null);
pronoun.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
sentence.get(CoreAnnotations.MentionsAnnotation.class).add(pronoun);
pronouns.add(pronoun);
}
}
}
return pronouns;
}
/**
* Augment the coreferent mention map with acronym matches.
*/
private static void acronymMatch(List<CoreMap> mentions, Map<CoreMap, Set<CoreMap>> mentionsMap) {
int ticks = 0;
// Get all the candidate antecedents
Map<List<String>, CoreMap> textToMention = new HashMap<>();
for (CoreMap mention : mentions) {
String nerTag = mention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (nerTag != null && (nerTag.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name)
|| nerTag.equals(KBPRelationExtractor.NERTag.LOCATION.name))) {
List<String> tokens = mention.get(CoreAnnotations.TokensAnnotation.class).stream().map(CoreLabel::word).collect(Collectors.toList());
if (tokens.size() > 1) {
textToMention.put(tokens, mention);
}
}
}
// Look for candidate acronyms
for (CoreMap acronym : mentions) {
String nerTag = acronym.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (nerTag != null && (nerTag.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name)
|| nerTag.equals(KBPRelationExtractor.NERTag.LOCATION.name))) {
String text = acronym.get(CoreAnnotations.TextAnnotation.class);
if (!text.contains(" ")) {
// Candidate acronym
Set<CoreMap> acronymCluster = mentionsMap.get(acronym);
if (acronymCluster == null) {
acronymCluster = new LinkedHashSet<>();
acronymCluster.add(acronym);
}
// Try to match it to an antecedent
for (Map.Entry<List<String>, CoreMap> entry : textToMention.entrySet()) {
// Time out if we take too long in this loop.
ticks += 1;
if (ticks > 1000) {
return;
}
// Check if the pair is an acronym
if (AcronymMatcher.isAcronym(text, entry.getKey())) {
// Case: found a coreferent pair
CoreMap coreferent = entry.getValue();
Set<CoreMap> coreferentCluster = mentionsMap.get(coreferent);
if (coreferentCluster == null) {
coreferentCluster = new LinkedHashSet<>();
coreferentCluster.add(coreferent);
}
// Create a new coreference cluster
Set<CoreMap> newCluster = new LinkedHashSet<>();
newCluster.addAll(acronymCluster);
newCluster.addAll(coreferentCluster);
// Set the new cluster
for (CoreMap key : newCluster) {
mentionsMap.put(key, newCluster);
}
}
}
}
}
}
}
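// Illustration: if "ACLU" and "American Civil Liberties Union" both occur as ORGANIZATION
// mentions, AcronymMatcher.isAcronym("ACLU", ["American", "Civil", "Liberties", "Union"])
// holds, so the loop above merges the two mentions' clusters into a single coreference
// cluster.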
/**
* Helper method to find the best KBP mention in a coref chain.
* "Best" is defined as the longest KBP mention, or null if
* the coref chain does not contain a KBP mention.
*
* @param corefChain CorefChain containing potential KBP mentions to search through
* @param ann the annotation
* @param kbpMentions HashMap mapping character-offset pairs to KBP mentions
* @return a list of KBP mentions (entries possibly null) for the coref mentions in this coref chain,
* paired with the "best" KBP mention, which in this case is the longest one
*/
public Pair<List<CoreMap>, CoreMap> corefChainToKBPMentions(CorefChain corefChain, Annotation ann,
HashMap<Pair<Integer, Integer>, CoreMap> kbpMentions) {
// map coref mentions into kbp mentions (possibly null if no corresponding kbp mention)
List<CoreMap> annSentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
// create a list of kbp mentions in this coref chain, possibly all null
//System.err.println("---");
//System.err.println("KBP mentions for coref chain");
List<CoreMap> kbpMentionsForCorefChain = corefChain.getMentionsInTextualOrder().stream().map((cm) -> {
CoreMap cmSentence = annSentences.get(cm.sentNum - 1);
List<CoreLabel> cmSentenceTokens = cmSentence.get(CoreAnnotations.TokensAnnotation.class);
int cmCharBegin = cmSentenceTokens.get(cm.startIndex - 1).get(
CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int cmCharEnd = cmSentenceTokens.get(cm.endIndex - 2).get(
CoreAnnotations.CharacterOffsetEndAnnotation.class);
CoreMap kbpMentionFound = kbpMentions.get(new Pair<>(cmCharBegin, cmCharEnd));
// if a best KBP mention can't be found, handle special cases
if (kbpMentionFound == null) {
List<CoreLabel> corefMentionTokens =
cmSentence.get(CoreAnnotations.TokensAnnotation.class).subList(cm.startIndex-1, cm.endIndex-1);
// look for a PERSON kbp mention in TITLE+ (PERSON+)
TokenSequenceMatcher titlePersonMatcher = titlePersonPattern.matcher(corefMentionTokens);
if (titlePersonMatcher.find()) {
List<? extends CoreMap> overallMatch = titlePersonMatcher.groupNodes(0);
List<? extends CoreMap> personWithinMatch = titlePersonMatcher.groupNodes(1);
if (overallMatch.size() == corefMentionTokens.size()) {
int personBeginOffset = ((CoreLabel) personWithinMatch.get(0)).beginPosition();
int personEndOffset = ((CoreLabel) personWithinMatch.get(personWithinMatch.size()-1)).endPosition();
Pair<Integer, Integer> personOffsets = new Pair<>(personBeginOffset, personEndOffset);
kbpMentionFound = kbpMentions.get(personOffsets);
}
}
}
//if (kbpMentionFound != null)
//System.err.println(kbpMentionFound.get(CoreAnnotations.TextAnnotation.class));
return kbpMentionFound;
}).collect(Collectors.toList());
// map kbp mentions to the lengths of their text
List<Integer> kbpMentionLengths = kbpMentionsForCorefChain.stream().map(
km -> km == null ? 0 : km.get(CoreAnnotations.TextAnnotation.class).length()).collect(
Collectors.toList());
int bestIndex = kbpMentionLengths.indexOf(kbpMentionLengths.stream().reduce(0, Math::max));
// return the first occurrence of the kbp mention with max length (possibly null)
return new Pair<>(kbpMentionsForCorefChain, kbpMentionsForCorefChain.get(bestIndex));
}
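// Worked example (offsets illustrative): if the coref mention "Foreign Secretary Catherine
// Ashton" has no KBP mention at exactly its character span, but "Catherine Ashton" occupies
// characters 20-36, the TITLE+PERSON fallback above recovers the KBP mention keyed by
// Pair(20, 36), which then competes for "best" (longest) mention in the chain.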
/**
* Convert a relation name from the older KBP naming convention to the current one.
* @param relationName the original relation name.
* @return the converted relation name, or the input unchanged if there is no mapping for it.
*/
private String convertRelationNameToLatest(String relationName) {
if (relationNameConversionMap.containsKey(relationName)) {
return relationNameConversionMap.get(relationName);
} else {
return relationName;
}
}
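// e.g., convertRelationNameToLatest("per:employee_of") returns "per:employee_or_member_of",
// while a name with no entry in the map, such as "per:age", is returned unchanged.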
/**
* Annotate this document for KBP relations.
* @param annotation The document to annotate.
*/
@Override
public void annotate(Annotation annotation) {
// get a list of sentences for this annotation
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
// Create simple document
Document doc = new Document(kbpProperties,serializer.toProto(annotation));
// Get the mentions in the document
List<CoreMap> mentions = new ArrayList<>();
for (CoreMap sentence : sentences) {
mentions.addAll(sentence.get(CoreAnnotations.MentionsAnnotation.class));
}
List<CoreMap> pronounMentions = annotatePronominalMentions(annotation);
mentions.addAll(pronounMentions);
// Compute coreferent clusters
// (map an index to a KBP mention)
Map<Pair<Integer, Integer>, CoreMap> mentionByStartIndex = new HashMap<>();
for (CoreMap mention : mentions) {
for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
mentionByStartIndex.put(Pair.makePair(token.sentIndex(), token.index()), mention);
}
}
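// (e.g., a mention covering tokens 4-5 of sentence 2 is reachable under both keys (2, 4)
// and (2, 5); note token.index() is 1-based while sentIndex() is 0-based)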
// (collect coreferent KBP mentions)
Map<CoreMap, Set<CoreMap>> mentionsMap = new HashMap<>(); // map from canonical mention -> other mentions
if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
for (Map.Entry<Integer, CorefChain> chain : annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet()) {
CoreMap firstMention = null;
for (CorefChain.CorefMention mention : chain.getValue().getMentionsInTextualOrder()) {
CoreMap kbpMention = null;
for (int i = mention.startIndex; i < mention.endIndex; ++i) {
if (mentionByStartIndex.containsKey(Pair.makePair(mention.sentNum - 1, i))) {
kbpMention = mentionByStartIndex.get(Pair.makePair(mention.sentNum - 1, i));
break;
}
}
if (firstMention == null) {
firstMention = kbpMention;
}
if (kbpMention != null) {
if (!mentionsMap.containsKey(firstMention)) {
mentionsMap.put(firstMention, new LinkedHashSet<>());
}
mentionsMap.get(firstMention).add(kbpMention);
}
}
}
}
// (coreference acronyms)
acronymMatch(mentions, mentionsMap);
// (ensure valid NER tag for canonical mention)
for (CoreMap key : new HashSet<>(mentionsMap.keySet())) {
if (key.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null) {
CoreMap newKey = null;
for (CoreMap candidate : mentionsMap.get(key)) {
if (candidate.get(CoreAnnotations.NamedEntityTagAnnotation.class) != null) {
newKey = candidate;
break;
}
}
if (newKey != null) {
mentionsMap.put(newKey, mentionsMap.remove(key));
} else {
mentionsMap.remove(key); // case: no mention in this chain has an NER tag.
}
}
}
// Propagate Entity Link
for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
String entityLink = entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class);
for (CoreMap mention : entry.getValue()) {
for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
token.set(CoreAnnotations.WikipediaEntityAnnotation.class, entityLink);
}
}
}
// create a mapping of char offset pairs to KBPMention
HashMap<Pair<Integer, Integer>, CoreMap> charOffsetToKBPMention = new HashMap<>();
for (CoreMap mention : mentions) {
int nerMentionCharBegin = mention.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int nerMentionCharEnd = mention.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
charOffsetToKBPMention.put(new Pair<>(nerMentionCharBegin, nerMentionCharEnd), mention);
}
// Create a canonical mention map
Map<CoreMap, CoreMap> mentionToCanonicalMention = new HashMap<>();
// check if there is coref info
Set<Map.Entry<Integer, CorefChain>> corefChains;
if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null)
corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet();
else
corefChains = new HashSet<>();
for (Map.Entry<Integer, CorefChain> indexCorefChainPair : corefChains) {
CorefChain corefChain = indexCorefChainPair.getValue();
Pair<List<CoreMap>, CoreMap> corefChainKBPMentionsAndBestIndex = corefChainToKBPMentions(corefChain, annotation,
charOffsetToKBPMention);
List<CoreMap> corefChainKBPMentions = corefChainKBPMentionsAndBestIndex.first();
CoreMap bestKBPMentionForChain = corefChainKBPMentionsAndBestIndex.second();
if (bestKBPMentionForChain != null) {
for (CoreMap kbpMention : corefChainKBPMentions) {
if (kbpMention != null) {
//System.err.println("---");
// ad hoc filters ; assume acceptable unless a filter blocks it
boolean acceptableLink = true;
// block people matches without a token overlap, exempting pronominal to non-pronominal
// good: Ashton --> Catherine Ashton
// good: she --> Catherine Ashton
// bad: Morsi --> Catherine Ashton
String kbpMentionNERTag = kbpMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
String bestKBPMentionForChainNERTag =
bestKBPMentionForChain.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (kbpMentionNERTag != null && bestKBPMentionForChainNERTag != null &&
kbpMentionNERTag.equals("PERSON") && bestKBPMentionForChainNERTag.equals("PERSON")
&& !kbpIsPronominalMention(kbpMention.get(CoreAnnotations.TokensAnnotation.class).get(0))
&& !kbpIsPronominalMention(bestKBPMentionForChain.get(CoreAnnotations.TokensAnnotation.class).get(0))) {
//System.err.println("testing PERSON to PERSON coref link");
boolean tokenMatchFound = false;
for (CoreLabel kbpToken : kbpMention.get(CoreAnnotations.TokensAnnotation.class)) {
for (CoreLabel bestKBPToken : bestKBPMentionForChain.get(CoreAnnotations.TokensAnnotation.class)) {
if (kbpToken.word().toLowerCase().equals(bestKBPToken.word().toLowerCase())) {
tokenMatchFound = true;
break;
}
}
if (tokenMatchFound)
break;
}
if (!tokenMatchFound)
acceptableLink = false;
}
// check the coref link passed the filters
if (acceptableLink)
mentionToCanonicalMention.put(kbpMention, bestKBPMentionForChain);
//System.err.println("kbp mention: " + kbpMention.get(CoreAnnotations.TextAnnotation.class));
//System.err.println("coref mention: " + bestKBPMentionForChain.get(CoreAnnotations.TextAnnotation.class));
}
}
}
}
// Create a canonical mention map
//Map mentionToCanonicalMention = new HashMap<>();
/*for (Map.Entry> entry : mentionsMap.entrySet()) {
for (CoreMap mention : entry.getValue()) {
// (set the NER tag + link to be axiomatically that of the canonical mention)
// FOR NOW allow clusters to have inconsistent types, this seems to cause more problems than solve
// mention.set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.getKey().get(CoreAnnotations.NamedEntityTagAnnotation.class));
// mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class));
// (add the mention (note: this must come after we set the NER!)
mentionToCanonicalMention.put(mention, entry.getKey());
}
}*/
// (add missing mentions)
mentions.stream().filter(mention -> mentionToCanonicalMention.get(mention) == null)
.forEach(mention -> mentionToCanonicalMention.put(mention, mention));
// handle acronym coreference
HashMap<String, List<CoreMap>> acronymClusters = new HashMap<>();
HashMap<String, List<CoreMap>> acronymInstances = new HashMap<>();
for (CoreMap acronymMention : mentionToCanonicalMention.keySet()) {
String acronymNERTag = acronymMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if ((acronymMention == mentionToCanonicalMention.get(acronymMention)) && acronymNERTag != null &&
(acronymNERTag.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name) ||
acronymNERTag.equals(KBPRelationExtractor.NERTag.LOCATION.name))) {
String acronymText = acronymMention.get(CoreAnnotations.TextAnnotation.class);
List<CoreMap> coreferentMentions = new ArrayList<>();
// define acronyms as not containing spaces (e.g. ACLU)
if (!acronymText.contains(" ")) {
int numCoreferentsChecked = 0;
for (CoreMap coreferentMention : mentions) {
// only check first 1000
if (numCoreferentsChecked > 1000)
break;
// don't check a mention against itself
if (acronymMention == coreferentMention)
continue;
// don't check other mentions without " "
String coreferentText = coreferentMention.get(CoreAnnotations.TextAnnotation.class);
if (!coreferentText.contains(" "))
continue;
numCoreferentsChecked++;
List<String> coreferentTokenStrings = coreferentMention.get(
CoreAnnotations.TokensAnnotation.class).stream().map(CoreLabel::word).collect(
Collectors.toList());
// when an acronym match is found:
// store every mention (that isn't ACLU) that matches with ACLU in acronymClusters
// store every instance of "ACLU" in acronymInstances
// afterwards find the best mention in acronymClusters, and match it to every mention in acronymInstances
if (AcronymMatcher.isAcronym(acronymText, coreferentTokenStrings)) {
if (!acronymClusters.containsKey(acronymText))
acronymClusters.put(acronymText, new ArrayList<>());
if (!acronymInstances.containsKey(acronymText))
acronymInstances.put(acronymText, new ArrayList<>());
acronymClusters.get(acronymText).add(coreferentMention);
acronymInstances.get(acronymText).add(acronymMention);
}
}
}
}
}
// process each acronym (e.g. ACLU)
for (String acronymText : acronymInstances.keySet()) {
// find longest ORG or null
CoreMap bestORG = null;
for (CoreMap coreferentMention : acronymClusters.get(acronymText)) {
if (!coreferentMention.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(
KBPRelationExtractor.NERTag.ORGANIZATION.name))
continue;
if (bestORG == null)
bestORG = coreferentMention;
else if (coreferentMention.get(CoreAnnotations.TextAnnotation.class).length() >
bestORG.get(CoreAnnotations.TextAnnotation.class).length())
bestORG = coreferentMention;
}
// find longest LOC or null
CoreMap bestLOC = null;
for (CoreMap coreferentMention : acronymClusters.get(acronymText)) {
if (!coreferentMention.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(
KBPRelationExtractor.NERTag.LOCATION.name))
continue;
if (bestLOC == null)
bestLOC = coreferentMention;
else if (coreferentMention.get(CoreAnnotations.TextAnnotation.class).length() >
bestLOC.get(CoreAnnotations.TextAnnotation.class).length())
bestLOC = coreferentMention;
}
// link ACLU to "American Civil Liberties Union" ; make sure NER types match
for (CoreMap acronymMention : acronymInstances.get(acronymText)) {
String mentionType = acronymMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (mentionType.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name) && bestORG != null)
mentionToCanonicalMention.put(acronymMention, bestORG);
if (mentionType.equals(KBPRelationExtractor.NERTag.LOCATION.name) && bestLOC != null)
mentionToCanonicalMention.put(acronymMention, bestLOC);
}
}
// Cluster mentions by sentence
@SuppressWarnings("unchecked") List[] mentionsBySentence = new List[annotation.get(CoreAnnotations.SentencesAnnotation.class).size()];
for (int i = 0; i < mentionsBySentence.length; ++i) {
mentionsBySentence[i] = new ArrayList<>();
}
for (CoreMap mention : mentionToCanonicalMention.keySet()) {
mentionsBySentence[mention.get(CoreAnnotations.SentenceIndexAnnotation.class)].add(mention);
}
// Classify
for (int sentenceI = 0; sentenceI < mentionsBySentence.length; ++sentenceI) {
HashMap<String, RelationTriple> relationStringsToTriples = new HashMap<>();
List<RelationTriple> finalTriplesList = new ArrayList<>(); // the annotations
List<CoreMap> candidates = mentionsBySentence[sentenceI];
// determine sentence length
int sentenceLength =
annotation.get(CoreAnnotations.SentencesAnnotation.class)
.get(sentenceI).get(CoreAnnotations.TokensAnnotation.class).size();
// check if sentence is too long, if it's too long don't run kbp
if (maxLength != -1 && sentenceLength > maxLength) {
// set the triples annotation to an empty list of RelationTriples
annotation.get(
CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(
CoreAnnotations.KBPTriplesAnnotation.class, finalTriplesList);
// continue to next sentence
continue;
}
// sentence isn't too long, so continue processing this sentence
for (int subjI = 0; subjI < candidates.size(); ++subjI) {
CoreMap subj = candidates.get(subjI);
int subjBegin = subj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
int subjEnd = subj.get(CoreAnnotations.TokensAnnotation.class).get(subj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
Optional<KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.fromString(subj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
if (subjNER.isPresent()) {
for (int objI = 0; objI < candidates.size(); ++objI) {
if (subjI == objI) {
continue;
}
if (Thread.interrupted()) {
throw new RuntimeInterruptedException();
}
CoreMap obj = candidates.get(objI);
int objBegin = obj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
int objEnd = obj.get(CoreAnnotations.TokensAnnotation.class).get(obj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
Optional<KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.fromString(obj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
if (objNER.isPresent() &&
KBPRelationExtractor.RelationType.plausiblyHasRelation(subjNER.get(), objNER.get())) { // type check
KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(
new Span(subjBegin, subjEnd),
new Span(objBegin, objEnd),
subjNER.get(),
objNER.get(),
doc.sentence(sentenceI)
);
// -- BEGIN Classify
Pair<String, Double> prediction = extractor.classify(input);
// -- END Classify
// Handle the classifier output
if (!KBPStatisticalExtractor.NO_RELATION.equals(prediction.first)) {
RelationTriple triple = new RelationTriple.WithLink(
subj.get(CoreAnnotations.TokensAnnotation.class),
mentionToCanonicalMention.get(subj).get(CoreAnnotations.TokensAnnotation.class),
Collections.singletonList(
new CoreLabel(new Word(convertRelationNameToLatest(prediction.first)))),
obj.get(CoreAnnotations.TokensAnnotation.class),
mentionToCanonicalMention.get(obj).get(CoreAnnotations.TokensAnnotation.class),
prediction.second,
sentences.get(sentenceI).get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class),
subj.get(CoreAnnotations.WikipediaEntityAnnotation.class),
obj.get(CoreAnnotations.WikipediaEntityAnnotation.class)
);
String tripleString =
triple.subjectGloss()+"\t"+triple.relationGloss()+"\t"+triple.objectGloss();
// ad hoc checks for problems
boolean acceptableTriple = true;
if (triple.objectGloss().equals(triple.subjectGloss()) &&
triple.relationGloss().endsWith("alternate_names"))
acceptableTriple = false;
// only add this triple if it has the highest confidence ; this process generates duplicates with
// different confidence scores, so we want to filter out the lower confidence versions
if (acceptableTriple && !relationStringsToTriples.containsKey(tripleString))
relationStringsToTriples.put(tripleString, triple);
else if (acceptableTriple && triple.confidence > relationStringsToTriples.get(tripleString).confidence)
relationStringsToTriples.put(tripleString, triple);
}
}
}
}
}
finalTriplesList = new ArrayList<>(relationStringsToTriples.values());
// Set triples
annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(
CoreAnnotations.KBPTriplesAnnotation.class, finalTriplesList);
}
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
CoreAnnotations.MentionsAnnotation.class,
CoreAnnotations.KBPTriplesAnnotation.class
));
return Collections.unmodifiableSet(requirements);
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class,
CoreAnnotations.PartOfSpeechAnnotation.class,
CoreAnnotations.LemmaAnnotation.class,
SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class,
SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class,
SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class
));
return Collections.unmodifiableSet(requirements);
}
/**
* A debugging method to try relation extraction from the console.
* @throws IOException If any IO problem
*/
public static void main(String[] args) throws IOException {
Properties props = StringUtils.argsToProperties(args);
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner,parse,mention,coref,kbp");
props.setProperty("regexner.mapping", "ignorecase=true,validpospattern=^(NN|JJ).*,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
IOUtils.console("sentence> ", line -> {
Annotation ann = new Annotation(line);
pipeline.annotate(ann);
for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
sentence.get(CoreAnnotations.KBPTriplesAnnotation.class).forEach(System.err::println);
}
});
}
}