
/*
 * eu.fbk.dkm.pikes.resources.NAFUtils (Maven / Gradle / Ivy artifact: pikes-resources)
 *
 * A collection of Java classes for accessing and querying a number of NLP resources.
 *
 * Source listing of the newest published version of the artifact.
 */
package eu.fbk.dkm.pikes.resources;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import com.google.common.base.Charsets;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.utils.core.Range;
import ixa.kaflib.Coref;
import ixa.kaflib.Dep;
import ixa.kaflib.Entity;
import ixa.kaflib.ExternalRef;
import ixa.kaflib.Factuality;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Opinion;
import ixa.kaflib.Opinion.OpinionExpression;
import ixa.kaflib.Opinion.OpinionHolder;
import ixa.kaflib.Opinion.OpinionTarget;
import ixa.kaflib.Predicate;
import ixa.kaflib.Predicate.Role;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import ixa.kaflib.Timex3;
import ixa.kaflib.WF;
public final class NAFUtils {
// Resource identifiers, i.e. values compared against ExternalRef.getResource(). Inside this
// class only PropBank / NomBank (roleset lookup) and BBN / wn30-ukb / wn30-sst (term attribute
// normalization) are referenced, always case-insensitively; the remaining identifiers are
// presumably consumed by other classes -- TODO confirm against callers.
public static final String RESOURCE_PROPBANK = "PropBank";
public static final String RESOURCE_NOMBANK = "NomBank";
public static final String RESOURCE_VERBNET = "VerbNet";
public static final String RESOURCE_FRAMENET = "FrameNet";
public static final String RESOURCE_BBN = "BBN";
public static final String RESOURCE_WN_SYNSET = "wn30-ukb";
public static final String RESOURCE_WN_SST = "wn30-sst";
public static final String RESOURCE_SUMO = "SUMO";
// NOTE(review): the "NAFFilter-*" names suggest these refs are produced by a NAFFilter
// component -- not visible in this file, confirm.
public static final String RESOURCE_ENTITY_REF = "NAFFilter-EntityRef";
public static final String RESOURCE_ENTITY_COREF = "NAFFilter-EntityCoref";
public static final String RESOURCE_PREDICATE_REF = "NAFFilter-PredicateRef";
public static final String RESOURCE_PREDICATE_COREF = "NAFFilter-PredicateCoref";
public static final String RESOURCE_TIMEX_REF = "NAFFilter-TimexRef";
public static final String RESOURCE_TIMEX_COREF = "NAFFilter-TimexCoref";
public static final String RESOURCE_VALUE = "value";
public static final String RESOURCE_YAGO = "Yago";
/**
 * Orders opinions by the offset of the first target term of their opinion expression span.
 * Comparing an opinion that lacks an opinion expression, or whose expression span has no
 * targets, fails with the exception raised by the underlying accessors.
 */
public static final Ordering<Opinion> OPINION_COMPARATOR = new Ordering<Opinion>() {

    @Override
    public int compare(final Opinion left, final Opinion right) {
        // Integer.compare instead of subtraction: cannot overflow
        return Integer.compare(firstOffset(left), firstOffset(right));
    }

    /** Returns the offset of the first term in the opinion expression span. */
    private int firstOffset(final Opinion opinion) {
        return opinion.getOpinionExpression().getSpan().getTargets().get(0).getOffset();
    }

};
/**
 * Matches word forms made up exclusively of characters other than ASCII letters and digits
 * (the empty string included). Used by {@code filterTerms} to discard such word forms.
 */
private static final Pattern WF_EXCLUSION_PATTERN = Pattern.compile("[^A-Za-z0-9]*");

/** Non-alphanumeric word forms that {@code filterTerms} keeps despite the exclusion pattern. */
private static final Set<String> SYMBOLS = ImmutableSet.of("$", "#", "&", "€");
/**
 * Normalizes the supplied document in place. For every term, the BBN tag, WordNet sense and
 * supersense tag attributes are turned into external refs (unless a ref for the same resource
 * is already present) and then cleared. Afterwards duplicate external refs are dropped from
 * every predicate and from every predicate role.
 *
 * @param document
 *            the document to normalize; modified by this method
 */
public static void normalize(final KAFDocument document) {

    // Step 1: term attributes -> external refs
    for (final Term current : document.getTerms()) {

        // Detect which resources are already covered by an external ref of the term
        boolean bbnPresent = false;
        boolean synsetPresent = false;
        boolean sstPresent = false;
        for (final ExternalRef existing : current.getExternalRefs()) {
            final String resource = existing.getResource();
            if (RESOURCE_BBN.equalsIgnoreCase(resource)) {
                bbnPresent = true;
            }
            if (RESOURCE_WN_SYNSET.equalsIgnoreCase(resource)) {
                synsetPresent = true;
            }
            if (RESOURCE_WN_SST.equalsIgnoreCase(resource)) {
                sstPresent = true;
            }
        }

        // Add a ref for each attribute that is set and not yet represented
        if (!bbnPresent) {
            final String bbn = current.getBBNTag();
            if (bbn != null) {
                current.addExternalRef(document.newExternalRef(RESOURCE_BBN, bbn));
            }
        }
        if (!synsetPresent) {
            final String synset = current.getWordnetSense();
            if (synset != null) {
                current.addExternalRef(document.newExternalRef(RESOURCE_WN_SYNSET, synset));
            }
        }
        if (!sstPresent) {
            final String sst = current.getSupersenseTag();
            if (sst != null) {
                current.addExternalRef(document.newExternalRef(RESOURCE_WN_SST, sst));
            }
        }

        // The attributes are always cleared, whether or not a ref was added
        current.setBBNTag(null);
        current.setWordnetSense(null);
        current.setSupersenseTag(null);
    }

    // Step 2: drop duplicate external refs of predicates and roles
    for (final Predicate pred : document.getPredicates()) {
        normalizeRefs(pred.getExternalRefs());
        for (final Role arg : pred.getRoles()) {
            normalizeRefs(arg.getExternalRefs());
        }
    }
}
/**
 * Returns the supplied terms, sorted by offset, after removing (i) leading terms whose POS
 * starts with {@code D} or {@code P} (case-insensitive; presumably determiners and
 * prepositions in the KAF tag set -- confirm) and (ii) terms none of whose word forms is
 * either a known symbol or contains at least one ASCII letter or digit.
 *
 * @param terms
 *            the terms to filter, in any order
 * @return a new list with the retained terms, sorted by offset
 */
public static List<Term> filterTerms(final Iterable<Term> terms) {
    final List<Term> result = Lists.newArrayList();
    boolean atBeginning = true;
    for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
        if (atBeginning) {
            // A missing or empty POS is treated as 'not D / P' rather than failing
            final String pos = term.getPos();
            final char tag = pos == null || pos.isEmpty() ? '\0' : Character.toUpperCase(pos
                    .charAt(0));
            if (tag == 'D' || tag == 'P') {
                continue;
            }
        }
        for (final WF word : term.getWFs()) {
            final String text = word.getForm();
            if (SYMBOLS.contains(text) || !WF_EXCLUSION_PATTERN.matcher(text).matches()) {
                result.add(term);
                atBeginning = false; // from now on D / P terms are kept
                break;
            }
        }
    }
    return result;
}
/**
 * Builds a space-separated string from the word forms of the supplied terms, visited in
 * offset order. Word forms are lower-cased unless the morphofeat of their term starts with
 * {@code NNP}.
 *
 * @param terms
 *            the terms to render, in any order
 * @return the resulting text, empty if there are no word forms
 */
public static String getText(final Iterable<Term> terms) {
    final StringBuilder builder = new StringBuilder();
    for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
        // A term without morphofeat is handled as a non-proper noun instead of failing
        final String morphofeat = term.getMorphofeat();
        final boolean properNoun = morphofeat != null && morphofeat.startsWith("NNP");
        for (final WF word : term.getWFs()) {
            if (builder.length() > 0) {
                builder.append(' ');
            }
            // Locale.ROOT keeps the lower-casing independent of the JVM default locale
            builder.append(properNoun ? word.getForm() : word.getForm().toLowerCase(
                    Locale.ROOT));
        }
    }
    return builder.toString();
}
/**
 * Returns the head term of the supplied span, as computed by
 * {@code KAFDocument.getTermsHead} on the span targets.
 *
 * @param document
 *            the document the span belongs to
 * @param span
 *            the span, possibly null
 * @return the computed head, or null if the span is null
 */
@Nullable
public static Term extractHead(final KAFDocument document, @Nullable final Span<Term> span) {
    if (span == null) {
        return null;
    }
    // TODO: the head stored in the span (span.getHead()) is deliberately not used for now;
    // the head is always (re)computed from the targets
    return document.getTermsHead(span.getTargets());
}
/**
 * Extracts the head terms reachable from a set of starting terms by walking the dependency
 * tree downwards (see {@code extractHeadsHelper}), optionally restricted to a span.
 *
 * @param document
 *            the document
 * @param ancestors
 *            the terms to start from; if null, the terms of {@code span} whose dependency
 *            parent is missing or lies outside {@code span} are used
 * @param span
 *            if not null, only heads contained in this span are returned
 * @param predicate
 *            if not null, only terms satisfying it are accepted as heads
 * @return a new mutable set with the heads found; empty if both {@code ancestors} and
 *         {@code span} are null
 */
public static Set<Term> extractHeads(final KAFDocument document,
        @Nullable final Iterable<Term> ancestors, @Nullable final Iterable<Term> span,
        @Nullable final java.util.function.Predicate<Term> predicate) {

    // Determine the terms to start the visit from
    final Set<Term> startTerms;
    if (ancestors != null) {
        startTerms = ImmutableSet.copyOf(ancestors);
    } else if (span != null) {
        startTerms = Sets.newHashSet();
        final Set<Term> spanTerms = Sets.newHashSet(span);
        for (final Term term : spanTerms) {
            final Dep dep = document.getDepToTerm(term);
            if (dep == null || !spanTerms.contains(dep.getFrom())) {
                startTerms.add(term);
            }
        }
    } else {
        // Both arguments are declared @Nullable: nothing to start from, nothing to return
        startTerms = ImmutableSet.of();
    }

    // Visit the dependency tree below each starting term
    final Set<Term> result = Sets.newHashSet();
    for (final Term start : startTerms) {
        extractHeadsHelper(document, start, predicate, result);
    }

    // Discard heads outside the span, if one was given
    if (span != null) {
        result.retainAll(ImmutableSet.copyOf(span));
    }
    return result;
}
/**
 * Recursive step of {@code extractHeads}. For a term whose extended POS starts with
 * {@code V}, the SRL head is tried first; otherwise (or if that fails) the term itself is
 * accepted when it satisfies the predicate. If the term is accepted, the visit continues only
 * along dependencies whose function contains {@code COORD} (case-insensitive); if it is
 * rejected, the visit continues along all its outgoing dependencies.
 *
 * @param document
 *            the document
 * @param term
 *            the term to visit
 * @param predicate
 *            the acceptance test, null to accept any term
 * @param result
 *            the collection accepted heads are added to
 * @return true if a head was accepted for this term
 */
private static boolean extractHeadsHelper(final KAFDocument document, final Term term,
        @Nullable final java.util.function.Predicate<Term> predicate,
        final Collection<Term> result) {

    boolean accepted = false;

    // For verbs, prefer the SRL head (end of the VC / IM chain) over the syntactic head
    if (extendedPos(document, term).startsWith("V")) {
        final Term srlHead = syntacticToSRLHead(document, term);
        if (!term.equals(srlHead)) {
            accepted = extractHeadsHelper(document, srlHead, predicate, result);
        }
    }

    if (!accepted && (predicate == null || predicate.test(term))) {
        result.add(term);
        accepted = true;
    }

    // Accepted: follow coordination only; rejected: look for heads among all the children
    for (final Dep dep : document.getDepsFromTerm(term)) {
        if (!accepted || dep.getRfunc().toUpperCase(Locale.ROOT).contains("COORD")) {
            extractHeadsHelper(document, dep.getTo(), predicate, result);
        }
    }
    return accepted;
}
public static boolean hasHead(final KAFDocument document, final Object annotation,
final Term head) {
List> spans;
if (annotation instanceof Coref) {
spans = ((Coref) annotation).getSpans();
} else if (annotation instanceof Entity) {
spans = ((Entity) annotation).getSpans();
} else if (annotation instanceof Timex3) {
spans = ImmutableList.of(KAFDocument.newTermSpan(document
.getTermsByWFs(((Timex3) annotation).getSpan().getTargets())));
} else if (annotation instanceof Predicate) {
spans = ImmutableList.of(((Predicate) annotation).getSpan());
} else if (annotation instanceof Role) {
spans = ImmutableList.of(((Role) annotation).getSpan());
} else {
throw new IllegalArgumentException("Unsupported annotation: " + annotation);
}
for (final Span span : spans) {
if (head == extractHead(document, span)) {
return true;
}
}
return false;
}
public static Span getNominalSpan(final KAFDocument document, final Term term,
final boolean includeCoord, final boolean includeModifiers) {
// Start from the supplied term
final Set terms = Sets.newHashSet(term);
// Identify head and terms of all NE and TIMEX markables containing supplied term
final Map> markables = Maps.newHashMap();
for (final Entity entity : document.getEntitiesByTerm(term)) {
markables.put(document.getTermsHead(entity.getTerms()), entity.getTerms());
}
for (final WF wf : term.getWFs()) {
for (final Timex3 timex : document.getTimeExsByWF(wf)) {
final List span = document.getTermsByWFs(timex.getSpan().getTargets());
markables.put(document.getTermsHead(span), span);
}
}
// Add the terms of the smallest markable 'matching' the term (i.e., whose head matches
// the term or a term ancestor in the dependency tree)
if (!markables.isEmpty()) {
Term t = term;
while (true) {
final List parent = markables.get(t);
if (parent != null) {
terms.addAll(parent);
break;
}
final Dep dep = document.getDepToTerm(t);
if (dep == null) {
break;
}
t = dep.getFrom();
}
}
// Identify head
final Term head = document.getTermsHead(terms);
// Add all terms reachable from the head using a regex
final String regex = includeCoord ? includeModifiers ? "(COORD CONJ?)* ((NAME|NMOD|AMOD|TMP) .*)?"
: "(COORD CONJ?)* NAME"
: includeModifiers ? "((NAME|NMOD|AMOD|TMP) .*)?" : "NAME";
terms.addAll(document.getTermsByDepAncestors(Collections.singleton(head), regex));
// Sort obtained terms by offset and return resulting list
return KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms),
head);
}
/**
 * Extracts the lemma part of a roleset or role string, i.e. the text preceding the first
 * {@code '.'} or, if there is none, the first {@code '@'} (the whole string if neither
 * occurs), converted to lower case.
 *
 * @param rolesetOrRole
 *            the roleset or role string, possibly null
 * @return the lower-cased lemma, or null if the argument is null
 */
@Nullable
public static String extractLemma(@Nullable final String rolesetOrRole) {
    if (rolesetOrRole == null) {
        return null;
    }
    int index = rolesetOrRole.indexOf('.');
    if (index < 0) {
        index = rolesetOrRole.indexOf('@');
    }
    final String lemma = index >= 0 ? rolesetOrRole.substring(0, index) : rolesetOrRole;
    // Locale.ROOT: the result must not depend on the JVM default locale (e.g. Turkish 'I')
    return lemma.toLowerCase(Locale.ROOT);
}
/**
 * Extracts the numeric sense from a roleset or role string, i.e. the integer found between
 * the first {@code '.'} (or the start of the string, if there is none) and the first
 * {@code '@'} at a position greater than zero (or the end of the string).
 *
 * @param rolesetOrRole
 *            the roleset or role string, possibly null
 * @return the sense number, or null if the argument is null or holds no parseable integer at
 *         the expected position
 */
@Nullable
public static Integer extractSense(@Nullable final String rolesetOrRole) {
    if (rolesetOrRole == null) {
        return null;
    }
    final int start = rolesetOrRole.indexOf('.') + 1; // 0 when there is no '.'
    final int at = rolesetOrRole.indexOf('@');
    final int end = at > 0 ? at : rolesetOrRole.length();
    if (start >= end) {
        return null; // empty sense, or '@' preceding the '.'
    }
    try {
        return Integer.valueOf(rolesetOrRole.substring(start, end));
    } catch (final NumberFormatException ex) {
        // Not a number: there is no sense to report (errors are no longer swallowed)
        return null;
    }
}
/**
 * Parses the number formed by the trailing digits of a role string.
 *
 * @param role
 *            the role string, possibly null
 * @return the integer spelled by the trailing run of digits, or null if the argument is null
 *         or does not end with a digit
 */
@Nullable
public static Integer extractArgNum(@Nullable final String role) {
    if (role == null) {
        return null;
    }
    final int length = role.length();

    // Scan backwards to locate where the trailing run of digits begins
    int digitsStart = length;
    for (int i = length - 1; i >= 0 && Character.isDigit(role.charAt(i)); --i) {
        digitsStart = i;
    }

    if (digitsStart == length) {
        return null;
    }
    return Integer.valueOf(role.substring(digitsStart));
}
// OFFSETS
/**
 * Returns the begin offset of a term, which is the offset reported by the term itself.
 *
 * @param term
 *            the term
 * @return the term offset
 */
public static int getBegin(final Term term) {
    final int begin = term.getOffset();
    return begin;
}
/**
 * Returns the end offset of a term, computed on its last word form as offset plus length.
 * The forms {@code -LSB-}, {@code -RSB-} and {@code ''} are counted as one character long
 * (presumably tokenizer escapes of single-character source text -- confirm).
 *
 * @param term
 *            the term, with at least one word form
 * @return the end offset of the term
 */
public static int getEnd(final Term term) {
    final List<WF> wfs = term.getWFs();
    final WF last = wfs.get(wfs.size() - 1);
    final String form = last.getForm();
    // Constant-first equals: a word form without text does not cause a failure here
    final boolean singleChar = "-LSB-".equals(form) || "-RSB-".equals(form)
            || "''".equals(form);
    return last.getOffset() + (singleChar ? 1 : last.getLength());
}
/**
 * Returns the length of a term as the distance between its end offset (see {@code getEnd})
 * and its own offset.
 *
 * @param term
 *            the term
 * @return the end offset minus the term offset
 */
public static int getLength(final Term term) {
    final int end = getEnd(term);
    final int begin = term.getOffset();
    return end - begin;
}
/**
 * Returns the roleset of a predicate, read from its external refs. PropBank refs are
 * considered if the POS of the first predicate term is {@code V} (case-insensitive), NomBank
 * refs otherwise. The first matching ref with a non-null source wins; failing that, the
 * first matching ref with a non-null reference is used.
 *
 * @param predicate
 *            the predicate, with at least one term
 * @return the roleset, or null if no suitable external ref exists
 */
@Nullable
public static String getRoleset(final Predicate predicate) {
    // Constant-first comparison: a term without POS is handled as non-verbal, not as an NPE
    final boolean verbal = "V".equalsIgnoreCase(predicate.getTerms().get(0).getPos());
    final String resource = verbal ? RESOURCE_PROPBANK : RESOURCE_NOMBANK;
    String fallback = null;
    for (final ExternalRef ref : predicate.getExternalRefs()) {
        if (!resource.equalsIgnoreCase(ref.getResource())) {
            continue;
        }
        if (ref.getSource() != null) {
            return ref.getReference(); // refs carrying a source take precedence
        }
        if (fallback == null) {
            fallback = ref.getReference();
        }
    }
    return fallback;
}
// EXTERNAL REFS
/**
 * Returns the unique external ref of an annotation matching the given resource and
 * reference.
 *
 * @param annotation
 *            the annotation object; null yields a null result
 * @param resource
 *            the resource to match (case-insensitive), null to match any resource
 * @param reference
 *            the reference to match (exact), null to match any reference
 * @return the matching external ref, or null if there is none
 * @throws IllegalStateException
 *             if more than one external ref matches
 * @throws IllegalArgumentException
 *             if the annotation is not null and its type does not support external refs
 */
@Nullable
public static ExternalRef getRef(@Nullable final Object annotation,
        @Nullable final String resource, @Nullable final String reference) {
    if (annotation == null) {
        return null; // declared @Nullable: nothing to search rather than an exception
    }
    ExternalRef match = null;
    for (final ExternalRef candidate : getRefs(annotation)) {
        if (!matchRef(candidate, resource, reference)) {
            continue;
        }
        if (match != null) {
            throw new IllegalStateException("Multiple ExternalRef matched for resource "
                    + resource + ", reference " + reference + ": " + candidate.getReference()
                    + ", " + match.getReference());
        }
        match = candidate;
    }
    return match;
}
/**
 * Returns all the external refs of an annotation matching the given resource and reference.
 *
 * @param annotation
 *            the annotation object
 * @param resource
 *            the resource to match (case-insensitive), null to match any resource
 * @param reference
 *            the reference to match (exact), null to match any reference
 * @return a new list with the matching external refs, in their original order
 * @throws IllegalArgumentException
 *             if the annotation type does not support external refs
 */
public static List<ExternalRef> getRefs(final Object annotation,
        @Nullable final String resource, @Nullable final String reference) {
    final List<ExternalRef> matches = Lists.newArrayList();
    final List<ExternalRef> all = getRefs(annotation);
    for (final ExternalRef ref : all) {
        if (matchRef(ref, resource, reference)) {
            matches.add(ref);
        }
    }
    return matches;
}
/**
 * Removes from an annotation all the external refs matching the given resource and
 * reference. The list of refs returned by the annotation is modified directly.
 *
 * @param annotation
 *            the annotation object
 * @param resource
 *            the resource to match (case-insensitive), null to match any resource
 * @param reference
 *            the reference to match (exact), null to match any reference
 * @throws IllegalArgumentException
 *             if the annotation type does not support external refs
 */
public static void removeRefs(final Object annotation, @Nullable final String resource,
        @Nullable final String reference) {
    final List<ExternalRef> refs = getRefs(annotation);
    refs.removeIf(ref -> matchRef(ref, resource, reference));
}
/**
 * Appends an external ref to the list of refs of an annotation, without checking for
 * duplicates.
 *
 * @param annotation
 *            the annotation object
 * @param ref
 *            the external ref to add
 * @throws IllegalArgumentException
 *             if the annotation type does not support external refs
 */
public static void addRef(final Object annotation, final ExternalRef ref) {
    final List<ExternalRef> target = getRefs(annotation);
    target.add(ref);
}
/**
 * Adds an external ref to an annotation after removing any existing ref with the same
 * resource (case-insensitive) and the same reference.
 *
 * @param annotation
 *            the annotation object
 * @param ref
 *            the external ref to set
 * @throws IllegalArgumentException
 *             if the annotation type does not support external refs
 */
public static void setRef(final Object annotation, final ExternalRef ref) {
    final String resource = ref.getResource();
    final String reference = ref.getReference();
    removeRefs(annotation, resource, reference);
    addRef(annotation, ref);
}
/**
 * Returns a short description of an annotation object, made of a type label followed (where
 * available) by the annotation ID and its quoted text.
 *
 * @param annotation
 *            the annotation object
 * @return the description
 * @throws IllegalArgumentException
 *             if the annotation type is not supported
 */
public static String toString(final Object annotation) {
    if (annotation instanceof Term) {
        final Term t = (Term) annotation;
        return "term " + t.getId() + " '" + t + "'";
    }
    if (annotation instanceof Entity) {
        final Entity e = (Entity) annotation;
        return "entity " + e.getId() + " '" + e.getStr() + "'";
    }
    if (annotation instanceof Timex3) {
        final Timex3 tx = (Timex3) annotation;
        return "timex " + tx.getId() + " '" + tx.getSpan().getStr() + "'";
    }
    if (annotation instanceof Predicate) {
        final Predicate p = (Predicate) annotation;
        return "predicate " + p.getId() + " '" + p.getSpan().getStr() + "'";
    }
    if (annotation instanceof Role) {
        final Role r = (Role) annotation;
        return "role " + r.getId() + " '" + r.getStr() + "' (" + r.getSemRole() + ")";
    }
    if (annotation instanceof Opinion) {
        final Opinion o = (Opinion) annotation;
        return "opinion " + o.getId();
    }
    if (annotation instanceof OpinionTarget) {
        final OpinionTarget ot = (OpinionTarget) annotation;
        return "opinion target '" + ot.getSpan().getStr() + "'";
    }
    if (annotation instanceof OpinionHolder) {
        final OpinionHolder oh = (OpinionHolder) annotation;
        return "opinion holder '" + oh.getSpan().getStr() + "'";
    }
    if (annotation instanceof OpinionExpression) {
        final OpinionExpression oe = (OpinionExpression) annotation;
        return "opinion expression '" + oe.getSpan().getStr() + "'";
    }
    if (annotation instanceof Factuality) {
        final Factuality f = (Factuality) annotation;
        return "factuality " + f.getId() + " '" + f.getWord().getStr() + "'";
    }
    if (annotation instanceof Coref) {
        final Coref c = (Coref) annotation;
        return "coref " + c.getId();
    }
    throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
}
/**
 * Returns the list of external refs held by an annotation object, exactly as returned by the
 * object itself (no copy is made, so the public helpers of this class modify it directly).
 * Supported annotations are {@code Term}, {@code Entity}, {@code Predicate}, {@code Role},
 * {@code Opinion}, {@code OpinionExpression}, {@code OpinionTarget} and {@code OpinionHolder}.
 *
 * @param annotation
 *            the annotation object
 * @return the external refs of the annotation
 * @throws IllegalArgumentException
 *             if the annotation is null or of an unsupported type
 */
private static List<ExternalRef> getRefs(final Object annotation) {
    if (annotation instanceof Term) {
        return ((Term) annotation).getExternalRefs();
    } else if (annotation instanceof Entity) {
        return ((Entity) annotation).getExternalRefs();
    } else if (annotation instanceof Predicate) {
        return ((Predicate) annotation).getExternalRefs();
    } else if (annotation instanceof Role) {
        return ((Role) annotation).getExternalRefs();
    } else if (annotation instanceof Opinion) {
        return ((Opinion) annotation).getExternalRefs();
    } else if (annotation instanceof OpinionExpression) {
        return ((OpinionExpression) annotation).getExternalRefs();
    } else if (annotation instanceof OpinionTarget) {
        return ((OpinionTarget) annotation).getExternalRefs();
    } else if (annotation instanceof OpinionHolder) {
        return ((OpinionHolder) annotation).getExternalRefs();
    }
    throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
}
/**
 * Tests an external ref against an optional resource (compared ignoring case) and an
 * optional reference (compared exactly). A null criterion matches anything.
 *
 * @param ref
 *            the external ref to test
 * @param resource
 *            the required resource, or null
 * @param reference
 *            the required reference, or null
 * @return true if the ref satisfies both criteria
 */
private static boolean matchRef(final ExternalRef ref, @Nullable final String resource,
        @Nullable final String reference) {
    if (resource != null && !resource.equalsIgnoreCase(ref.getResource())) {
        return false;
    }
    if (reference != null && !reference.equals(ref.getReference())) {
        return false;
    }
    return true;
}
/**
 * Removes duplicates from a collection of external refs, in place. Two refs are duplicates
 * when the string {@code resource + "|" + reference} is the same for both; the first one in
 * iteration order is kept.
 *
 * @param refs
 *            the modifiable collection of external refs to clean up
 */
private static void normalizeRefs(final Collection<ExternalRef> refs) {
    final Set<String> seen = Sets.newHashSet();
    for (final Iterator<ExternalRef> i = refs.iterator(); i.hasNext();) {
        final ExternalRef ref = i.next();
        final String key = ref.getResource() + "|" + ref.getReference();
        if (!seen.add(key)) {
            i.remove(); // key already encountered
        }
    }
}
/**
 * Groups the supplied terms into ranges of consecutive term indexes, where the index of a
 * term is its position in {@code document.getTerms()}. Each range goes from the index of the
 * first term of a run to one past the index of its last term.
 *
 * @param document
 *            the document the terms belong to
 * @param terms
 *            the terms, in any order
 * @return the list of index ranges, following the offset order of the terms
 */
public static List<Range> termRangesFor(final KAFDocument document,
        final Iterable<Term> terms) {
    final List<Range> ranges = Lists.newArrayList();
    final List<Term> docTerms = document.getTerms(); // fetched once, not at each iteration
    int startIndex = -1;
    int lastIndex = -2;
    for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
        final int termIndex = docTerms.indexOf(term);
        if (termIndex - lastIndex > 1) {
            // Gap found: close the pending run (if any) and open a new one
            if (startIndex >= 0) {
                ranges.add(Range.create(startIndex, lastIndex + 1));
            }
            startIndex = termIndex;
        }
        lastIndex = termIndex;
    }
    if (startIndex != -1 && lastIndex >= startIndex) {
        ranges.add(Range.create(startIndex, lastIndex + 1));
    }
    return ranges;
}
/**
 * Groups the supplied terms into character ranges, one for each run of terms that are
 * consecutive in {@code document.getTerms()}. Each range goes from the offset of the first
 * term of the run to the end offset (see {@code getEnd}) of its last term.
 *
 * @param document
 *            the document the terms belong to
 * @param terms
 *            the terms, in any order
 * @return the list of character ranges, following the offset order of the terms
 */
public static List<Range> rangesFor(final KAFDocument document, final Iterable<Term> terms) {
    final List<Range> ranges = Lists.newArrayList();
    final List<Term> docTerms = document.getTerms(); // fetched once, not at each iteration
    int startOffset = -1;
    int endOffset = -1;
    int termIndex = -2;
    for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
        final int lastTermIndex = termIndex;
        termIndex = docTerms.indexOf(term);
        if (termIndex - lastTermIndex > 1) {
            // Gap found: close the pending run (if any) and open a new one
            if (startOffset != -1) {
                ranges.add(Range.create(startOffset, endOffset));
            }
            startOffset = term.getOffset();
        }
        endOffset = NAFUtils.getEnd(term);
    }
    if (startOffset != -1 && endOffset > startOffset) {
        ranges.add(Range.create(startOffset, endOffset));
    }
    return ranges;
}
/**
 * Returns the character range of a single term, from its begin offset to its end offset.
 *
 * @param term
 *            the term
 * @return the range built from {@code getBegin(term)} and {@code getEnd(term)}
 */
public static Range rangeFor(final Term term) {
    final int begin = getBegin(term);
    final int end = getEnd(term);
    return Range.create(begin, end);
}
/**
 * Returns the smallest character range enclosing all the supplied terms, i.e. from the
 * minimum begin offset to the maximum end offset.
 *
 * @param terms
 *            the terms; NOTE(review): with no terms the range is created from
 *            {@code Integer.MAX_VALUE} and {@code Integer.MIN_VALUE} -- how
 *            {@code Range.create} reacts to that is not visible here, confirm
 * @return the enclosing range
 */
public static Range rangeFor(final Iterable<Term> terms) {
    int begin = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    for (final Term term : terms) {
        begin = Math.min(begin, getBegin(term));
        end = Math.max(end, getEnd(term));
    }
    return Range.create(begin, end);
}
/**
 * Restricts a span to the terms belonging to the given sentence.
 *
 * @param span
 *            the span to trim, possibly null
 * @param sentenceID
 *            the ID of the sentence whose terms must be kept
 * @return null if the span is null or empty; the span itself if all its targets are in the
 *         sentence; otherwise a new span with the targets and the heads of the original span
 *         that are in the sentence
 */
@Nullable
public static Span<Term> trimSpan(@Nullable final Span<Term> span, final int sentenceID) {
    if (span == null || span.isEmpty()) {
        return null;
    }

    // Single pass: collect the targets lying in the requested sentence
    final List<Term> targets = span.getTargets();
    final List<Term> filteredTerms = Lists.newArrayList();
    for (final Term term : targets) {
        if (term.getSent() == sentenceID) {
            filteredTerms.add(term);
        }
    }

    // Nothing was dropped: the original span can be returned as is
    if (filteredTerms.size() == targets.size()) {
        return span;
    }

    final Span<Term> result = KAFDocument.newTermSpan(filteredTerms);
    for (final Term head : span.getHeads()) {
        if (head.getSent() == sentenceID) {
            result.getHeads().add(head);
        }
    }
    return result;
}
// Span methods
public static Span normalizeSpan(final KAFDocument document,
@Nullable final Span span) {
// Handle null and empty span
if (span == null || Iterables.isEmpty(span.getTargets())) {
return KAFDocument.newTermSpan();
}
// Identify all the 'root' terms in the span whose dep tree parent is outside the span
final Set roots = Sets.newHashSet();
final Set terms = ImmutableSet.copyOf(span.getTargets());
for (final Term term : terms) {
final Dep dep = document.getDepToTerm(term);
if (dep == null || !terms.contains(dep.getFrom())) {
roots.add(term);
}
}
// If only one 'root', return the normalized span having that root as the head
if (roots.size() == 1) {
return KAFDocument.newTermSpan(span.getTargets(), roots.iterator().next());
}
// Otherwise, look for the closest head outside the span. First compute all the paths from
// the dep tree roots to the 'root' terms identified before
final List> paths = Lists.newArrayList();
for (final Term root : roots) {
final List path = Lists.newArrayList(root);
for (Dep dep = document.getDepToTerm(root); dep != null; dep = document
.getDepToTerm(dep.getFrom())) {
path.add(dep.getFrom());
}
Collections.reverse(path);
paths.add(path);
}
// Then look for the deepest node common to all those paths
int depth = 0;
Term externalHead = null;
outer: for (; depth < paths.get(0).size(); ++depth) {
final Term t = paths.get(0).get(depth);
for (int i = 1; i < paths.size(); ++i) {
final List path = paths.get(i);
if (depth >= path.size() || !path.get(depth).equals(t)) {
break outer;
}
}
externalHead = t;
}
// If found, compute the terms for the external span
Set externalTerms = null;
if (externalHead != null) {
externalTerms = Sets.newHashSet(terms);
externalTerms.add(externalHead);
for (final List path : paths) {
externalTerms.addAll(path.subList(depth, path.size()));
}
}
// Now look for the internal head that covers the most part terms of the span. Start by
// associating to each candidate internal head the terms it would cover
final Multimap map = HashMultimap.create();
for (final Term term : terms) {
Dep dep = document.getDepToTerm(term);
if (dep == null) {
map.put(term, term);
} else {
for (; dep != null; dep = document.getDepToTerm(dep.getFrom())) {
if (!terms.contains(dep.getFrom())) {
map.put(dep.getTo(), term);
break;
}
}
}
}
// Then identify the best internal head
Term internalHead = null;
Collection internalTerms = null;
for (final Map.Entry> entry : map.asMap().entrySet()) {
if (internalHead == null || entry.getValue().size() >= internalTerms.size()) {
internalTerms = entry.getValue();
internalHead = entry.getKey();
}
}
// Return either the external span (if defined) or the internal one, based on which one is
// most similar in size to the original span (if equal, prefer external one).
if (externalTerms != null
&& externalTerms.size() - terms.size() <= terms.size() - internalTerms.size()) {
return KAFDocument.newTermSpan(
Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(externalTerms), externalHead);
} else {
return KAFDocument.newTermSpan(
Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(internalTerms), internalHead);
}
}
public static List> mergeSpans(final KAFDocument document,
final Iterable> spans, final boolean canAddTerms) {
// Build a map associating to each span head the other heads it is coordinated with
final Map> extents = Maps.newHashMap();
final Map> clusters = Maps.newHashMap();
for (final Span span : spans) {
final Term head = extractHead(document, span);
clusters.put(head, Sets.newHashSet(head));
extents.put(head, span.getTargets());
}
for (final Term head : clusters.keySet()) {
for (Dep dep = document.getDepToTerm(head); dep != null
&& ("CONJ".equals(dep.getRfunc()) || "COORD".equals(dep.getRfunc())); dep = document
.getDepToTerm(dep.getFrom())) {
if (clusters.keySet().contains(dep.getFrom())) {
clusters.get(head).add(dep.getFrom());
clusters.get(dep.getFrom()).add(head);
} else if ("CO".indexOf(dep.getFrom().getPos()) < 0) {
break; // don't include intermediate terms that are not conjunctions or commas
}
}
}
// Create a span for each cluster of heads, including intermediate conjunctions
final List> result = Lists.newArrayList();
while (!clusters.isEmpty()) {
final Set heads = clusters.values().iterator().next();
final Set terms = Sets.newHashSet();
Term spanHead = heads.iterator().next();
for (final Term head : heads) {
clusters.remove(head);
terms.addAll(extents.get(head));
final List path = Lists.newArrayList();
for (Dep dep = document.getDepToTerm(head); dep != null; dep = document
.getDepToTerm(dep.getFrom())) {
final Term term = dep.getFrom();
path.add(term);
if (heads.contains(term)) {
terms.addAll(path);
path.clear();
spanHead = term;
}
}
}
List spanTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms);
if (canAddTerms) {
final List docTerms = document.getTerms();
spanTerms = Lists.newArrayList(docTerms.subList(
docTerms.indexOf(spanTerms.get(0)),
docTerms.indexOf(spanTerms.get(spanTerms.size() - 1)) + 1));
}
result.add(KAFDocument.newTermSpan(spanTerms, spanHead));
}
return result;
}
public static final List> splitSpans(final KAFDocument document,
final Iterable> spans) {
// Identify all the heads taking coordination into consideration
final Set heads = Sets.newHashSet();
final Set terms = Sets.newHashSet();
for (final Span span : spans) {
final Term head = extractHead(document, span);
heads.add(head);
terms.addAll(span.getTargets());
final List queue = Lists.newLinkedList();
queue.add(head);
while (!queue.isEmpty()) {
final Term term = queue.remove(0);
for (final Dep dep : document.getDepsFromTerm(term)) {
final String func = dep.getRfunc();
if ("COORD".equals(func) || "CONJ".equals(func)) {
final Term t = dep.getTo();
queue.add(t);
if ("CC".equals(t.getMorphofeat())
|| !Character.isLetter(t.getMorphofeat().charAt(0))) {
heads.add(term);
}
}
}
}
}
// Build and return a span for each head
final Set excluded = document.getTermsByDepDescendants(heads);
final List> result = Lists.newArrayList();
for (final Term head : heads) {
final Set extent = document.getTermsByDepAncestors(ImmutableSet.of(head));
extent.removeAll(excluded);
extent.add(head);
extent.retainAll(terms);
if (!extent.isEmpty()) {
result.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
.sortedCopy(extent), head));
}
}
return result;
}
public static final List> splitSpan(final KAFDocument document,
final Span span, final Iterable heads) {
final Set excludedTerms = document.getTermsByDepDescendants(heads);
final List> spans = Lists.newArrayList();
for (final Term head : heads) {
final Set terms = document.getTermsByDepAncestors(ImmutableSet.of(head));
terms.removeAll(excludedTerms);
terms.add(head);
terms.retainAll(span.getTargets());
if (!terms.isEmpty()) {
spans.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
.sortedCopy(terms), head));
}
}
return spans;
}
// End
/**
 * Reads a document either from a file or, when no path is given, from standard input
 * (decoded as UTF-8). The public ID of the document is set to the path string, or to the
 * empty string when reading from standard input.
 *
 * @param path
 *            the file to read, null to read from standard input
 * @return the document read
 * @throws IOException
 *             on I/O failure
 */
public static KAFDocument readDocument(@Nullable final Path path) throws IOException {

    // No path: parse whatever arrives on standard input
    if (path == null) {
        final KAFDocument fromStdin = KAFDocument.createFromStream(IO.utf8Reader(IO
                .buffer(System.in)));
        fromStdin.getPublic().publicId = "";
        return fromStdin;
    }

    // Otherwise parse the file, closing the reader when done
    try (BufferedReader input = Files.newBufferedReader(path)) {
        final KAFDocument fromFile = KAFDocument.createFromStream(input);
        fromFile.getPublic().publicId = path.toString();
        return fromFile;
    }
}
/**
 * Writes the string form of a document, encoded as UTF-8, either to the given location or,
 * when no location is given, to standard output.
 *
 * @param document
 *            the document to write
 * @param location
 *            the destination, null to write to standard output
 * @throws IOException
 *             on I/O failure
 */
public static void writeDocument(final KAFDocument document, @Nullable final Path location)
        throws IOException {

    // No location: dump the UTF-8 bytes on standard output
    if (location == null) {
        final byte[] bytes = document.toString().getBytes(Charsets.UTF_8);
        System.out.write(bytes);
        return;
    }

    // Otherwise write through a buffered UTF-8 writer, closed when done
    final String destination = location.toString();
    try (Writer output = IO.utf8Writer(IO.buffer(IO.write(destination)))) {
        output.write(document.toString());
    }
}
/**
 * Maps a syntactic head to the corresponding SRL head by following, recursively, the first
 * outgoing dependency labelled {@code VC} or {@code IM} down to a term having none.
 *
 * @param document
 *            the document
 * @param term
 *            the syntactic head
 * @return the term at the end of the {@code VC} / {@code IM} chain, which is {@code term}
 *         itself if it has no such dependency
 */
public static Term syntacticToSRLHead(final KAFDocument document, final Term term) {
    for (final Dep outgoing : document.getDepsFromTerm(term)) {
        final String label = outgoing.getRfunc();
        final boolean chainLink = "VC".equals(label) || "IM".equals(label);
        if (chainLink) {
            final Term next = outgoing.getTo();
            return syntacticToSRLHead(document, next);
        }
    }
    return term;
}
/**
 * Maps an SRL head to the corresponding syntactic head by climbing, recursively, the
 * incoming dependency as long as it is labelled {@code VC} or {@code IM}.
 *
 * @param document
 *            the document
 * @param term
 *            the SRL head
 * @return the term at the top of the {@code VC} / {@code IM} chain, which is {@code term}
 *         itself if its incoming dependency is missing or labelled differently
 */
public static Term srlToSyntacticHead(final KAFDocument document, final Term term) {
    final Dep incoming = document.getDepToTerm(term);
    if (incoming == null) {
        return term;
    }
    final String label = incoming.getRfunc();
    if (!"VC".equals(label) && !"IM".equals(label)) {
        return term;
    }
    return srlToSyntacticHead(document, incoming.getFrom());
}
/**
 * Returns the morphofeat of a term, with a {@code P} appended when the term is used as a
 * demonstrative pronoun. This is the case when the lemma is one of <i>some, many, all, few,
 * this, these, that, those</i> (case-insensitive) and the term is not attached to its
 * dependency parent through an {@code NMOD} relation.
 *
 * @param document
 *            the document
 * @param term
 *            the term
 * @return the (possibly extended) POS tag
 */
public static String extendedPos(final KAFDocument document, final Term term) {
    final String pos = term.getMorphofeat();

    // Locale.ROOT: under e.g. a Turkish default locale "THIS" would not lower-case to "this"
    final boolean candidate;
    switch (term.getLemma().toLowerCase(Locale.ROOT)) {
    case "some":
    case "many":
    case "all":
    case "few":
    case "this":
    case "these":
    case "that":
    case "those":
        candidate = true;
        break;
    default:
        candidate = false;
        break;
    }

    if (candidate) {
        final Dep dep = document.getDepToTerm(term);
        if (dep == null || !"NMOD".equals(dep.getRfunc())) {
            return pos + "P"; // determiner (DT) or adj (JJ) used as demonstrative pronoun
        }
    }
    return pos;
}
/**
 * Tells whether a verb term is in active form. Terms whose morphofeat does not start with
 * {@code V} give null. The word <i>been</i> and any verb not tagged {@code VBN} are active;
 * for the remaining {@code VBN} terms the answer depends on the governing terms in the
 * dependency tree (see {@code isActiveFormHelper}).
 *
 * @param document
 *            the document
 * @param term
 *            the term to check
 * @return {@code TRUE} if active, {@code FALSE} if not, null if the term is not a verb
 */
public static Boolean isActiveForm(final KAFDocument document, final Term term) {
    final String surface = term.getStr().toLowerCase();
    final String tag = term.getMorphofeat();
    if (!tag.startsWith("V")) {
        return null;
    }
    final boolean pastParticiple = tag.equals("VBN");
    if (!pastParticiple || surface.equals("been")) {
        return Boolean.TRUE;
    }
    return isActiveFormHelper(document, term);
}
/**
 * Recursive step of {@code isActiveForm}: climbs the dependency tree from the supplied term
 * until a governing term settles the question. The answer is {@code FALSE} if there is no
 * parent, if the parent morphofeat starts with {@code NN}, if the parent is a form of
 * <i>to be</i>, or if it is tagged {@code VBZ}, {@code VBD}, {@code VBP} or {@code MD}; it is
 * {@code TRUE} if the parent is a form of <i>to have</i>. In any other case the parent is
 * examined in turn.
 *
 * @param document
 *            the document
 * @param term
 *            the term whose dependency parent is examined
 * @return {@code TRUE} if the form is found to be active, {@code FALSE} otherwise
 */
private static Boolean isActiveFormHelper(final KAFDocument document, final Term term) {
    final Dep dep = document.getDepToTerm(term);
    if (dep == null) {
        return Boolean.FALSE;
    }
    final Term parent = dep.getFrom();
    // Locale.ROOT: under e.g. a Turkish default locale "IS" would not lower-case to "is"
    final String word = parent.getStr().toLowerCase(Locale.ROOT);
    final String pos = parent.getMorphofeat();
    if (pos.startsWith("NN")) {
        return Boolean.FALSE;
    }

    // Plain string switches in place of regexes compiled at every call
    switch (word) {
    case "am":
    case "are":
    case "is":
    case "was":
    case "were":
    case "be":
    case "been":
    case "being":
        return Boolean.FALSE;
    case "have":
    case "has":
    case "had":
    case "having":
        return Boolean.TRUE;
    default:
        break;
    }
    switch (pos) {
    case "VBZ":
    case "VBD":
    case "VBP":
    case "MD":
        return Boolean.FALSE;
    default:
        return isActiveFormHelper(document, parent);
    }
}
/**
 * Creates a predicate accepting the terms whose extended POS (see {@code extendedPos})
 * starts with at least one of the supplied prefixes.
 *
 * @param document
 *            the document the tested terms belong to
 * @param posPrefixes
 *            the accepted POS prefixes; with no prefixes the predicate rejects every term
 * @return the predicate
 */
public static java.util.function.Predicate<Term> matchExtendedPos(final KAFDocument document,
        final String... posPrefixes) {
    return term -> {
        final String pos = extendedPos(document, term);
        for (final String prefix : posPrefixes) {
            if (pos.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    };
}
/**
 * Returns the terms obtained from {@code KAFDocument.getTermsByDepAncestors} for the given
 * head, optionally keeping only the terms that form an uninterrupted sequence of document
 * terms around the head.
 *
 * @param document
 *            the document
 * @param head
 *            the head term; NOTE(review): the consecutive filter assumes the head is part of
 *            the set returned by {@code getTermsByDepAncestors} -- confirm
 * @param consecutive
 *            if true, terms separated from the head by a gap in the document term sequence
 *            are removed, on both sides of the head
 * @return the resulting set of terms
 */
public static Set<Term> getTermsByDepAncestor(final KAFDocument document, final Term head,
        final boolean consecutive) {
    final Set<Term> descendants = document.getTermsByDepAncestors(ImmutableSet.of(head));
    if (consecutive) {
        final List<Term> sortedTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
                descendants);

        // Position of each sorted term within the document (term list fetched once)
        final List<Term> docTerms = document.getTerms();
        final int[] indexes = new int[sortedTerms.size()];
        for (int i = 0; i < sortedTerms.size(); ++i) {
            indexes[i] = docTerms.indexOf(sortedTerms.get(i));
        }
        final int h = sortedTerms.indexOf(head);

        // Right of the head: drop everything from the first gap onwards
        boolean filtered = false;
        for (int i = h + 1; i < indexes.length; ++i) {
            filtered |= indexes[i] > indexes[i - 1] + 1;
            if (filtered) {
                descendants.remove(sortedTerms.get(i));
            }
        }

        // Left of the head: same, moving backwards
        filtered = false;
        for (int i = h - 1; i >= 0; --i) {
            filtered |= indexes[i] < indexes[i + 1] - 1;
            if (filtered) {
                descendants.remove(sortedTerms.get(i));
            }
        }
    }
    return descendants;
}
}
// Listing footer: © 2015 - 2025 Weber Informatics LLC | Privacy Policy