edu.jhu.hlt.concrete.ingesters.conll.Conll2011Sentence
package edu.jhu.hlt.concrete.ingesters.conll;
import java.nio.file.Path;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.jhu.hlt.concrete.EntityMention;
import edu.jhu.hlt.concrete.MentionArgument;
import edu.jhu.hlt.concrete.SituationMention;
import edu.jhu.hlt.concrete.TaggedToken;
import edu.jhu.hlt.concrete.Token;
import edu.jhu.hlt.concrete.TokenList;
import edu.jhu.hlt.concrete.TokenRefSequence;
import edu.jhu.hlt.concrete.TokenTagging;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.TokenizationKind;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
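/*
 * Background, not part of the original source: each line of the CoNLL-2011
 * (OntoNotes) format is a whitespace-separated row with roughly these columns:
 *
 *   docId, part, wordIndex, word, POS, parseBit, predicateLemma, framesetId,
 *   wordSense, speaker, namedEntities, one pred-arg column per predicate, coref
 *
 * Conll2011Row is assumed to hold one such parsed row; this class accumulates
 * the rows of a single sentence and converts them to Concrete data structures.
 */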
/** Many Rows (words) comprising a Sentence */
public class Conll2011Sentence {
private static final Logger LOGGER = LoggerFactory.getLogger(Conll2011Sentence.class);
/** Information for tracking back to a file */
public static class DebugInfo {
public final int firstLine, lastLine;
public final Path source;
public DebugInfo(Path source, int firstLine, int lastLine) {
this.source = source;
this.firstLine = firstLine;
this.lastLine = lastLine;
}
}
public DebugInfo debugInfo = null;
private final Conll2011 conll2011;
private int index;
private String docId;
private List<Conll2011Row> words;
private List<EntityMention> nerEMs;
private Tokenization toks;
public Conll2011Sentence(Conll2011 conll2011, int index) {
this.conll2011 = conll2011;
this.index = index;
this.words = new ArrayList<>();
if (this.conll2011.addNerAsEntityMentionSet)
nerEMs = new ArrayList<>();
}
public int getIndex() { return index; }
public String getDocId() { return docId; }
public void add(Conll2011Row word) {
if (docId == null) {
docId = word.docId;
} else {
// Check that things that shouldn't vary between words don't vary
if (!docId.equals(word.docId))
throw new IllegalArgumentException();
if (words.get(0).getNumPredicates() != word.getNumPredicates())
throw new IllegalArgumentException();
}
words.add(word);
}
public int size() {
return words.size();
}
public String getPart() {
return words.get(0).part;
}
public Conll2011Row getWord(int i) {
return words.get(i);
}
public List<Conll2011Row> getWords() {
return words;
}
/** You have to call convertToConcrete first; null unless addNerAsEntityMentionSet is set. */
public List<EntityMention> getNerEntityMentions() {
return nerEMs;
}
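// Usage sketch (not in the original source): nerEMs is populated as a side
// effect of convertToConcrete, so the call order matters:
//
//   edu.jhu.hlt.concrete.Sentence cs = sent.convertToConcrete(g);
//   List<EntityMention> ems = sent.getNerEntityMentions(); // now filled in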
/**
* Returns a map from coref cluster id to the mentions in that cluster.
*
* NOTE: Coref annotations are only valid within a part, so we take the id to
* be "${PART_ID}-${CLUSTER}".
*
* NOTE: Coref mentions may overlap and DO NOT form a tree.
*
* NOTE: Coref mentions may nest, however, e.g. (0 someone who makes (0 his) own luck)
*/
public Map<String, List<EntityMention>> getCoref(AnalyticUUIDGenerator g) {
Map<String, Deque<Integer>> openMentions = new HashMap<>();
Map<String, List<EntityMention>> corefMentions = new HashMap<>();
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
for (String clusterId : w.getCorefClusterStarts()) {
Deque<Integer> stack = openMentions.get(clusterId);
if (stack == null) {
stack = new ArrayDeque<>();
openMentions.put(clusterId, stack);
}
stack.push(i);
}
for (String clusterId : w.getCorefClusterEnds()) {
Deque<Integer> stack = openMentions.remove(clusterId);
int start = stack.pop();
if (stack.size() > 0)
openMentions.put(clusterId, stack);
// Make an EntityMention
EntityMention em = new EntityMention();
em.setUuid(g.next());
em.setConfidence(1);
TokenRefSequence trs = new TokenRefSequence();
trs.setTokenizationId(toks.getUuid());
for (int ii = start; ii <= i; ii++)
trs.addToTokenIndexList(ii);
//System.err.println("cluster=" + clusterId + " mention=\"" + words + "\"");
em.setTokens(trs);
// Add the EntityMention to the running map of cluster mentions
List<EntityMention> ems = corefMentions.get(clusterId);
if (ems == null) {
ems = new ArrayList<>();
corefMentions.put(clusterId, ems);
}
ems.add(em);
}
}
assert openMentions.isEmpty();
// Convert raw cluster ids, which are only unique within a "part"
// (section) of the document, to ids conjoined with the part id.
String partId = getPart();
Map<String, List<EntityMention>> fixedClusterNames = new HashMap<>();
for (Map.Entry<String, List<EntityMention>> x : corefMentions.entrySet()) {
String key = partId + "-" + x.getKey();
fixedClusterNames.put(key, x.getValue());
}
return fixedClusterNames;
}
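// Illustration (not in the original source), assuming the standard CoNLL-2011
// coref column encoding: "(0" opens a mention for cluster 0, "0)" closes one,
// and "(0)" marks a single-token mention:
//
//   someone  (0
//   who      -
//   makes    -
//   his      (0)
//   own      -
//   luck     0)
//
// For part 0, getCoref returns one key, "0-0", mapped to two EntityMentions:
// the outer span [0..5] and the nested single token [3].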
public int getNumPredicates() {
if (words.isEmpty() ||
(words.size() == 1 && words.get(0).pos.startsWith("X"))) {
LOGGER.warn("special case of empty sentence/no predicates "
+ " doc=" + getDocId() + " index=" + index + " part=" + getPart());
return 0;
}
return words.get(0).getNumPredicates();
}
public SituationMention getPredArg(int index, AnalyticUUIDGenerator g) {
if (index < 0 || index >= getNumPredicates())
throw new IllegalArgumentException();
if (toks == null)
throw new IllegalStateException();
ParseWrapper helper = new ParseWrapper();
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
helper.add(w.getPredArg(index), null);
}
SituationMention sm = new SituationMention();
sm.setUuid(g.next());
sm.setConfidence(1);
sm.setArgumentList(new ArrayList<>());
for (ParseWrapper.ConstituentWrapper c : helper.getConstituents()) {
if (c.getTag().equals("V")) {
// Root (target/verb)
Conll2011Row w = words.get(c.getStart());
//String predId = w.predicateLemma + "." + w.predicateFramesetId;
String predId = w.predicateLemma
+ "-" + w.pos.substring(0, 1).toLowerCase()
+ "-" + w.predicateFramesetId.replaceFirst("^0+", "");
TokenRefSequence trs = new TokenRefSequence();
trs.setTokenizationId(toks.getUuid());
assert c.getStart() == c.getEnd();
trs.setTokenIndexList(Arrays.asList(c.getStart()));
sm.setTokens(trs);
sm.setSituationKind(predId);
sm.setText(w.getWord());
} else {
// Arg
if (!c.getTag().contains("ARG")) {
LOGGER.warn("bad arg name: " + c.getTag()
+ " doc=" + getDocId()
+ " sent=" + getIndex()
+ " part=" + getPart()
+ " words=" + words);
}
MentionArgument arg = new MentionArgument();
arg.setRole(c.getTag());
arg.setConfidence(1);
arg.setSituationMentionId(sm.getUuid());
TokenRefSequence trs = new TokenRefSequence();
trs.setTokenizationId(toks.getUuid());
for (int ti = c.getStart(); ti <= c.getEnd(); ti++)
trs.addToTokenIndexList(ti);
arg.setTokens(trs);
sm.addToArgumentList(arg);
}
}
assert sm.getTokens() != null;
return sm;
}
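// Illustration (not in the original source), assuming the usual CoNLL
// pred-arg bracket encoding, one column per predicate:
//
//   who    (ARG0*)
//   makes  (V*)      <- predicate: lemma "make", POS "VBZ", frameset "01"
//   his    (ARG1*
//   own    *
//   luck   *)
//
// getPredArg builds a SituationMention anchored on the single "V" token,
// with situationKind "make-v-1" (lemma, lowercased POS initial, frameset id
// with leading zeros stripped) and one MentionArgument per ARG span.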
public edu.jhu.hlt.concrete.Sentence convertToConcrete(AnalyticUUIDGenerator g) {
if (nerEMs != null)
nerEMs.clear();
edu.jhu.hlt.concrete.Sentence s = new edu.jhu.hlt.concrete.Sentence();
s.setUuid(g.next());
if (toks != null)
System.err.println("double generating Tokenization, may have orphaned SituationMentions!");
toks = new Tokenization();
toks.setUuid(g.next());
toks.setKind(TokenizationKind.TOKEN_LIST);
toks.setMetadata(Conll2011.META_GENERAL);
// Tokens/words
TokenList tl = new TokenList();
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
Token t = new Token();
t.setText(w.getWord());
t.setTokenIndex(i);
tl.addToTokenList(t);
}
toks.setTokenList(tl);
// POS
TokenTagging pos = new TokenTagging();
pos.setUuid(g.next());
pos.setTaggingType("POS");
pos.setMetadata(Conll2011.META_POS);
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
TaggedToken tt = new TaggedToken();
tt.setConfidence(1);
tt.setTag(w.pos);
tt.setTokenIndex(i);
pos.addToTaggedTokenList(tt);
}
toks.addToTokenTaggingList(pos);
// Constituency parse
ParseWrapper parseHelper = new ParseWrapper();
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
// Option 1: Detect when children are a mix of terminals and non-terminals
// Option 2: Traverse the tree, and any time you see (lhs (lhs word)) => (lhs word)
if (this.conll2011.includeSingleTokenConstituents) {
//parseHelper.add(w.parseBit, w.pos);
parseHelper.addAlt(w.parseBit, w.pos, w.getWord());
} else {
parseHelper.add(w.parseBit, null);
}
}
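// Note (not in the original source): parseBit is this token's slice of the
// bracketed constituency parse, with "*" standing in for the token, e.g.
// "(TOP(S(NP*" on the first token and closing brackets like "*))" later.
// ParseWrapper is assumed to stitch these slices back into constituents;
// addAlt additionally emits a single-token pre-terminal (POS over word).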
edu.jhu.hlt.concrete.Parse p = new edu.jhu.hlt.concrete.Parse();
p.setUuid(g.next());
p.setMetadata(Conll2011.META_PARSE);
for (ParseWrapper.ConstituentWrapper c : parseHelper.getConstituents())
p.addToConstituentList(c.convertToConcrete());
toks.addToParseList(p);
// NER as TokenTagging
if (this.conll2011.addNerAsTokenTagging || this.conll2011.addNerAsEntityMentionSet) {
TokenTagging nerTT = null;
if (this.conll2011.addNerAsTokenTagging) {
nerTT = new TokenTagging();
nerTT.setUuid(g.next());
nerTT.setMetadata(Conll2011.META_NER);
nerTT.setTaggingType("NER");
}
int start = -1;
String tag = null;
for (int i = 0; i < words.size(); i++) {
Conll2011Row w = words.get(i);
parseHelper.add(w.namedEntities, null);
if (tag == null) {
String[] open = parseHelper.getOpenTags();
assert open.length < 2;
if (open.length == 1) {
tag = open[0];
start = i;
}
}
if (nerTT != null) {
TaggedToken tt = new TaggedToken();
tt.setConfidence(1);
tt.setTag(tag == null ? "O" : tag);
tt.setTokenIndex(i);
nerTT.addToTaggedTokenList(tt);
}
int nc = parseHelper.getNumCloseTags();
assert nc < 2;
if (nc > 0) {
if (nerEMs != null) {
EntityMention em = new EntityMention();
em.setUuid(g.next());
em.setConfidence(1);
em.setEntityType(tag);
TokenRefSequence trs = new TokenRefSequence();
trs.setTokenizationId(toks.getUuid());
for (int ii = start; ii <= i; ii++)
trs.addToTokenIndexList(ii);
em.setTokens(trs);
nerEMs.add(em);
}
tag = null;
}
}
if (nerTT != null)
toks.addToTokenTaggingList(nerTT);
}
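// Illustration (not in the original source), assuming the CoNLL-2011 NE
// column encoding: "(PERSON*" opens a PERSON span, "*)" closes the open span,
// "*" is outside any span, and "(GPE)" is a single-token span. The loop above
// converts those brackets into a per-token TokenTagging (tags like "PERSON"
// or "O") and, when addNerAsEntityMentionSet is set, one EntityMention per
// completed span.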
s.setTokenization(toks);
return s;
}
}
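/*
 * End-to-end usage sketch (not part of the original source). The
 * AnalyticUUIDGeneratorFactory.create() call and the Conll2011Row(String)
 * constructor are assumptions about this package's API, shown only to
 * illustrate call order:
 *
 *   AnalyticUUIDGenerator g = new AnalyticUUIDGeneratorFactory().create();
 *   Conll2011Sentence sent = new Conll2011Sentence(config, 0);
 *   for (String line : sentenceLines)
 *     sent.add(new Conll2011Row(line));
 *   edu.jhu.hlt.concrete.Sentence cs = sent.convertToConcrete(g); // must come first
 *   Map<String, List<EntityMention>> coref = sent.getCoref(g);
 *   for (int p = 0; p < sent.getNumPredicates(); p++) {
 *     SituationMention sm = sent.getPredArg(p, g);
 *     // ... collect sm into a SituationMentionSet
 *   }
 */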