package edu.jhu.hlt.concrete.ingesters.conll;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.google.common.collect.HashBiMap;
import edu.jhu.hlt.concrete.AnnotationMetadata;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Constituent;
import edu.jhu.hlt.concrete.ConstituentRef;
import edu.jhu.hlt.concrete.MentionArgument;
import edu.jhu.hlt.concrete.Parse;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.Sentence;
import edu.jhu.hlt.concrete.SituationMention;
import edu.jhu.hlt.concrete.SituationMentionSet;
import edu.jhu.hlt.concrete.TaggedToken;
import edu.jhu.hlt.concrete.Token;
import edu.jhu.hlt.concrete.TokenList;
import edu.jhu.hlt.concrete.TokenRefSequence;
import edu.jhu.hlt.concrete.TokenTagging;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.TokenizationKind;
import edu.jhu.hlt.concrete.serialization.TarGzCompactCommunicationSerializer;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
/**
* Ingests Ontonotes 4 data, currently only storing:
* - PTB parses (as a {@link Parse})
* - POS tags from the PTB parse (as a {@link TokenTagging})
* - Propbank SRL (as a {@link SituationMention})
* - Word senses (as a {@link TokenTagging}) -- currently sparse
*
 * Getting this ingester to handle coref data will be tricky because that data
 * is not annotated on top of the PTB-style data; it is SGML overlaid on raw
 * text.
*
* The parses produced by this tool will include traces with the "-NONE-" tag.
* These will also appear in the {@link TokenTagging}s, and traces will have
* text fields set to the empty string.
*
* The SRL labels produced by this ingester will use the "continuation roles"
 * method of handling split arguments. For example, if the role "ARG1" is split
* into two pieces, they will be added as two {@link MentionArgument}s with the
* first one having a role of "ARG1" and the second "C-ARG1".
*
* Arguments which contain traces will be represented by the surface form of the
* argument, or the end of the trace. The start of the trace will be in the
* parse tree, but there will be nothing connecting it to the argument.
*
* If a predicate is not a constituent (e.g. in "the PTB [keeps on] annoying me"
* the predicate "keeps on" is not a single constituent), then the
* {@link SituationMention}'s constituent field is not set. The tokens field is
* always set.
*
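 * <p>
 * A minimal usage sketch (illustrative only; assumes the OntoNotes 4.0
 * release is unpacked so that the example base name below resolves to
 * .parse/.prop/.sense files on disk):
 * <pre>{@code
 * Ontonotes4 on4 = new Ontonotes4(
 *   "ontonotes-release-4.0/data/files/data/english/annotations/bc/cnn/00/cnn_0000",
 *   "bc", "body");
 * if (on4.hasNeededFiles())
 *   for (Communication c : on4.parse())
 *     System.out.println(c.getId());
 * }</pre>
 *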
 * @deprecated This class works, but the CoNLL-formatted version of Ontonotes
 * should be used instead; see
 * https://github.com/ontonotes/conll-formatted-ontonotes-5.0
 * and {@link Ontonotes5}.
*
* @author travis
*/
@Deprecated
public class Ontonotes4 {
// e.g. "ontonotes-release-4.0/data/files/data/english/annotations/bc/cnn/00/cnn_0000"
private String baseName;
private String communicationType;
private String sectionKind;
private AnnotationMetadata meta;
private AnnotationMetadata posMeta;
public Ontonotes4(String baseName, String communicationType, String sectionKind) {
this.baseName = baseName;
this.communicationType = communicationType;
this.sectionKind = sectionKind;
this.meta = new AnnotationMetadata();
this.meta.setTimestamp(System.currentTimeMillis() / 1000);
this.meta.setTool("ontonotes4");
this.posMeta = new AnnotationMetadata();
this.posMeta.setTimestamp(this.meta.getTimestamp());
this.posMeta.setTool("ontonotes4-pos");
}
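/**
* Returns the last four path components of the base name, each prefixed with
* '/': e.g. "/bc/cnn/00/cnn_0000" for the example base name above.
*/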
public String getIdFromBaseName() {
String[] path = baseName.split("/");
if (path.length < 4)
throw new RuntimeException("path = " + Arrays.toString(path));
int k = 4;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < k; i++) {
sb.append('/');
sb.append(path[(path.length - k) + i]);
}
return sb.toString();
}
public boolean hasNeededFiles() {
return getParseFile().isFile()
&& getPropFile().isFile()
&& getSenseFile().isFile();
}
public File getParseFile() {
return new File(baseName + ".parse");
}
public File getPropFile() {
return new File(baseName + ".prop");
}
public File getSenseFile() {
return new File(baseName + ".sense");
}
/**
* Builds a {@link Communication} with a single section containing a
* constituency parse, Propbank SRL (as {@link SituationMention}s), and word
* senses as a {@link TokenTagging}.
*/
public Iterable<Communication> parse() {
// Read in the data
List<PennTreeReader.Node> parses = getParses(); // one per sentence
int nSent = parses.size();
List<OntonotesProposition>[] props = getProps(nSent); // many per sentence
List<OntonotesWordsense>[] senses = getSenses(nSent); // many per sentence
// Populate the Communication
Communication comm = new Communication();
AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
AnalyticUUIDGenerator g = f.create();
comm.setUuid(g.next());
comm.setId(getIdFromBaseName());
comm.setType(communicationType);
comm.setMetadata(meta);
Section s = new Section();
s.setUuid(g.next());
s.setKind(sectionKind);
comm.addToSectionList(s);
SituationMentionSet sms = new SituationMentionSet();
sms.setUuid(g.next());
sms.setMetadata(meta);
sms.setMentionList(new ArrayList<>());
comm.addToSituationMentionSetList(sms);
for (int i = 0; i < nSent; i++) {
// Build the sentence based on the parse leaves (including POS tags)
PennTreeReader.Node root = parses.get(i);
PennTreeReader.Indexer rootIndex = new PennTreeReader.Indexer(root);
Sentence sent = makeSentence(rootIndex, g);
sent.setUuid(g.next());
s.addToSentenceList(sent);
// Add senses
addSenses(sent, senses[i], g);
// Add parse
Parse cons = new Parse();
cons.setUuid(g.next());
cons.setMetadata(meta);
sent.getTokenization().addToParseList(cons);
HashBiMap<PennTreeReader.Node, Constituent> cmap = HashBiMap.create();
addConstituents(cons, root, cmap, rootIndex);
// Add propositions
for (OntonotesProposition p : props[i]) {
// Build the SituationMention
SituationMention sm = new SituationMention();
sm.setUuid(g.next());
sm.setConfidence(1);
// Add the predicate
sm.setSituationKind(p.getPredicateString());
OntonotesProposition.Proplabel pred = p.predicate;
SimpleImmutableEntry<Integer, Integer> predBounds = pred.getSplitsAsContiguousSpan(rootIndex);
if (predBounds == null) {
System.err.println("can't handle non-contiguous predicates: " + pred);
continue;
}
sms.addToMentionList(sm);
TokenRefSequence smtrs = getTrs(predBounds, sent);
if (!pred.isSplit()) {
Constituent predC = findNode(pred, rootIndex, cmap);
smtrs.setConstituent(getRef(cons, predC));
}
sm.setTokens(smtrs);
// Add the arguments
sm.setArgumentList(new ArrayList<>());
for (OntonotesProposition.Proplabel a : p.args) {
// Loop over all splits (most of the time there will be only one)
for (int si = 0; si < a.getNumSplits(); si++) {
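// each split is a tree pointer of (terminal index, height above that terminal), resolved via the indexer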
int t = a.getTerminal(si);
int h = a.getHeight(si);
PennTreeReader.Node anode = rootIndex.get(t, h);
Constituent acons = cmap.get(anode);
String role = a.getLabel();
if (si > 0) // Continuation role for the 2nd+ piece of a split argument
role = "C-" + role;
MentionArgument arg = new MentionArgument();
arg.setConfidence(1);
arg.setRole(role);
TokenRefSequence matrs = getTrs(acons.getStart(), acons.getEnding() - 1, sent);
matrs.setConstituent(getRef(cons, acons));
arg.setTokens(matrs);
sm.addToArgumentList(arg);
}
}
}
}
return Arrays.asList(comm);
}
public static TokenRefSequence getTrs(SimpleImmutableEntry<Integer, Integer> inclusivePair, Sentence sent) {
return getTrs(inclusivePair.getKey(), inclusivePair.getValue(), sent);
}
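/**
* Builds a {@link TokenRefSequence} over the given inclusive token range,
* e.g. getTrs(2, 4, sent) yields a tokenIndexList of [2, 3, 4].
*/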
public static TokenRefSequence getTrs(int startInclusive, int endInclusive, Sentence sent) {
TokenRefSequence trs = new TokenRefSequence();
trs.setTokenizationId(sent.getTokenization().getUuid());
for (int i = startInclusive; i <= endInclusive; i++)
trs.addToTokenIndexList(i);
return trs;
}
/**
* Use this method for type safety: it makes sure you don't use the wrong UUID.
* Also checks that Constituent ids are equal to their index in parse.constituentList.
*/
public static ConstituentRef getRef(Parse p, Constituent c) {
if (p.getConstituentList().get(c.getId()) != c)
throw new IllegalStateException();
return new ConstituentRef(p.getUuid(), c.getId());
}
/**
* Looks for node in the tree at root then uses node2cons to convert to a
* {@link Constituent}.
*/
public static Constituent findNode(
OntonotesProposition.Proplabel node,
PennTreeReader.Indexer tree,
HashBiMap<PennTreeReader.Node, Constituent> node2cons) {
if (node.isSplit())
throw new IllegalArgumentException("not allowed");
PennTreeReader.Node n = tree.get(node.getTerminal(), node.getHeight());
Constituent cnode = node2cons.get(n);
assert cnode != null : "not a constituent?";
return cnode;
}
/**
* Recursively adds all Nodes in root as {@link Constituent}s in p.
* @param bimap may be null, but otherwise will have all node entries added.
*/
public static Constituent addConstituents(
Parse p,
PennTreeReader.Node root,
HashBiMap<PennTreeReader.Node, Constituent> bimap,
PennTreeReader.Indexer indexer) {
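// ids are assigned in pre-order (parent before children), so each Constituent's
// id equals its index in p's constituent list; getRef checks this invariant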
Constituent c = new Constituent();
c.setId(p.getConstituentListSize());
c.setTag(root.getCategory());
c.setStart(indexer.getFirstToken(root));
c.setEnding(indexer.getLastToken(root) + 1);
assert c.getEnding() >= 0 && c.getStart() >= 0 && c.getStart() < c.getEnding()
: "start=" + c.getStart() + " end=" + c.getEnding();
p.addToConstituentList(c);
if (bimap != null)
bimap.put(root, c);
c.setChildList(new ArrayList<>());
for (PennTreeReader.Node n : root.getChildren()) {
Constituent nc = addConstituents(p, n, bimap, indexer);
c.addToChildList(nc.getId());
}
return c;
}
/**
* Makes a {@link TokenList} out of the leaf nodes in the given tree.
* Traces get a token whose text field is set to the empty string. Also adds
* POS tags as a {@link TokenTagging}.
* @param g generator used for the {@link Tokenization} and {@link TokenTagging} UUIDs
*/
private Sentence makeSentence(PennTreeReader.Indexer tree, AnalyticUUIDGenerator g) {
Sentence sent = new Sentence();
Tokenization tok = new Tokenization();
tok.setUuid(g.next());
tok.setMetadata(meta);
sent.setTokenization(tok);
tok.setKind(TokenizationKind.TOKEN_LIST);
TokenList tkl = new TokenList();
tok.setTokenList(tkl);
TokenTagging pos = new TokenTagging();
pos.setUuid(g.next());
pos.setMetadata(posMeta);
pos.setTaggingType("pos");
tok.addToTokenTaggingList(pos);
int i = 0;
for (PennTreeReader.Node n : tree.getLeaves(true)) {
Token t = new Token();
tkl.addToTokenList(t);
t.setTokenIndex(i);
if (n.isTrace())
t.setText("");
else
t.setText(n.getWord());
TaggedToken tt = new TaggedToken();
tt.setTag(n.getCategory());
tt.setTokenIndex(i);
pos.addToTaggedTokenList(tt);
i++;
}
return sent;
}
/**
* Adds word senses to sent as a {@link TokenTagging} with tags that look like
* e.g. "work-v-1". The {@link TokenTagging} may not cover all the tokens (you
* must use the tokenIndex field of TaggedToken).
*/
private void addSenses(Sentence sent, List<OntonotesWordsense> senses, AnalyticUUIDGenerator g) {
Tokenization tok = sent.getTokenization();
TokenTagging tt = new TokenTagging();
tt.setUuid(g.next());
tt.setMetadata(meta);
tok.addToTokenTaggingList(tt);
tt.setTaggingType("ontonotes-wordsense");
tt.setTaggedTokenList(new ArrayList<>());
for (OntonotesWordsense s : senses) {
TaggedToken tts = new TaggedToken();
tts.setTokenIndex(s.word);
tts.setTag(s.getLemmaAndSense());
tt.addToTaggedTokenList(tts);
}
}
/**
* Returns an array indexed by sentence; items are the word senses appearing
* in that sentence.
*/
public List<OntonotesWordsense>[] getSenses(int numSentences) {
@SuppressWarnings("unchecked")
List<OntonotesWordsense>[] senses = new List[numSentences];
for (int i = 0; i < numSentences; i++)
senses[i] = new ArrayList<>();
try (BufferedReader r = new BufferedReader(new InputStreamReader(
new FileInputStream(getSenseFile())))) {
while (r.ready()) {
String line = r.readLine();
OntonotesWordsense ow = new OntonotesWordsense(line);
senses[ow.sentence].add(ow);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
return senses;
}
/**
* Returns an array indexed by sentence; items are the propositions appearing in
* that sentence.
*/
public List<OntonotesProposition>[] getProps(int numSentences) {
@SuppressWarnings("unchecked")
List<OntonotesProposition>[] props = new List[numSentences];
for (int i = 0; i < numSentences; i++)
props[i] = new ArrayList<>();
try (BufferedReader r = new BufferedReader(new InputStreamReader(
new FileInputStream(getPropFile())))) {
while (r.ready()) {
String line = r.readLine();
OntonotesProposition op = new OntonotesProposition(line, false);
props[op.sentence].add(op);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
return props;
}
/**
* Each node in the returned list is the root of a tree for a given sentence.
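* The .parse file is expected to hold one PTB-style S-expression per sentence
* (e.g. "(TOP (S (NP-SBJ ...) (VP ...)))"), with sentences separated by blank
* lines.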
*/
public List<PennTreeReader.Node> getParses() {
// Read the text in (separated by empty lines)
List<String> sexps = new ArrayList<>();
StringBuilder curSexp = new StringBuilder();
try (BufferedReader r = new BufferedReader(new InputStreamReader(
new FileInputStream(getParseFile())))) {
while (r.ready()) {
String line = r.readLine();
if (line.isEmpty()) {
sexps.add(curSexp.toString().replaceAll("\\s+", " "));
curSexp = new StringBuilder();
} else {
curSexp.append(line);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
// Parse the text
List<PennTreeReader.Node> parses = new ArrayList<>();
for (String sexp : sexps)
parses.add(PennTreeReader.parse(sexp));
return parses;
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("please provide:");
System.err.println("1) an Ontonotes 4.0 directory (e.g. ontonotes-release-4.0/data/files/data/english/annotations)");
System.err.println("2) a Concrete tar.gz file to dump results to");
System.exit(-1);
}
String sectionKind = "body";
File onDir = new File(args[0]);
File outputFile = new File(args[1]);
long start = System.currentTimeMillis();
System.out.println("reading from " + onDir.getPath());
List<Communication> all = parseAll(onDir, sectionKind);
System.out.println("read " + all.size() + " documents in "
+ (System.currentTimeMillis() - start)/1000d + " seconds");
System.out.println("writing to " + outputFile.getPath());
TarGzCompactCommunicationSerializer ts = new TarGzCompactCommunicationSerializer();
ts.toTarGz(all, outputFile.toPath());
System.out.println("done, took " + (System.currentTimeMillis() - start)/1000d + " seconds");
}
public static List<Communication> parseAll(File onDir, String sectionKind) {
List<Communication> all = new ArrayList<>();
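// expected layout: <genre>/<source>/<section>/<file>.prop, e.g.
// bc/cnn/00/cnn_0000.prop with sibling .parse and .sense files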
for (File bc : onDir.listFiles()) {
if (bc.listFiles() == null) {
System.err.println("WARN: skipping " + bc.getPath());
continue;
}
for (File cnn : bc.listFiles()) {
if (cnn.listFiles() == null) {
System.err.println("WARN: skipping " + cnn.getPath());
continue;
}
for (File zero : cnn.listFiles()) {
if (zero.listFiles() == null) {
System.err.println("WARN: skipping " + zero.getPath());
continue;
}
for (File f : zero.listFiles()) {
if (!f.getName().endsWith(".prop"))
continue;
String baseName = f.getPath().replaceAll("\\.prop$", "");
Ontonotes4 on4 = new Ontonotes4(baseName, bc.getName(), sectionKind);
if (!on4.hasNeededFiles()) {
System.err.println("missing files in " + baseName);
continue;
}
for (Communication c : on4.parse())
all.add(c);
}
}
}
}
return all;
}
public static void test(String[] args) throws Exception {
long start = System.currentTimeMillis();
String baseName = "ontonotes-release-4.0/data/files/data/english/annotations/bc/cnn/00/cnn_0000";
baseName = "/home/travis/code/fnparse/data/" + baseName;
Ontonotes4 on4 = new Ontonotes4(baseName, "test-document", "body");
Communication c = on4.parse().iterator().next();
TarGzCompactCommunicationSerializer ts = new TarGzCompactCommunicationSerializer();
ts.toTarGz(Arrays.asList(c), "/tmp/foo.concrete.gz");
// System.out.println(c);
System.out.println(System.currentTimeMillis() - start);
}
}