edu.stanford.nlp.pipeline.CustomAnnotationSerializer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.*;
/**
* Serializes Annotation objects using our own format.
*
* Note[gabor]: This is a lossy serialization! For similar performance, and
* lossless (or less lossy) serialization see,
* {@link edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer}.
*
* @author Mihai
*/
public class CustomAnnotationSerializer extends AnnotationSerializer {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(CustomAnnotationSerializer.class);

/** If true, the serialized stream is wrapped in GZIP compression. */
private final boolean compress;

/**
 * If true, it means we store/load also AntecedentAnnotation
 * This annotation is used ONLY in our KBP annotation.
 * By default, it is not needed because we store the entire coref graph anyway.
 */
private final boolean haveExplicitAntecedent;

/** Default configuration: compressed output, no explicit antecedent annotations. */
public CustomAnnotationSerializer() {
  this(true, false);
}

/**
 * @param compress if true, gzip the serialized stream
 * @param haveAnte if true, also serialize AntecedentAnnotation (KBP-specific)
 */
public CustomAnnotationSerializer(boolean compress, boolean haveAnte) {
  this.compress = compress;
  this.haveExplicitAntecedent = haveAnte;
}
/**
 * Reads one serialized dependency graph (exactly two lines: nodes, then edges)
 * into an intermediate representation; the intermediate nodes are resolved to
 * actual tokens later, once the sentence tokens have been deserialized.
 *
 * @param reader the buffer to read from
 * @return the intermediate graph (possibly with no nodes/edges for empty lines)
 * @throws IOException if the stream ends before both lines are read
 */
private static IntermediateSemanticGraph loadDependencyGraph(BufferedReader reader) throws IOException {
  IntermediateSemanticGraph graph = new IntermediateSemanticGraph();
  // first line: list of nodes
  String line = reader.readLine();
  if (line == null) throw new IOException("Unexpected end of stream while reading dependency graph nodes");
  line = line.trim();
  if (line.length() > 0) {
    String[] bits = line.split("\t");
    if (bits.length < 3) throw new RuntimeException("ERROR: Invalid dependency node line: " + line);
    String docId = bits[0];
    // "-" is the serialized placeholder for a missing/empty doc id
    if (docId.equals("-")) docId = "";
    int sentIndex = Integer.parseInt(bits[1]);
    for (int i = 2; i < bits.length; i++) {
      // each node is "index", "index-copy", or "index-copy-R" (R marks a root)
      String bit = bits[i];
      String[] bbits = bit.split("-");
      int copyAnnotation = -1;
      boolean isRoot = false;
      if (bbits.length > 3) {
        throw new RuntimeException("ERROR: Invalid format for dependency graph: " + line);
      } else if (bbits.length == 2) {
        copyAnnotation = Integer.parseInt(bbits[1]);
      } else if (bbits.length == 3) {
        copyAnnotation = Integer.parseInt(bbits[1]);
        isRoot = bbits[2].equals("R");
      }
      int index = Integer.parseInt(bbits[0]);
      graph.nodes.add(new IntermediateNode(docId, sentIndex, index, copyAnnotation, isRoot));
    }
  }
  // second line: list of deps
  line = reader.readLine();
  if (line == null) throw new IOException("Unexpected end of stream while reading dependency graph edges");
  line = line.trim();
  if (line.length() > 0) {
    String[] bits = line.split("\t");
    for (String bit : bits) {
      String[] bbits = bit.split(" ");
      if (bbits.length < 3 || bbits.length > 6) {
        throw new RuntimeException("ERROR: Invalid format for dependency graph: " + line);
      }
      String dep = bbits[0];
      int source = Integer.parseInt(bbits[1]);
      int target = Integer.parseInt(bbits[2]);
      // BUGFIX: saveDependencyGraph emits isExtra together with both copy counts
      // (6 fields total), so the flag must be read whenever a 4th field exists;
      // the old "bbits.length == 4" test silently dropped isExtra for such edges.
      boolean isExtra = (bbits.length > 3) && Boolean.parseBoolean(bbits[3]);
      int sourceCopy = (bbits.length > 4) ? Integer.parseInt(bbits[4]) : 0;
      int targetCopy = (bbits.length > 5) ? Integer.parseInt(bbits[5]) : 0;
      graph.edges.add(new IntermediateEdge(dep, source, sourceCopy, target, targetCopy, isExtra));
    }
  }
  return graph;
}
/**
 * Saves a dependency graph on exactly two lines: the first line contains the
 * vertices, the second the edges. A null graph is stored as two empty lines.
 * Words are not stored here; they are recovered from the sentence tokens at
 * load time (see loadDependencyGraph / convertIntermediateGraph).
 *
 * @param graph the semantic graph of one sentence; may be null
 * @param pw the buffer
 */
private static void saveDependencyGraph(SemanticGraph graph, PrintWriter pw) {
if(graph == null){
pw.println();
pw.println();
return;
}
boolean outputHeader = false;
for (IndexedWord node: graph.vertexSet()){
// first line: sentence index for all nodes; we recover the words
// from the original tokens the first two tokens in this line
// indicate: docid, sentence index
if (!outputHeader) {
// the header (docid + sentence index) is emitted once, before the first node;
// "-" stands in for a missing/empty doc id
String docId = node.get(CoreAnnotations.DocIDAnnotation.class);
if(docId != null && docId.length() > 0) pw.print(docId);
else pw.print("-");
pw.print("\t");
pw.print(node.get(CoreAnnotations.SentenceIndexAnnotation.class));
outputHeader = true;
}
pw.print("\t");
pw.print(node.index());
// CopyAnnotations indicate copied (or virtual nodes) generated due to CCs (see EnglishGrammaticalStructure)
// These annotations are usually not set, so print them only if necessary
if(node.copyCount() > 0){
pw.print("-");
pw.print(node.copyCount());
}
if (graph.getRoots().contains(node)) {
// roots are flagged with a trailing "R"; when the node had no copy count,
// a "0" copy marker is inserted first so a root always serializes as index-copy-R
if (node.copyCount() > 0) {
pw.print("-R");
} else {
pw.print("-0-R");
}
}
}
pw.println();
// second line: all edges, tab-separated; each edge is "rel source target",
// optionally followed by "isExtra sourceCopy targetCopy" (always emitted together)
boolean first = true;
for (SemanticGraphEdge edge : graph.edgeIterable()) {
if(! first) pw.print("\t");
String rel = edge.getRelation().toString();
// no spaces allowed in the relation name
// note that they might occur due to the tokenization of HTML/XML/RDF tags
rel = rel.replaceAll("\\s+", "");
pw.print(rel);
pw.print(" ");
pw.print(edge.getSource().index());
pw.print(" ");
pw.print(edge.getTarget().index());
if (edge.isExtra() || edge.getSource().copyCount() > 0 || edge.getTarget().copyCount() > 0) {
pw.print(" ");
pw.print(edge.isExtra());
pw.print(" ");
pw.print(edge.getSource().copyCount());
pw.print(" ");
pw.print(edge.getTarget().copyCount());
}
first = false;
}
pw.println();
}
/** Serializes the CorefChain objects.
 *
 * Format: number of clusters on the first line, then each cluster (see
 * {@link #saveCorefChain}), then one empty line. A null map is stored as a
 * single empty line.
 *
 * @param chains all clusters in a doc, keyed by cluster id
 * @param pw the buffer
 */
private static void saveCorefChains(Map<Integer, CorefChain> chains, PrintWriter pw) {
  if (chains == null) {
    pw.println();
    return;
  }
  // how many clusters
  pw.println(chains.size());
  // save each cluster
  for (Map.Entry<Integer, CorefChain> entry : chains.entrySet()) {
    // cluster id + how many mentions in the cluster
    saveCorefChain(pw, entry.getKey(), entry.getValue());
  }
  // an empty line at end
  pw.println();
}
/** Counts all mentions in one cluster, summed across every head word. */
private static int countMentions(CorefChain cluster) {
  int total = 0;
  for (Set<CorefChain.CorefMention> mentionsForHead : cluster.getMentionMap().values()) {
    total += mentionsForHead.size();
  }
  return total;
}
/**
* Serializes one coref cluster (i.e., one entity).
*
* @param pw the buffer
* @param cid id of cluster to save
* @param cluster the cluster
*/
public static void saveCorefChain(PrintWriter pw, int cid, CorefChain cluster) {
pw.println(cid + " " + countMentions(cluster));
// each mention saved on one line
Map> mentionMap = cluster.getMentionMap();
for (Map.Entry> intPairSetEntry : mentionMap.entrySet()) {
// all mentions with the same head
IntPair mentionIndices = intPairSetEntry.getKey();
Set mentions = intPairSetEntry.getValue();
for (CorefChain.CorefMention mention: mentions) {
// one mention per line
pw.print(mentionIndices.getSource() + " " + mentionIndices.getTarget());
if(mention == cluster.getRepresentativeMention()) pw.print(" " + 1);
else pw.print(" " + 0);
pw.print(" " + mention.mentionType);
pw.print(" " + mention.number);
pw.print(" " + mention.gender);
pw.print(" " + mention.animacy);
pw.print(" " + mention.startIndex);
pw.print(" " + mention.endIndex);
pw.print(" " + mention.headIndex);
pw.print(" " + mention.corefClusterID);
pw.print(" " + mention.mentionID);
pw.print(" " + mention.sentNum);
pw.print(" " + mention.position.length());
for(int i = 0; i < mention.position.length(); i ++)
pw.print(" " + mention.position.get(i));
pw.print(" " + escapeSpace(mention.mentionSpan));
pw.println();
}
}
}
/** Replaces each whitespace character with the place-holder token (whitespace is the column separator). */
private static String escapeSpace(String text) {
  return text.replaceAll("\\s", SPACE_HOLDER);
}

/** Inverse of {@link #escapeSpace}: turns every place-holder back into a single space. */
private static String unescapeSpace(String text) {
  return text.replaceAll(SPACE_HOLDER, " ");
}
/** Parses a serialized mention type back into its enum constant. */
private static Dictionaries.MentionType parseMentionType(String name) {
  return Dictionaries.MentionType.valueOf(name);
}

/** Parses a serialized grammatical number back into its enum constant. */
private static Dictionaries.Number parseNumber(String name) {
  return Dictionaries.Number.valueOf(name);
}

/** Parses a serialized gender back into its enum constant. */
private static Dictionaries.Gender parseGender(String name) {
  return Dictionaries.Gender.valueOf(name);
}

/** Parses a serialized animacy value back into its enum constant. */
private static Dictionaries.Animacy parseAnimacy(String name) {
  return Dictionaries.Animacy.valueOf(name);
}
/**
 * Loads the CorefChain objects from the serialized buffer.
 * Inverse of {@link #saveCorefChains}.
 *
 * @param reader the buffer
 * @return a map from cluster id to cluster, or null if no chains were stored
 * @throws IOException if reading from the buffer fails
 */
private static Map<Integer, CorefChain> loadCorefChains(BufferedReader reader) throws IOException {
  String line = reader.readLine().trim();
  // an empty first line means no chains were serialized
  if (line.isEmpty()) return null;
  int clusterCount = Integer.parseInt(line);
  Map<Integer, CorefChain> chains = Generics.newHashMap();
  // read each cluster
  for (int c = 0; c < clusterCount; c++) {
    line = reader.readLine().trim();
    String[] bits = line.split("\\s");
    int cid = Integer.parseInt(bits[0]);
    int mentionCount = Integer.parseInt(bits[1]);
    Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = Generics.newHashMap();
    CorefChain.CorefMention representative = null;
    // read each mention in this cluster
    for (int m = 0; m < mentionCount; m++) {
      line = reader.readLine();
      bits = line.split("\\s");
      // (sentence number, head index) groups all mentions with the same head
      IntPair key = new IntPair(
          Integer.parseInt(bits[0]),
          Integer.parseInt(bits[1]));
      boolean rep = bits[2].equals("1");
      Dictionaries.MentionType mentionType = parseMentionType(bits[3]);
      Dictionaries.Number number = parseNumber(bits[4]);
      Dictionaries.Gender gender = parseGender(bits[5]);
      Dictionaries.Animacy animacy = parseAnimacy(bits[6]);
      int startIndex = Integer.parseInt(bits[7]);
      int endIndex = Integer.parseInt(bits[8]);
      int headIndex = Integer.parseInt(bits[9]);
      int clusterID = Integer.parseInt(bits[10]);
      int mentionID = Integer.parseInt(bits[11]);
      int sentNum = Integer.parseInt(bits[12]);
      // the position tuple is variable length; its length precedes its elements
      int posLen = Integer.parseInt(bits[13]);
      int[] posElems = new int[posLen];
      for (int i = 0; i < posLen; i++) {
        posElems[i] = Integer.parseInt(bits[14 + i]);
      }
      IntTuple position = new IntTuple(posElems);
      // the mention span is always the last field; spaces were escaped at save time
      String span = unescapeSpace(bits[14 + posLen]);
      CorefChain.CorefMention mention = new CorefChain.CorefMention(
          mentionType,
          number,
          gender,
          animacy,
          startIndex,
          endIndex,
          headIndex,
          clusterID,
          mentionID,
          sentNum,
          position,
          span);
      mentionMap.computeIfAbsent(key, k -> Generics.newHashSet()).add(mention);
      if (rep) representative = mention;
    }
    // construct the cluster
    CorefChain chain = new CorefChain(cid, mentionMap, representative);
    chains.put(cid, chain);
  }
  // consume the empty line that terminates the chains section
  reader.readLine();
  return chains;
}
/**
 * Serializes the annotation to the given stream: coref chains first, then the
 * old-format coref graph on one line, then each sentence (parse tree, three
 * dependency graphs, tokens) separated by empty lines.
 *
 * @param corpus the annotated document to save
 * @param os the target stream
 * @return the stream actually written to (a GZIP wrapper of {@code os} when compressing)
 * @throws IOException if wrapping the stream in GZIP fails
 */
@Override
public OutputStream write(Annotation corpus, OutputStream os) throws IOException {
  if (compress && !(os instanceof GZIPOutputStream)) {
    os = new GZIPOutputStream(os);
  }
  PrintWriter pw = new PrintWriter(os);
  // save the coref chains in the new format
  Map<Integer, CorefChain> chains = corpus.get(CorefCoreAnnotations.CorefChainAnnotation.class);
  saveCorefChains(chains, pw);
  // save the coref graph on one line
  // Note: this is the old format!
  List<Pair<IntTuple, IntTuple>> corefGraph = corpus.get(CorefCoreAnnotations.CorefGraphAnnotation.class);
  if (corefGraph != null) {
    boolean first = true;
    for (Pair<IntTuple, IntTuple> arc : corefGraph) {
      if (!first) pw.print(" ");
      pw.printf("%d %d %d %d", arc.first.get(0), arc.first.get(1), arc.second.get(0), arc.second.get(1));
      first = false;
    }
  }
  pw.println();
  // save sentences separated by an empty line
  List<CoreMap> sentences = corpus.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sent : sentences) {
    // save the parse tree first, on a single line
    Tree tree = sent.get(TreeCoreAnnotations.TreeAnnotation.class);
    if (tree != null) {
      String treeString = tree.toString();
      // no \n allowed in the parse tree string (might happen due to tokenization of HTML/XML/RDF tags)
      treeString = treeString.replaceAll("\n", " ");
      pw.println(treeString);
    } else {
      pw.println();
    }
    // the three dependency graphs, two lines each
    SemanticGraph collapsedDeps = sent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    saveDependencyGraph(collapsedDeps, pw);
    SemanticGraph uncollapsedDeps = sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    saveDependencyGraph(uncollapsedDeps, pw);
    SemanticGraph ccDeps = sent.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
    saveDependencyGraph(ccDeps, pw);
    // save all sentence tokens, one per line
    List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens != null) {
      for (CoreLabel token : tokens) {
        saveToken(token, haveExplicitAntecedent, pw);
        pw.println();
      }
    }
    // add an empty line after every sentence
    pw.println();
  }
  pw.flush();
  return os;
}
/**
 * Deserializes one document from the stream. Inverse of {@link #write}.
 *
 * @param is the source stream; wrapped in GZIP when this serializer compresses
 * @return the document paired with the stream it was actually read from
 * @throws IOException if the stream is truncated or malformed
 */
@Override
public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
  if (compress && !(is instanceof GZIPInputStream)) is = new GZIPInputStream(is);
  BufferedReader reader = new BufferedReader(new InputStreamReader(is));
  Annotation doc = new Annotation("");
  String line;
  // read the coref graph (new format)
  Map<Integer, CorefChain> chains = loadCorefChains(reader);
  if (chains != null) doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
  // read the coref graph (old format): a flat list of (src, dst) tuple pairs
  line = reader.readLine().trim();
  if (line.length() > 0) {
    String[] bits = line.split(" ");
    if (bits.length % 4 != 0) {
      throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
    }
    List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
    for (int i = 0; i < bits.length; i += 4) {
      IntTuple src = new IntTuple(2);
      IntTuple dst = new IntTuple(2);
      src.set(0, Integer.parseInt(bits[i]));
      src.set(1, Integer.parseInt(bits[i + 1]));
      dst.set(0, Integer.parseInt(bits[i + 2]));
      dst.set(1, Integer.parseInt(bits[i + 3]));
      corefGraph.add(new Pair<>(src, dst));
    }
    doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
  }
  // read individual sentences
  List<CoreMap> sentences = new ArrayList<>();
  while ((line = reader.readLine()) != null) {
    CoreMap sentence = new Annotation("");
    // first line is the parse tree. construct it with CoreLabels in Tree nodes
    Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
    sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
    // read the dependency graphs into intermediate form; tokens are not known yet
    IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
    IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
    IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);
    // the remaining lines until empty line are tokens
    List<CoreLabel> tokens = new ArrayList<>();
    while ((line = reader.readLine()) != null) {
      if (line.length() == 0) break;
      tokens.add(loadToken(line, haveExplicitAntecedent));
    }
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    // now that tokens exist, convert the intermediate graphs to actual SemanticGraphs
    SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
    SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
    SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    sentences.add(sentence);
  }
  doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  return Pair.makePair(doc, is);
}
// Place-holder substituted for whitespace inside serialized fields, since
// whitespace is the column/field separator of this tabular format.
private static final String SPACE_HOLDER = "##";
/**
 * Deserializes one token from its tab-separated line.
 * Columns: word, lemma, POS, NER, normalized NER, char begin, char end,
 * optionally followed by the antecedent (KBP only). Empty columns leave the
 * corresponding annotation unset.
 *
 * @param line one serialized token line
 * @param haveExplicitAntecedent whether an antecedent column may be present
 * @return the reconstructed token
 * @throws RuntimeIOException if the line has fewer than 7 columns
 */
private static CoreLabel loadToken(String line, boolean haveExplicitAntecedent) {
  CoreLabel token = new CoreLabel();
  String[] bits = line.split("\t", -1);  // -1: keep trailing empty columns
  if (bits.length < 7) throw new RuntimeIOException("ERROR: Invalid format token for serialized token (only " + bits.length + " tokens): " + line);
  // word (consistency: reuse the unescapeSpace helper instead of inlining the replaceAll)
  String word = unescapeSpace(bits[0]);
  token.set(CoreAnnotations.TextAnnotation.class, word);
  token.set(CoreAnnotations.ValueAnnotation.class, word);
  // lemma; also set for zero-length words so such tokens round-trip unchanged
  if (bits[1].length() > 0 || bits[0].length() == 0) {
    String lemma = unescapeSpace(bits[1]);
    token.set(CoreAnnotations.LemmaAnnotation.class, lemma);
  }
  // POS tag
  if (bits[2].length() > 0) token.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[2]);
  // NE tag
  if (bits[3].length() > 0) token.set(CoreAnnotations.NamedEntityTagAnnotation.class, bits[3]);
  // Normalized NE tag
  if (bits[4].length() > 0) token.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, bits[4]);
  // Character offsets
  if (bits[5].length() > 0) token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, Integer.parseInt(bits[5]));
  if (bits[6].length() > 0) token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, Integer.parseInt(bits[6]));
  if (haveExplicitAntecedent) {
    // This block is specific to KBP
    // We may have AntecedentAnnotation
    if (bits.length > 7) {
      String aa = unescapeSpace(bits[7]);
      if (aa.length() > 0) token.set(CoreAnnotations.AntecedentAnnotation.class, aa);
    }
  }
  return token;
}
/**
 * Saves one individual sentence token as a single tab-separated line, in the
 * style of CoNLL. Columns: word, lemma, POS, NER, normalized NER, char begin,
 * char end, plus an optional trailing antecedent column (KBP only). Missing
 * annotations become empty columns; internal whitespace is replaced by the
 * place-holder so columns stay intact.
 *
 * @param token the token to serialize
 * @param haveExplicitAntecedent whether to emit the optional antecedent column
 * @param pw the buffer
 */
private static void saveToken(CoreLabel token, boolean haveExplicitAntecedent, PrintWriter pw) {
  String text = token.get(CoreAnnotations.TextAnnotation.class);
  if (text == null) {
    text = token.get(CoreAnnotations.ValueAnnotation.class);
  }
  String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
  String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
  String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
  String normNer = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
  Integer charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  Integer charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);

  // spaces are used for formatting, so runs of whitespace become the place-holder
  String[] columns = {
      (text == null) ? "" : text.replaceAll("\\s+", SPACE_HOLDER),
      (lemma == null) ? "" : lemma.replaceAll("\\s+", SPACE_HOLDER),
      (pos == null) ? "" : pos,
      (ner == null) ? "" : ner,
      (normNer == null) ? "" : normNer,
      (charBegin == null) ? "" : String.valueOf(charBegin),
      (charEnd == null) ? "" : String.valueOf(charEnd)
  };
  pw.print(String.join("\t", columns));

  if (haveExplicitAntecedent) {
    // This block is specific to KBP: in some cases where we know the entity in
    // focus (i.e., web queries), AntecedentAnnotation is generated.
    // Save it as an optional, always last, column.
    String antecedent = token.get(CoreAnnotations.AntecedentAnnotation.class);
    if (antecedent != null) {
      pw.print("\t");
      pw.print(antecedent.replaceAll("\\s+", SPACE_HOLDER));
    }
  }
}
/**
 * Command-line entry point: with {@code -loadFile} it deserializes and prints a
 * previously saved annotation; with {@code -file} it annotates the text file and
 * serializes the result to {@code file.ser}.
 */
public static void main(String[] args) throws Exception {
  Properties props = StringUtils.argsToProperties(args);
  String file = props.getProperty("file");
  String loadFile = props.getProperty("loadFile");
  if (loadFile != null && !loadFile.isEmpty()) {
    CustomAnnotationSerializer ser = new CustomAnnotationSerializer(false, false);
    // try-with-resources: the stream is closed even if deserialization throws
    try (InputStream is = new FileInputStream(loadFile)) {
      Pair<Annotation, InputStream> pair = ser.read(is);
      pair.second.close();
      Annotation anno = pair.first;
      System.out.println(anno.toShorterString(StringUtils.EMPTY_STRING_ARRAY));
    }
  } else if (file != null && !file.isEmpty()) {
    // construct the (expensive) pipeline only on the path that actually uses it
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    String text = edu.stanford.nlp.io.IOUtils.slurpFile(file);
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    CustomAnnotationSerializer ser = new CustomAnnotationSerializer(false, false);
    try (PrintStream os = new PrintStream(new FileOutputStream(file + ".ser"))) {
      ser.write(doc, os);
    }
    log.info("Serialized annotation saved in " + file + ".ser");
  } else {
    log.info("usage: CustomAnnotationSerializer [-file file] [-loadFile file]");
  }
}
}