nlp.pipeline.SentenceUtil Maven / Gradle / Ivy
Natural language processing toolbox using Sigma knowledge engineering system.
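SentenceUtil is a set of static helpers over a Stanford CoreNLP Annotation. A minimal usage sketch, assuming the companion nlp.pipeline.Pipeline class exposes the toAnnotation helper used in main() below:

Annotation doc = Pipeline.toAnnotation("I went to New York.");
SentenceUtil.printSentences(doc);       // per-token word/POS/NER plus the dependency list
SentenceUtil.printCorefChain(doc);      // coreference chains, when the coref annotator ran
List<String> deps = SentenceUtil.toDependenciesList(doc);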
package nlp.pipeline;
/*
Copyright 2014-2015 IPsoft
Author: Andrei Holub [email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.*;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.util.CoreMap;
import nlp.constants.LangLib;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
//import edu.stanford.nlp.trees.TreeCoreAnnotations.KBestTreesAnnotation;
public class SentenceUtil {
/** ***************************************************************
* Print all the sentences in this document
* CoreMap is essentially a Map that uses class objects as keys and
* has values with custom types
*/
public static void printSentences(Annotation document) {
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence : sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
int count = 1;
for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
// this is the NER label of the token
String ne = token.get(NamedEntityTagAnnotation.class);
List<CoreMap> entity = token.get(MentionsAnnotation.class);
System.out.println(word + "-" + count + "/" + pos + "/" + ne + "/" + entity);
count++;
}
// this is the parse tree of the current sentence
// Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
// this is the Stanford dependency graph of the current sentence
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
System.out.println(dependencies.toList());
//System.out.println(dependencies.toPOSList());
//System.out.println(getFullNamedEntities(sentence));
/*
List<ScoredObject<Tree>> scoredTrees = sentence.get(KBestTreesAnnotation.class);
System.out.println("\nTree Scores:");
for (ScoredObject<Tree> scoredTree : scoredTrees) {
//SemanticGraph graph = SemanticGraphFactory.generateUncollapsedDependencies(scoredTree.object());
System.out.println(scoredTree.score());
}
*/
}
}
/** ***************************************************************
* @return a List of Strings which are concatenated tokens forming
* a single named entity, with a suffix giving the index of the
* entity's first token. For example "I went to New York." would
* return "NewYork-4".
*/
public static ArrayList<String> getFullNamedEntities(CoreMap sentence) {
ArrayList<String> nes = new ArrayList<>();
StringBuffer ne = new StringBuffer();
String neType = "";
int count = 1;
int wordCount = 0; // number of words packed into a given ne
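// Scan tokens left to right, concatenating consecutive tokens that share the same
// non-"O" NER tag; when the tag changes, flush the accumulated entity, suffixed
// with the index of its first token (count - wordCount).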
for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
System.out.println(word);
// this is the NER label of the token
String type = token.get(NamedEntityTagAnnotation.class);
if (neType.isEmpty()) {
neType = type;
if (!type.equals("O"))
ne.append(word);
wordCount = 1;
}
else if (!neType.equals(type)) {
if (!neType.equals("O"))
nes.add(ne.toString() + "-" + (count-wordCount));
ne = new StringBuffer();
if (!type.equals("O")) {
ne.append(word);
wordCount = 1;
}
else
wordCount = 0;
neType = type;
}
else {
if (!type.equals("O"))
ne.append(word);
wordCount++;
}
System.out.println(word + "-" + count + "/" + type + "/" + neType + "/" + ne + "/" + wordCount);
count++;
}
return nes;
}
/** ***************************************************************
* Print the coreference link graph
* Each chain stores a set of mentions that link to each other,
* along with a method for getting the most representative mention
* Both sentence and token offsets start at 1!
*/
public static void printCorefChain(Annotation document) {
Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
if (graph == null)
return;
for (CorefChain cc : graph.values()) {
List<CorefChain.CorefMention> mentions = cc.getMentionsInTextualOrder();
if (mentions.size() > 1) {
for (CorefChain.CorefMention ment : mentions) {
System.out.println(ment.sentNum + " : " + ment.headIndex + " : " + ment.mentionSpan);
}
System.out.println();
}
}
}
//TODO: I'm a monster! Refactor me
/** *************************************************************
* Returns a list of strings that add tense, number, and aspect information
* about the words in the input, where dependenciesList holds dependency
* relation strings such as those produced by toDependenciesList().
* ex. tense(PAST, Verb)
* number(SINGULAR, Noun)
*/
public static List<String> findPOSInformation(List<CoreLabel> tokens, List<String> dependenciesList) {
List<String> posInformation = Lists.newArrayList();
for (CoreLabel label : tokens) {
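// A token that appears as the dependent of an aux(...) relation is itself an
// auxiliary verb; skip it so tense and aspect are attached only to main verbs and nouns.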
Pattern auxPattern = Pattern.compile("aux\\(.*, " + label.toString() + "\\)");
boolean isAux = false;
for (String dep : dependenciesList) {
if (auxPattern.matcher(dep).find()) {
isAux = true;
break;
}
}
if (!isAux) {
boolean progressive = false;
boolean perfect = false;
String pos = label.get(PartOfSpeechAnnotation.class);
if (LangLib.POS_VBD.equals(pos)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
}
else if (LangLib.POS_VBP.equals(pos) || LangLib.POS_VBZ.equals(pos)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT, label.toString()));
}
else if (LangLib.POS_VBG.equals(pos) || LangLib.POS_VB.equals(pos) || LangLib.POS_VBN.equals(pos)) {
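// For base, gerund and participle forms the tense comes from the auxiliary: find
// aux(thisVerb, auxiliary-N), look up token N, and use its lemma ("be", "will", "have")
// and POS tag to decide tense and aspect.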
Pattern reverseAuxPattern = Pattern.compile("aux\\(" + label.toString() + ", .*-(\\d+)\\)");
for (String dep : dependenciesList) {
Matcher auxMatcher = reverseAuxPattern.matcher(dep);
if (auxMatcher.find()) {
int i = Integer.parseInt(auxMatcher.group(1));
CoreLabel t = tokens.get(i-1);
if (t.get(LemmaAnnotation.class).equals("be")) {
if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP) || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT, label.toString()));
}
else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
}
progressive = true;
}
else if (t.get(LemmaAnnotation.class).equals("will")) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_FUTURE, label.toString()));
}
else if (t.get(LemmaAnnotation.class).equals("have")) {
if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP) || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT, label.toString()));
}
else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
}
perfect = true;
}
}
}
}
else if (LangLib.POS_NN.equals(pos) || LangLib.POS_NNP.equals(pos)) {
posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_SINGULAR, label.toString()));
}
else if (LangLib.POS_NNS.equals(pos) || LangLib.POS_NNPS.equals(pos)) {
posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_PLURAL, label.toString()));
}
if (progressive && perfect) {
posInformation.add(makeBinaryRelationship("aspect", LangLib.ASPECT_PROGRESSIVE_PERFECT, label.toString()));
}
else if (progressive) {
posInformation.add(makeBinaryRelationship("aspect", LangLib.ASPECT_PROGRESSIVE, label.toString()));
}
else if (perfect) {
posInformation.add(makeBinaryRelationship("aspect", LangLib.ASPECT_PERFECT, label.toString()));
}
}
}
return posInformation;
}
//TODO: see if this exists somewhere else or move to utility class
/** *************************************************************
* Build a binary relationship string of the form relationship(argument1, argument2),
* e.g. makeBinaryRelationship("tense", "PAST", "went-2") returns "tense(PAST, went-2)".
*/
public static String makeBinaryRelationship(String relationship, String argument1, String argument2) {
StringBuilder sb = new StringBuilder(relationship);
sb.append("(");
sb.append(argument1);
sb.append(", ");
sb.append(argument2);
sb.append(")");
return sb.toString();
}
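/** ***************************************************************
* @return the last sentence in the document, or null if the document
* contains no sentences.
*/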
public static CoreMap getLastSentence(Annotation document) {
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
if(!sentences.isEmpty()) {
return Iterables.getLast(sentences);
} else {
return null;
}
}
/** ***************************************************************
* Collect the dependency relations for all sentences in a document.
*/
public static ArrayList<String> toDependenciesList(Annotation document) {
return toDependenciesList(document.get(SentencesAnnotation.class));
}
/** ***************************************************************
* Convert each sentence's collapsed CC-processed dependency graph into a list
* of relation strings such as nsubj(went-2, I-1), dropping punct relations.
*/
public static ArrayList<String> toDependenciesList(List<CoreMap> sentences) {
ArrayList<String> results = Lists.newArrayList();
for (CoreMap sentence : sentences) {
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
// accumulate the relations of every sentence instead of overwriting the list on each iteration
results.addAll(Lists.newArrayList(dependencies.toList().split("\n")));
}
ArrayList<String> deps = new ArrayList<>();
for (String dep : results)
if (!dep.startsWith("punct(")) // TODO: handle punctuations instead of just removing - AP
deps.add(dep);
return deps;
}
/** ***************************************************************
* Annotate a sample sentence and print its coreference chains and per-token annotations.
*/
public static void main(String[] args) {
Annotation a = Pipeline.toAnnotation("I went to New York and had cookies and cream in the Empire State Building in January with Mary.");
printCorefChain(a);
printSentences(a);
}
}