/**
* This is a preprocessing engine for use in a UIMA pipeline. It invokes the
* JVnTextPro API, which is expected to be available on the classpath.
* See the usage sketch after the class for one way to wire it into a pipeline.
*/
package de.unihd.dbs.uima.annotator.jvntextprowrapper;
import java.io.File;
import java.util.LinkedList;
import java.util.List;
import jmaxent.Classification;
import jvnpostag.POSContextGenerator;
import jvnpostag.POSDataReader;
import jvnsegmenter.CRFSegmenter;
import jvnsensegmenter.JVnSenSegmenter;
import jvntextpro.JVnTextPro;
import jvntextpro.conversion.CompositeUnicode2Unicode;
import jvntextpro.data.DataReader;
import jvntextpro.data.TWord;
import jvntextpro.data.TaggingData;
import jvntextpro.util.StringUtils;
import jvntokenizer.PennTokenizer;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
/**
* @author Julian Zell
*
*/
public class JVnTextProWrapper extends JCasAnnotator_ImplBase {
private Class<?> component = this.getClass();
// names of the configuration parameters in the wrapper's descriptor file
public static final String PARAM_SENTSEGMODEL_PATH = "sent_model_path";
public static final String PARAM_WORDSEGMODEL_PATH = "word_model_path";
public static final String PARAM_POSMODEL_PATH = "pos_model_path";
public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
// switches for annotation parameters
private Boolean annotate_tokens = false;
private Boolean annotate_sentences = false;
private Boolean annotate_partofspeech = false;
private String sentModelPath = null;
private String wordModelPath = null;
private String posModelPath = null;
// private jvntextpro objects
private JVnSenSegmenter vnSenSegmenter = new JVnSenSegmenter();
private CRFSegmenter vnSegmenter = new CRFSegmenter();
private DataReader reader = new POSDataReader();
private TaggingData dataTagger = new TaggingData();
private Classification classifier = null;
/**
* Initialization method that reads the configuration values from the UIMA context
* and loads the sentence segmentation, word segmentation and POS tagging models.
*/
public void initialize(UimaContext aContext) {
// get configuration from the descriptor
annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
sentModelPath = (String) aContext.getConfigParameterValue(PARAM_SENTSEGMODEL_PATH);
wordModelPath = (String) aContext.getConfigParameterValue(PARAM_WORDSEGMODEL_PATH);
posModelPath = (String) aContext.getConfigParameterValue(PARAM_POSMODEL_PATH);
if(sentModelPath != null)
if(!vnSenSegmenter.init(sentModelPath)) {
Logger.printError(component, "Error initializing the sentence segmenter model: " + sentModelPath);
System.exit(-1);
}
if(wordModelPath != null)
try {
vnSegmenter.init(wordModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the word segmenter model: " + wordModelPath);
System.exit(-1);
}
if(posModelPath != null)
try {
dataTagger.addContextGenerator(new POSContextGenerator(posModelPath + File.separator + "featuretemplate.xml"));
classifier = new Classification(posModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the POS tagging model: " + posModelPath);
System.exit(-1);
}
}
/**
* Processes a document's CAS: normalizes the text, segments sentences and words,
* runs POS tagging, and adds the requested annotations to the CAS.
*/
public void process(JCas jcas) throws AnalysisEngineProcessException {
CompositeUnicode2Unicode convertor = new CompositeUnicode2Unicode();
String origText = jcas.getDocumentText();
// normalize composite Unicode, then segment sentences, tokenize, segment words
// and run JVnTextPro's post-processing on the document text
final String convertedText = convertor.convert(origText);
final String senSegmentedText = vnSenSegmenter.senSegment(convertedText).trim();
final String tokenizedText = PennTokenizer.tokenize(senSegmentedText).trim();
final String segmentedText = vnSegmenter.segmenting(tokenizedText);
final String postProcessedString = (new JVnTextPro()).postProcessing(segmentedText).trim();
// POS-tag the preprocessed text and collect the tagged words in document order
List<jvntextpro.data.Sentence> posSentences = jvnTagging(postProcessedString);
LinkedList<TWord> posWords = new LinkedList<TWord>();
for(jvntextpro.data.Sentence sent : posSentences)
for(Integer i = 0; i < sent.size(); ++i)
posWords.add(sent.getTWordAt(i));
/*
* annotate sentences
*/
if(annotate_sentences) {
Integer offset = 0;
String[] sentences = senSegmentedText.split("\n");
for(String sentence : sentences) {
Sentence s = new Sentence(jcas);
sentence = sentence.trim();
Integer sentOffset = origText.indexOf(sentence, offset);
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
sentence = sentence.substring(0, sentence.length() - 1).trim();
sentOffset = origText.indexOf(sentence, offset);
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
System.err.println("Sentence \"" + sentence + "\" was not found in the original text.");
}
}
}
}
/*
* annotate tokens
*/
if(annotate_tokens) {
Integer offset = 0;
String[] tokens = postProcessedString.split("\\s+");
for(Integer i = 0; i < tokens.length; ++i) {
final String token = tokens[i].trim();
String thisPosTag = null;
if(posWords.size() >= i + 1) {
if(!token.equals(posWords.get(i).getWord())) {
System.err.println("Couldn't match token: " + token
+ " to expected word/tag combination " + posWords.get(i).getWord());
} else {
thisPosTag = posWords.get(i).getTag();
}
}
Integer tokenOffset = origText.indexOf(token, offset);
Token t = new Token(jcas);
if(tokenOffset >= 0 ) {
/*
* first, try to find the string in the form the tokenizer returned it
*/
t.setBegin(tokenOffset);
offset = tokenOffset + token.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_partofspeech) t.setPos(thisPosTag);
t.addToIndexes();
} else {
/*
* straight up token not found.
* assume that it is a compound word (e.g. some_thing)
* and try to find it in the original text again; first using
* a "_" -> " " replacement, then try just removing the underscore.
*/
String underscoreToSpaceToken = token.replaceAll("_", " ");
Integer spaceOffset = origText.indexOf(underscoreToSpaceToken, offset);
String underscoreRemovedToken = token.replaceAll("_", "");
Integer removedOffset = origText.indexOf(underscoreRemovedToken, offset);
/*
* both the space-replaced and the underscore-removed variants were found;
* use whichever occurs earlier in the original text (the space-replaced
* form wins ties).
*/
if(removedOffset >= 0 && spaceOffset >= 0) {
if(removedOffset >= spaceOffset) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_partofspeech) t.setPos(thisPosTag);
t.addToIndexes();
} else {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_partofspeech) t.setPos(thisPosTag);
t.addToIndexes();
}
}
/*
* underscore removed was found, underscore replaced to space was not
*/
else if(removedOffset >= 0 && spaceOffset == -1) {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_partofspeech) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* underscore removed was not found, underscore replaced was found
*/
else if(removedOffset == -1 && spaceOffset >= 0) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_partofspeech) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* there is no hope of finding this token
*/
else {
System.err.println("Token \"" + token + "\" was not found in the original text.");
}
}
}
}
}
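/**
* Splits leading and trailing punctuation characters off a token into separate
* punctuation tokens, recursing until no stray punctuation remains; e.g. a token
* covering "(word)," ends up as "(", "word", ")" and ",". Punctuation tokens get
* the character itself as their POS tag if part-of-speech annotation is enabled.
* @param t token to sanitize
* @param jcas CAS the token belongs to
* @return whether punctuation was split off in this invocation
*/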
private Boolean sanitizeToken(Token t, JCas jcas) {
Boolean workDone = false;
// check the beginning of the token for punctuation and split off into a new token
if(t.getCoveredText().matches("^\\p{Punct}.*") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(0);
t.setBegin(t.getBegin() + 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
puncToken.setBegin(t.getBegin() - 1);
puncToken.setEnd(t.getBegin());
// check if we want to annotate pos or the token itself
if(annotate_partofspeech)
puncToken.setPos(""+thisChar);
if(annotate_tokens)
puncToken.addToIndexes();
workDone = true;
}
// check the end of the token for punctuation and split off into a new token
if(t.getCoveredText().matches(".*\\p{Punct}$") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(t.getEnd() - t.getBegin() - 1);
t.setEnd(t.getEnd() - 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
puncToken.setBegin(t.getEnd());
puncToken.setEnd(t.getEnd() + 1);
// check if we want to annotate pos or the token itself
if(annotate_partofspeech)
puncToken.setPos(""+thisChar);
if(annotate_tokens)
puncToken.addToIndexes();
workDone = true;
}
// get into a recursion to sanitize tokens as long as there are stray ones
if(workDone) {
sanitizeToken(t, jcas); // keep splitting until no stray punctuation remains
}
return workDone;
}
/**
* Adapted from the JVnTextPro package so that it returns the tagged sentences
* instead of building an output string.
* @param instr input string to be tagged
* @return list of tagged sentences
*/
public List<jvntextpro.data.Sentence> jvnTagging(String instr) {
List<jvntextpro.data.Sentence> data = reader.readString(instr);
for (int i = 0; i < data.size(); ++i) {
jvntextpro.data.Sentence sent = data.get(i);
for (int j = 0; j < sent.size(); ++j) {
String [] cps = dataTagger.getContext(sent, j);
String label = classifier.classify(cps);
if (label.equalsIgnoreCase("Mrk")) {
if (StringUtils.isPunc(sent.getWordAt(j)))
label = sent.getWordAt(j);
else label = "X";
}
sent.getTWordAt(j).setTag(label);
}
}
return data;
}
}
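/*
* Usage sketch (illustrative, not part of the original wrapper): one way this
* annotator might be wired into a uimaFIT pipeline. It assumes uimafit-core is
* on the classpath, the HeidelTime type system is visible to JCasFactory, and
* the model paths below are placeholders for locally installed JVnTextPro models.
*
* import org.apache.uima.analysis_engine.AnalysisEngineDescription;
* import org.apache.uima.fit.factory.AnalysisEngineFactory;
* import org.apache.uima.fit.factory.JCasFactory;
* import org.apache.uima.fit.pipeline.SimplePipeline;
* import org.apache.uima.jcas.JCas;
*
* public class JVnTextProWrapperExample {
*     public static void main(String[] args) throws Exception {
*         // configure the wrapper with model paths and annotation switches
*         AnalysisEngineDescription jvnTextPro = AnalysisEngineFactory.createEngineDescription(
*             JVnTextProWrapper.class,
*             JVnTextProWrapper.PARAM_SENTSEGMODEL_PATH, "/path/to/jvnsensegmenter/models",
*             JVnTextProWrapper.PARAM_WORDSEGMODEL_PATH, "/path/to/jvnsegmenter/models",
*             JVnTextProWrapper.PARAM_POSMODEL_PATH, "/path/to/jvnpostag/models",
*             JVnTextProWrapper.PARAM_ANNOTATE_TOKENS, true,
*             JVnTextProWrapper.PARAM_ANNOTATE_SENTENCES, true,
*             JVnTextProWrapper.PARAM_ANNOTATE_PARTOFSPEECH, true);
*
*         // create a CAS with Vietnamese document text and run the annotator on it
*         JCas jcas = JCasFactory.createJCas();
*         jcas.setDocumentText("Hà Nội là thủ đô của Việt Nam.");
*         SimplePipeline.runPipeline(jcas, jvnTextPro);
*
*         // the CAS now holds Sentence and Token annotations from the HeidelTime type system
*     }
* }
*/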