de.unihd.dbs.uima.annotator.alllanguagestokenizer.AllLanguagesTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
package de.unihd.dbs.uima.annotator.alllanguagestokenizer;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
public class AllLanguagesTokenizer extends JCasAnnotator_ImplBase {
private String PChar = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'";
private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›";
private String FClitic = "";
private String PClitic = "";
public AllLanguagesTokenizer() {
FClitic += "'(s|re|ve|d|m|em|ll)|n't";
PClitic += "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'";
PClitic += "|[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";
FClitic += "|-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
FClitic += "|-la|-las|-lo|-los|-nos";
}
public void process(JCas jcas) throws AnalysisEngineProcessException {
tokenize(jcas);
sentenceTokenize(jcas);
}
public List tokenize(JCas jcas) {
StringBuilder outBuf = new StringBuilder();
for(String text : jcas.getDocumentText().split("\n")) {
// replace newlines and tab characters with blanks
text = text.replaceAll("[\r\n\t]", " ");
// replace blanks within SGML tags
text = text.replaceAll("(<[^<> ]*) ([^<>]*>)", "$1\377$2");
// replace whitespace with a special character
text = text.replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "\376");
// restore SGML tags
text = text.replaceAll("\377", " ");
text = text.replaceAll("\376", "\377");
// prepare SGML-Tags for tokenization
text = text.replaceAll("(<[^<>]*>)", "\377$1\377");
text = text.replaceAll("^\377", "");
text = text.replaceAll("\377$", "");
text = text.replaceAll("\377\377\377*", "\377");
String[] texts = text.split("\377");
for(String line : texts) {
if(line.matches("^<.*>$")) {
// SGML tag
outBuf.append(line + "\n");
} else {
// add a blank at the beginning and the end of each segment
line = " " + line + " ";
// insert missing blanks after punctuation
line = line.replaceAll("\\.\\.\\.", " ... ");
line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
String[] lines = line.split(" ");
for(String token : lines) {
// remove some whitespaces that \s doesn't catch
if(token.equals(""))
continue;
String suffix = "";
// separate punctuation and parentheses from words
Boolean finished = false;
Matcher m;
do {
finished = true;
// cut off preceding punctuation
m = Pattern.compile("^([" + PChar + "])(.)").matcher(token);
if(m.find()) {
token = token.replaceAll("^([" + PChar + "])(.)", "$2");
outBuf.append(m.group(1) + "\n");
finished = false;
}
// cut off trailing punctuation
m = Pattern.compile("(.)([" + FChar + "])$").matcher(token);
if(m.find()) {
token = token.replaceAll("(.)([" + FChar + "])$", "$1");
suffix = m.group(2) + "\n" + suffix;
finished = false;
}
// cut off trailing periods if punctuation precedes
m = Pattern.compile("([" + FChar + "])\\.$").matcher(token);
if(m.find()) {
token = token.replaceAll("([" + FChar + "])\\.$", "");
suffix = ".\n" + suffix;
if(token.equals("")) {
token = m.group(1);
} else {
suffix = m.group(1) + "\n" + suffix;
}
finished = false;
}
} while(!finished);
/* TODO:commented out because those are language-specific
// handle explicitly listed tokens
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
// abbreviations of the form A. or U.S.A.
if(token.matches("^([A-Za-z-]\\.)+$")) {
outBuf.append(token + "\n" + suffix);
continue;
}
// disambiguate periods
m = Pattern.compile("^(..*)\\.$").matcher(token);
if(m.matches() && !line.equals("...")
/* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) {
token = m.group(1);
suffix = ".\n" + suffix;
/* TODO:commented out because those are language-specific
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
}
// cut off clitics
while(true) {
m = Pattern.compile("^(--)(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(--)(.)", "$2");
outBuf.append(m.group(1) + "\n");
}
if(!PClitic.equals("")) {
while(true) {
m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(" + PClitic + ")(.)", "$2");
outBuf.append(m.group(1) + "\n");
}
}
while(true) {
m = Pattern.compile("^(--)(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(--)(.)", "$1");
suffix = m.group(2) + "\n" + suffix;
}
if(!FClitic.equals("")) {
while(true) {
m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("(.)(" + FClitic + ")$", "$1");
suffix = m.group(2) + "\n" + suffix;
}
}
outBuf.append(token + "\n" + suffix);
}
}
}
}
// find the tokens in the original text and create token annotations
LinkedList outList = new LinkedList();
String origText = jcas.getDocumentText();
Integer origTextOffset = 0;
for(String s : outBuf.toString().split("\n")) {
Integer begin = origText.indexOf(s, origTextOffset);
Integer end = begin + s.length();
Token t = new Token(jcas);
t.setBegin(begin);
t.setPos("");
t.setEnd(end);
t.addToIndexes();
origTextOffset = t.getEnd();
outList.add(t);
}
return outList;
}
public List sentenceTokenize(JCas jcas) {
List outList = new LinkedList();
FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator();
Sentence s = new Sentence(jcas);
Boolean sentenceStarted = false;
Token tOld = null;
Token t = null;
while(tokIt.hasNext()) {
if (!(t == null)){
tOld = t;
}
t = (Token) tokIt.next();
// set sentence beginning
if(sentenceStarted == false) {
sentenceStarted = true;
s.setBegin(t.getBegin());
}
/* detect sentence ends
* second character class taken from: http://en.wikipedia.org/wiki/Quotation_mark#Curved_quotes_and_Unicode
*/
if(!tokIt.hasNext() ||
(t.getCoveredText().matches("[.:!\\?]+") &&
(!((tOld.getCoveredText().matches("[\\d]+")) || ((jcas.getDocumentText().substring(t.getEnd()).length() > 2) && (jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3)).matches(" [A-Z][.-]")))))){
// ((!(tOld.getCoveredText().matches("[\\d]+")))) && (!((jcas.getDocumentText().substring(t.getEnd())).matches("^[\\s]*"))))) {
// (t.getCoveredText().matches("[.:!\\?]+") && (!(tOld.getCoveredText().matches("[\\d]+"))))) { // das funktioniert ok
sentenceStarted = false;
s.setEnd(t.getEnd());
// check for whether the punctuation mark is followed by a closing quotation mark
if(tokIt.hasNext()) {
Token tNext = (Token) tokIt.next();
if(tNext.getCoveredText().matches("[»’'\"‛”‟›〞』」﹄"'」﹂]+")) {
s.setEnd(tNext.getEnd());
} else {
tokIt.moveToPrevious();
}
}
s.addToIndexes();
outList.add(s);
s = new Sentence(jcas);
}
}
return outList;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy