// de.unihd.dbs.uima.annotator.alllanguagestokenizer.AllLanguagesTokenizer Maven / Gradle / Ivy
package de.unihd.dbs.uima.annotator.alllanguagestokenizer;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
public class AllLanguagesTokenizer extends JCasAnnotator_ImplBase {
private String PChar = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'";
private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›";
private String FClitic = "";
private String PClitic = "";
public AllLanguagesTokenizer() {
FClitic += "'(s|re|ve|d|m|em|ll)|n't";
PClitic += "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'";
PClitic += "|[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";
FClitic += "|-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
FClitic += "|-la|-las|-lo|-los|-nos";
}
public void process(JCas jcas) throws AnalysisEngineProcessException {
tokenize(jcas);
sentenceTokenize(jcas);
}
public List tokenize(JCas jcas) {
StringBuilder outBuf = new StringBuilder();
for(String text : jcas.getDocumentText().split("\n")) {
// replace newlines and tab characters with blanks
text = text.replaceAll("[\r\n\t]", " ");
// replace blanks within SGML tags
text = text.replaceAll("(<[^<> ]*) ([^<>]*>)", "$1\377$2");
// replace whitespace with a special character
text = text.replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "\376");
// restore SGML tags
text = text.replaceAll("\377", " ");
text = text.replaceAll("\376", "\377");
// prepare SGML-Tags for tokenization
text = text.replaceAll("(<[^<>]*>)", "\377$1\377");
text = text.replaceAll("^\377", "");
text = text.replaceAll("\377$", "");
text = text.replaceAll("\377\377\377*", "\377");
String[] texts = text.split("\377");
for(String line : texts) {
if(line.matches("^<.*>$")) {
// SGML tag
outBuf.append(line + "\n");
} else {
// add a blank at the beginning and the end of each segment
line = " " + line + " ";
// insert missing blanks after punctuation
line = line.replaceAll("\\.\\.\\.", " ... ");
line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
String[] lines = line.split(" ");
for(String token : lines) {
// remove some whitespaces that \s doesn't catch
if(token.equals(""))
continue;
String suffix = "";
// separate punctuation and parentheses from words
Boolean finished = false;
Matcher m;
do {
finished = true;
// cut off preceding punctuation
m = Pattern.compile("^([" + PChar + "])(.)").matcher(token);
if(m.find()) {
token = token.replaceAll("^([" + PChar + "])(.)", "$2");
outBuf.append(m.group(1) + "\n");
finished = false;
}
// cut off trailing punctuation
m = Pattern.compile("(.)([" + FChar + "])$").matcher(token);
if(m.find()) {
token = token.replaceAll("(.)([" + FChar + "])$", "$1");
suffix = m.group(2) + "\n" + suffix;
finished = false;
}
// cut off trailing periods if punctuation precedes
m = Pattern.compile("([" + FChar + "])\\.$").matcher(token);
if(m.find()) {
token = token.replaceAll("([" + FChar + "])\\.$", "");
suffix = ".\n" + suffix;
if(token.equals("")) {
token = m.group(1);
} else {
suffix = m.group(1) + "\n" + suffix;
}
finished = false;
}
} while(!finished);
/* TODO:commented out because those are language-specific
// handle explicitly listed tokens
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
// abbreviations of the form A. or U.S.A.
if(token.matches("^([A-Za-z-]\\.)+$")) {
outBuf.append(token + "\n" + suffix);
continue;
}
// disambiguate periods
m = Pattern.compile("^(..*)\\.$").matcher(token);
if(m.matches() && !line.equals("...")
/* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) {
token = m.group(1);
suffix = ".\n" + suffix;
/* TODO:commented out because those are language-specific
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
}
// cut off clitics
while(true) {
m = Pattern.compile("^(--)(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(--)(.)", "$2");
outBuf.append(m.group(1) + "\n");
}
if(!PClitic.equals("")) {
while(true) {
m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(" + PClitic + ")(.)", "$2");
outBuf.append(m.group(1) + "\n");
}
}
while(true) {
m = Pattern.compile("^(--)(.)").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("^(--)(.)", "$1");
suffix = m.group(2) + "\n" + suffix;
}
if(!FClitic.equals("")) {
while(true) {
m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token);
if(!m.find()) {
break;
}
token = token.replaceAll("(.)(" + FClitic + ")$", "$1");
suffix = m.group(2) + "\n" + suffix;
}
}
outBuf.append(token + "\n" + suffix);
}
}
}
}
// find the tokens in the original text and create token annotations
LinkedList outList = new LinkedList();
String origText = jcas.getDocumentText();
Integer origTextOffset = 0;
for(String s : outBuf.toString().split("\n")) {
Integer begin = origText.indexOf(s, origTextOffset);
Integer end = begin + s.length();
Token t = new Token(jcas);
t.setBegin(begin);
t.setPos("");
t.setEnd(end);
t.addToIndexes();
origTextOffset = t.getEnd();
outList.add(t);
}
return outList;
}
public List sentenceTokenize(JCas jcas) {
List outList = new LinkedList();
FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator();
Sentence s = new Sentence(jcas);
Boolean sentenceStarted = false;
Token tOld = null;
Token t = null;
while(tokIt.hasNext()) {
if (!(t == null)){
tOld = t;
}
t = (Token) tokIt.next();
// set sentence beginning
if(sentenceStarted == false) {
sentenceStarted = true;
s.setBegin(t.getBegin());
}
/* detect sentence ends
* second character class taken from: http://en.wikipedia.org/wiki/Quotation_mark#Curved_quotes_and_Unicode
*/
if(!tokIt.hasNext() ||
(t.getCoveredText().matches("[.:!\\?]+") &&
(!((tOld.getCoveredText().matches("[\\d]+")) || ((jcas.getDocumentText().substring(t.getEnd()).length() > 2) && (jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3)).matches(" [A-Z][.-]")))))){
// ((!(tOld.getCoveredText().matches("[\\d]+")))) && (!((jcas.getDocumentText().substring(t.getEnd())).matches("^[\\s]*"))))) {
// (t.getCoveredText().matches("[.:!\\?]+") && (!(tOld.getCoveredText().matches("[\\d]+"))))) { // das funktioniert ok
sentenceStarted = false;
s.setEnd(t.getEnd());
// check for whether the punctuation mark is followed by a closing quotation mark
if(tokIt.hasNext()) {
Token tNext = (Token) tokIt.next();
if(tNext.getCoveredText().matches("[»’'\"‛”‟›〞』」﹄"'」﹂]+")) {
s.setEnd(tNext.getEnd());
} else {
tokIt.moveToPrevious();
}
}
s.addToIndexes();
outList.add(s);
s = new Sentence(jcas);
}
}
return outList;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy