// de.unihd.dbs.uima.annotator.alllanguagestokenizer.AllLanguagesTokenizer
// (listing header text: "Maven / Gradle / Ivy", "Go to download")
package de.unihd.dbs.uima.annotator.alllanguagestokenizer;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;

import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;


public class AllLanguagesTokenizer extends JCasAnnotator_ImplBase {
	private String PChar = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'";
	private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›";
	private String FClitic = "";
	private String PClitic = "";
	
	public AllLanguagesTokenizer() {
		FClitic += "'(s|re|ve|d|m|em|ll)|n't";
		PClitic += "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'";
		PClitic += "|[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";
		FClitic += "|-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
		FClitic += "|-la|-las|-lo|-los|-nos";
	}

	public void process(JCas jcas) throws AnalysisEngineProcessException {
		tokenize(jcas);
		
		sentenceTokenize(jcas);
	}

	
	public List tokenize(JCas jcas) {
		StringBuilder outBuf = new StringBuilder();
		
		for(String text : jcas.getDocumentText().split("\n")) {
			// replace newlines and tab characters with blanks
			text = text.replaceAll("[\r\n\t]", " ");
			// replace blanks within SGML tags
			text = text.replaceAll("(<[^<> ]*) ([^<>]*>)", "$1\377$2");
			// replace whitespace with a special character
			text = text.replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "\376");
			// restore SGML tags
			text = text.replaceAll("\377", " ");
			text = text.replaceAll("\376", "\377");
			// prepare SGML-Tags for tokenization
			text = text.replaceAll("(<[^<>]*>)", "\377$1\377");
			text = text.replaceAll("^\377", "");
			text = text.replaceAll("\377$", "");
			text = text.replaceAll("\377\377\377*", "\377");
			
			String[] texts = text.split("\377");
			
			for(String line : texts) {
				if(line.matches("^<.*>$")) {
					// SGML tag
					outBuf.append(line + "\n");
				} else {
					// add a blank at the beginning and the end of each segment
					line = " " + line + " ";
					
					// insert missing blanks after punctuation
					line = line.replaceAll("\\.\\.\\.", " ... ");
					line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
					line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
					
					String[] lines = line.split(" ");
					
					for(String token : lines) {
						// remove some whitespaces that \s doesn't catch
						if(token.equals(""))
							continue;
						
						String suffix = "";
						
						// separate punctuation and parentheses from words
						Boolean finished = false;
						Matcher m;
						do {
							finished = true;
							
							// cut off preceding punctuation
							m = Pattern.compile("^([" + PChar + "])(.)").matcher(token);
							if(m.find()) {
								token = token.replaceAll("^([" + PChar + "])(.)", "$2");
								outBuf.append(m.group(1) + "\n");
								finished = false;
							}
							
							// cut off trailing punctuation
							m = Pattern.compile("(.)([" + FChar + "])$").matcher(token);
							if(m.find()) {
								token = token.replaceAll("(.)([" + FChar + "])$", "$1");
								suffix = m.group(2) + "\n" + suffix;
								finished = false;
							}
							
							// cut off trailing periods if punctuation precedes
							m = Pattern.compile("([" + FChar + "])\\.$").matcher(token);
							if(m.find()) {
								token = token.replaceAll("([" + FChar + "])\\.$", "");
								suffix = ".\n" + suffix;
								
								if(token.equals("")) {
									token = m.group(1);
								} else {
									suffix = m.group(1) + "\n" + suffix;
								}
								
								finished = false;
							}
						} while(!finished);
						/* TODO:commented out because those are language-specific
						// handle explicitly listed tokens
						if(abbreviations.contains(token)) {
							outBuf.append(token + "\n" + suffix);
							continue;
						}*/
						
						// abbreviations of the form A. or U.S.A.
						if(token.matches("^([A-Za-z-]\\.)+$")) {
							outBuf.append(token + "\n" + suffix);
							continue;
						}
						
						// disambiguate periods
						m = Pattern.compile("^(..*)\\.$").matcher(token);
						if(m.matches() && !line.equals("...") 
								/* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) {
							token = m.group(1);
							suffix = ".\n" + suffix;
							/* TODO:commented out because those are language-specific
							if(abbreviations.contains(token)) {
								outBuf.append(token + "\n" + suffix);
								continue;
							}*/
						}
						
						// cut off clitics
						while(true) {
							m = Pattern.compile("^(--)(.)").matcher(token);
							
							if(!m.find()) {
								break;
							}
							
							token = token.replaceAll("^(--)(.)", "$2");
							outBuf.append(m.group(1) + "\n");
						}
						if(!PClitic.equals("")) {
							while(true) {
								m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token);
								
								if(!m.find()) {
									break;
								}
								
								token = token.replaceAll("^(" + PClitic + ")(.)", "$2");
								outBuf.append(m.group(1) + "\n");
							}
						}
	
						while(true) {
							m = Pattern.compile("^(--)(.)").matcher(token);
							
							if(!m.find()) {
								break;
							}
							
							token = token.replaceAll("^(--)(.)", "$1");
							suffix = m.group(2) + "\n" + suffix;
						}
						if(!FClitic.equals("")) {
							while(true) {
								m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token);
								
								if(!m.find()) {
									break;
								}
								
								token = token.replaceAll("(.)(" + FClitic + ")$", "$1");
								suffix = m.group(2) + "\n" + suffix;
							}
						}
						outBuf.append(token + "\n" + suffix);
					}
				}
			}
		}
		
		// find the tokens in the original text and create token annotations
		LinkedList outList = new LinkedList();
		String origText = jcas.getDocumentText();
		Integer origTextOffset = 0;
		
		for(String s : outBuf.toString().split("\n")) {
			Integer begin = origText.indexOf(s, origTextOffset);
			Integer end = begin + s.length();
			
			Token t = new Token(jcas);
			t.setBegin(begin);
			t.setPos("");
			t.setEnd(end);
			
			t.addToIndexes();
			
			origTextOffset = t.getEnd();
			
			outList.add(t);
		}
		
		return outList;
	}
	
	public List sentenceTokenize(JCas jcas) {
		List outList = new LinkedList();
		FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator();
		
		Sentence s = new Sentence(jcas);
		Boolean sentenceStarted = false;
		Token tOld = null;
		Token t = null;
		while(tokIt.hasNext()) {
			if (!(t == null)){
				tOld = t;
			}
			t = (Token) tokIt.next();
			
			// set sentence beginning
			if(sentenceStarted == false) {
				sentenceStarted = true;
				
				s.setBegin(t.getBegin());
			}
			
			/* detect sentence ends
			 * second character class taken from: http://en.wikipedia.org/wiki/Quotation_mark#Curved_quotes_and_Unicode
			 */
			if(!tokIt.hasNext() ||
					(t.getCoveredText().matches("[.:!\\?]+") && 
							(!((tOld.getCoveredText().matches("[\\d]+")) || ((jcas.getDocumentText().substring(t.getEnd()).length() > 2) && (jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3)).matches(" [A-Z][.-]")))))){
//							((!(tOld.getCoveredText().matches("[\\d]+")))) && (!((jcas.getDocumentText().substring(t.getEnd())).matches("^[\\s]*"))))) {
//					(t.getCoveredText().matches("[.:!\\?]+") && (!(tOld.getCoveredText().matches("[\\d]+"))))) { // das funktioniert ok
				sentenceStarted = false;
				s.setEnd(t.getEnd());

				// check for whether the punctuation mark is followed by a closing quotation mark
				if(tokIt.hasNext()) {
					Token tNext = (Token) tokIt.next();
					
					if(tNext.getCoveredText().matches("[»’'\"‛”‟›〞』」﹄＂＇｣﹂]+")) {
						s.setEnd(tNext.getEnd());
					} else {
						tokIt.moveToPrevious();
					}
				}
				
				s.addToIndexes();
				
				outList.add(s);
				
				s = new Sentence(jcas);
			}
		}
		
		return outList;
	}
}