// de.unihd.dbs.uima.annotator.jvntextprowrapper.JVnTextProWrapper (Maven / Gradle / Ivy)
/**
* This is a preprocessing engine for use in a UIMA pipeline. It will invoke
* the JVnTextPro api that is supposed to be available in the classpath.
*/
package de.unihd.dbs.uima.annotator.jvntextprowrapper;
import java.io.File;
import java.util.LinkedList;
import java.util.List;
import jmaxent.Classification;
import jvnpostag.POSContextGenerator;
import jvnpostag.POSDataReader;
import jvnsegmenter.CRFSegmenter;
import jvnsensegmenter.JVnSenSegmenter;
import jvntextpro.JVnTextPro;
import jvntextpro.conversion.CompositeUnicode2Unicode;
import jvntextpro.data.DataReader;
import jvntextpro.data.TWord;
import jvntextpro.data.TaggingData;
import jvntextpro.util.StringUtils;
import jvntokenizer.PennTokenizer;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
/**
* @author Julian Zell
*
*/
public class JVnTextProWrapper extends JCasAnnotator_ImplBase {
private Class> component = this.getClass();
// definitions of what names these parameters have in the wrapper's descriptor file
public static final String PARAM_SENTSEGMODEL_PATH = "sent_model_path";
public static final String PARAM_WORDSEGMODEL_PATH = "word_model_path";
public static final String PARAM_POSMODEL_PATH = "pos_model_path";
public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens";
public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
// switches for annotation parameters
private Boolean annotate_tokens = false;
private Boolean annotate_sentences = false;
private Boolean annotate_partofspeech = false;
private String sentModelPath = null;
private String wordModelPath = null;
private String posModelPath = null;
// private jvntextpro objects
JVnSenSegmenter vnSenSegmenter = new JVnSenSegmenter();
CRFSegmenter vnSegmenter = new CRFSegmenter();
DataReader reader = new POSDataReader();
TaggingData dataTagger = new TaggingData();
Classification classifier = null;
/**
* initialization method where we fill configuration values and check some prerequisites
*/
public void initialize(UimaContext aContext) {
// get configuration from the descriptor
annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
sentModelPath = (String) aContext.getConfigParameterValue(PARAM_SENTSEGMODEL_PATH);
wordModelPath = (String) aContext.getConfigParameterValue(PARAM_WORDSEGMODEL_PATH);
posModelPath = (String) aContext.getConfigParameterValue(PARAM_POSMODEL_PATH);
if(sentModelPath != null)
if(!vnSenSegmenter.init(sentModelPath)) {
Logger.printError(component, "Error initializing the sentence segmenter model: " + sentModelPath);
System.exit(-1);
}
if(wordModelPath != null)
try {
vnSegmenter.init(wordModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the word segmenter model: " + wordModelPath);
System.exit(-1);
}
if(posModelPath != null)
try {
dataTagger.addContextGenerator(new POSContextGenerator(posModelPath + File.separator + "featuretemplate.xml"));
classifier = new Classification(posModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the POS tagging model: " + posModelPath);
System.exit(-1);
}
}
/**
* Method that gets called to process the documents' cas objects
*/
public void process(JCas jcas) throws AnalysisEngineProcessException {
CompositeUnicode2Unicode convertor = new CompositeUnicode2Unicode();
String origText = jcas.getDocumentText();
final String convertedText = convertor.convert(origText);
final String senSegmentedText = vnSenSegmenter.senSegment(convertedText).trim();
final String tokenizedText = PennTokenizer.tokenize(senSegmentedText).trim();
final String segmentedText = vnSegmenter.segmenting(tokenizedText);
final String postProcessedString = (new JVnTextPro()).postProcessing(segmentedText).trim();
List posSentences = jvnTagging(postProcessedString);
LinkedList posWords = new LinkedList();
for(jvntextpro.data.Sentence sent : posSentences)
for(Integer i = 0; i < sent.size(); ++i)
posWords.add(sent.getTWordAt(i));
/*
* annotate sentences
*/
if(annotate_sentences) {
Integer offset = 0;
String[] sentences = senSegmentedText.split("\n");
for(String sentence : sentences) {
Sentence s = new Sentence(jcas);
sentence = sentence.trim();
Integer sentOffset = origText.indexOf(sentence, offset);
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
sentence = sentence.substring(0, sentence.length() - 1).trim();
sentOffset = origText.indexOf(sentence, offset);
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
System.err.println("Sentence \"" + sentence + "\" was not found in the original text.");
}
}
}
}
/*
* annotate tokens
*/
if(annotate_tokens) {
Integer offset = 0;
String[] tokens = postProcessedString.split("\\s+");
for(Integer i = 0; i < tokens.length; ++i) {
final String token = tokens[i].trim();
String thisPosTag = null;
if(posWords.size() >= i + 1) {
if(!token.equals(posWords.get(i).getWord())) {
System.err.println("Couldn't match token: " + token
+ " to expected word/tag combination " + posWords.get(i).getWord());
} else {
thisPosTag = posWords.get(i).getTag();
}
}
Integer tokenOffset = origText.indexOf(token, offset);
Token t = new Token(jcas);
if(tokenOffset >= 0 ) {
/*
* first, try to find the string in the form the tokenizer returned it
*/
t.setBegin(tokenOffset);
offset = tokenOffset + token.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
} else {
/*
* straight up token not found.
* assume that it is a compound word (e.g. some_thing)
* and try to find it in the original text again; first using
* a "_" -> " " replacement, then try just removing the underscore.
*/
String underscoreToSpaceToken = token.replaceAll("_", " ");
Integer spaceOffset = origText.indexOf(underscoreToSpaceToken, offset);
String underscoreRemovedToken = token.replaceAll("_", "");
Integer removedOffset = origText.indexOf(underscoreRemovedToken, offset);
/*
* offsets are the same. can't think of a good example where this could
* possibly happen, but maybe there is one.
*/
if(removedOffset >= 0 && spaceOffset >= 0) {
if(removedOffset >= spaceOffset) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
} else {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
t.addToIndexes();
}
}
/*
* underscore removed was found, underscore replaced to space was not
*/
else if(removedOffset >= 0 && spaceOffset == -1) {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* underscore removed was not found, underscore replaced was found
*/
else if(removedOffset == -1 && spaceOffset >= 0) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);
sanitizeToken(t, jcas);
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* there is no hope of finding this token
*/
else {
System.err.println("Token \"" + token + "\" was not found in the original text.");
}
}
}
}
}
private Boolean sanitizeToken(Token t, JCas jcas) {
Boolean workDone = false;
// check the beginning of the token for punctuation and split off into a new token
if(t.getCoveredText().matches("^\\p{Punct}.*") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(0);
t.setBegin(t.getBegin() + 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
puncToken.setBegin(t.getBegin() - 1);
puncToken.setEnd(t.getBegin());
// check if we want to annotate pos or the token itself
if(annotate_partofspeech)
puncToken.setPos(""+thisChar);
if(annotate_tokens)
puncToken.addToIndexes();
workDone = true;
}
// check the end of the token for punctuation and split off into a new token
if(t.getCoveredText().matches(".*\\p{Punct}$") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(t.getEnd() - t.getBegin() - 1);
t.setEnd(t.getEnd() - 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
puncToken.setBegin(t.getEnd());
puncToken.setEnd(t.getEnd() + 1);
// check if we want to annotate pos or the token itself
if(annotate_partofspeech)
puncToken.setPos(""+thisChar);
if(annotate_tokens)
puncToken.addToIndexes();
workDone = true;
}
// get into a recursion to sanitize tokens as long as there are stray ones
if(workDone) {
workDone = sanitizeToken(t, jcas);
}
return workDone;
}
/**
* Taken from the JVnTextPro package and adapted to not output a string
* @param instr input string to be tagged
* @return tagged text
*/
public List jvnTagging(String instr) {
List data = reader.readString(instr);
for (int i = 0; i < data.size(); ++i) {
jvntextpro.data.Sentence sent = data.get(i);
for (int j = 0; j < sent.size(); ++j) {
String [] cps = dataTagger.getContext(sent, j);
String label = classifier.classify(cps);
if (label.equalsIgnoreCase("Mrk")) {
if (StringUtils.isPunc(sent.getWordAt(j)))
label = sent.getWordAt(j);
else label = "X";
}
sent.getTWordAt(j).setTag(label);
}
}
return data;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy