org.whitesource.jninka.SentenceSplitter Maven / Gradle / Ivy

Go to download
/**
 *  Copyright (C) 2012 White Source (www.whitesourcesoftware.com)
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 *
 *  This patch is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this patch.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.whitesource.jninka;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Stage processor that turns the stage's input lines into a list of cleaned-up sentences.
 *
 * <p>{@link #process()} joins the input lines, strips comment decoration and pretty-printing
 * characters ({@code preProcessText}), splits every resulting paragraph at sentence punctuation
 * ({@code splitText}) and normalises each sentence ({@code cleanSentence}) before storing the
 * result as the stage output.</p>
 *
 * <p>Abbreviations (one per line) are read when {@link #setAbbrvFile(InputStream)} is called and
 * are used to avoid splitting after a period that merely terminates an abbreviation.</p>
 *
 * @author Rami.Sass
 */
public class SentenceSplitter extends StageProcessor {

	/* --- Static members --- */

	/**
	 * Group 1: a (possibly empty) run of non-separator characters; group 2: one separator
	 * ({@code . ! ? :} or newline); group 3: the character following the separator, captured by
	 * a look-ahead so it is not consumed (empty when there is none).
	 */
	private static final String SEPARATOR_BREAK_REGEX = "^([^\\.\\!\\?\\:\n]*)([\\.\\!\\?\\:\n])(?=(.?))";
	private static final Pattern SEPARATOR_BREAK_PATTERN = Pattern.compile(SEPARATOR_BREAK_REGEX, Pattern.MULTILINE);

	/** Group 2: the last word of a fragment; group 1: the single character in front of it, if any. */
	private static final Pattern LAST_WORD_ABBREVIATION_PATTERN = Pattern.compile("(.?)([^\\p{Punct}\\s]+)$");

	/** One newline-terminated paragraph of the pre-processed text (group 1 excludes the newline). */
	private static final Pattern PARAGRAPH_PATTERN = Pattern.compile("^([^\n]*)\n", Pattern.MULTILINE);

	private static final Logger logger = Logger.getLogger(SentenceSplitter.class.getCanonicalName());

	/* --- Members --- */

	private InputStream abbrvFile;

	private InputStream dictionary;

	/** Lower-cased abbreviations, one entry per line of the abbreviation stream. */
	private List<String> abbreviations = new ArrayList<String>();

	/* --- Concrete implementation methods --- */

	/**
	 * Splits the stage input into sentences and stores them as the stage output.
	 *
	 * <p>For every paragraph the number of alphabetic characters before and after splitting is
	 * compared; a mismatch is logged and makes the method return {@code false}, but processing
	 * continues and the output is still set.</p>
	 *
	 * @return {@code true} on success; {@code false} if a character-count mismatch was detected
	 *         or any exception was thrown (the exception is logged, not propagated)
	 */
	public boolean process() {
		boolean result = true;
		try {
			List<String> outputInfo = new ArrayList<String>();

			String text = JNinkaUtils.joinArrayList(getInputInfo(), "\n");
			// append a "\n" just in case
			text += "\n";

			text = this.preProcessText(text);

			Matcher m = PARAGRAPH_PATTERN.matcher(text);
			while (m.find()) {
				// NOTE(review): the escape/unescape pair is kept so the emitted sentences stay
				// unchanged; it presumably dates from when the paragraph was also fed through
				// Matcher.appendReplacement into a buffer that was never read (removed here).
				String curr = JNinkaRegullarExpression.escapeForRegex(m.group(1));

				// let us count the number of alphabetic chars to check if we
				// are skipping anything we should not
				int count = JNinkaUtils.alphabeticCount(curr);

				List<String> sentences = this.splitText(curr);

				int count2 = 0;
				for (String sentence : sentences) {
					count2 += JNinkaUtils.alphabeticCount(sentence);
					String cleaned = cleanSentence(sentence);
					outputInfo.add(JNinkaRegullarExpression.unescapeAfterRegex(cleaned));
				}

				if (count != count2) {
					logger.severe("[" + curr + "]");
					for (String sentence : sentences) {
						logger.severe(cleanSentence(sentence));
					}
					result = false;
					logger.severe("Number of printable chars does not match!  [" + count + "][" + count2 + "]");
				}
			}

			this.setOutputInfo(outputInfo);

		} catch (Exception e) {
			result = false;
			logger.log(Level.SEVERE, e.getMessage(), e);
		}

		return result;
	}

	/* --- Private methods --- */

	/**
	 * Normalises a single sentence: strips leading bullet markers, trims and collapses
	 * whitespace, and removes quotes and colons.
	 *
	 * <p>All replacements go through {@code JNinkaRegullarExpression.applyReplace}, a project
	 * helper that is assumed to perform a regular-expression replace.</p>
	 *
	 * @param text the raw sentence
	 * @return the normalised sentence
	 * @throws IllegalArgumentException if the result is exactly a single newline
	 */
	private String cleanSentence(String text) {
		// strip leading bullets: a lone "o " or a number followed by '-' or ')'
		text = JNinkaRegullarExpression.applyReplace(text, "^o ", "");
		text = JNinkaRegullarExpression.applyReplace(text, "^\\s*[0-9]+\\s*[\\-\\)]", "");
		// trim leading and trailing blanks
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]+", "");
		text = JNinkaRegullarExpression.applyReplace(text, "[ \t]+$", "");
		// remove a leading marker ('-', '.', whitespace or '*') that is followed by spaces
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]*[\\-\\.\\s*] +", "");
		// collapse whitespace runs into a single space
		text = JNinkaRegullarExpression.applyReplace(text, "\\s+", " ");
		// drop quote characters
		text = JNinkaRegullarExpression.applyReplace(text, "['\"`]+", "");
		// drop colons and reduce a run of trailing periods to a single one
		text = JNinkaRegullarExpression.applyReplace(text, ":", "");
		text = JNinkaRegullarExpression.applyReplace(text, "\\.+$", ".");
		if ("\n".equals(text)) {
			throw new IllegalArgumentException("text cannot be \\n");
		}
		return text;
	}

	/**
	 * Splits a paragraph into sentences.
	 *
	 * <p>The text is consumed fragment by fragment, each fragment ending in one of the separators
	 * {@code . ! ? :} or a newline. A separator only ends a sentence when it is followed by a
	 * space or a tab; {@code : ? !} then always end the sentence, while a period ends it unless
	 * the preceding word looks like an abbreviation. Whatever remains after the last separator is
	 * appended to the final sentence, so the returned pieces always concatenate to the input.</p>
	 *
	 * @param text the paragraph to split
	 * @return the sentences in order of appearance; never empty
	 * @throws IllegalStateException if a newline separator is directly followed by a space or tab,
	 *         a combination this splitter does not handle
	 * @throws Exception declared for the project regex helpers used here
	 */
	private List<String> splitText(String text) throws Exception {
		List<String> result = new ArrayList<String>();
		StringBuilder currentSentence = new StringBuilder();

		Matcher matcher = SEPARATOR_BREAK_PATTERN.matcher(text);
		while (matcher.find()) {
			String sentenceMatch = JNinkaRegullarExpression.getGroupValue(matcher, 1);
			String punctuation = JNinkaRegullarExpression.getGroupValue(matcher, 2);
			String after = JNinkaRegullarExpression.getGroupValue(matcher, 3);
			String sentence = sentenceMatch + punctuation;

			// The pattern is anchored with '^', so it can only match at the start of its input
			// (or right after a line terminator). Drop the consumed prefix and re-target the
			// matcher at the remainder; continuing find() on the original input would stop
			// after the first separator of the paragraph.
			text = text.substring(matcher.end());
			matcher.reset(text);

			// if the next character is not a space, then we are not at the end of a sentence
			if (!" ".equals(after) && !"\t".equals(after)) {
				currentSentence.append(sentence);
				continue;
			}

			// at this point we know that there is a space after the separator
			boolean endsSentence;
			if (":".equals(punctuation) || "?".equals(punctuation) || "!".equals(punctuation)) {
				endsSentence = true;
			} else if (".".equals(punctuation)) {
				// TODO: simple heuristic... check that the next words are not the beginning
				// of a sentence in our library
				endsSentence = !endsWithAbbreviation(sentenceMatch);
			} else {
				logger.severe("We have not dealt with this case");
				throw new IllegalStateException();
			}

			currentSentence.append(sentence);
			if (endsSentence) {
				result.add(currentSentence.toString());
				currentSentence.setLength(0);
			}
		}

		result.add(currentSentence.append(text).toString());

		return result;
	}

	/**
	 * Decides whether the word right before a period is an abbreviation, in which case the
	 * period does not end the sentence.
	 *
	 * @param sentenceMatch the fragment preceding the period (without the period itself)
	 * @return {@code true} if the last word is a single capital letter, a single {@code e} or
	 *         {@code i}, or a known abbreviation preceded by a space or by nothing
	 * @throws Exception declared for the project regex helpers used here
	 */
	private boolean endsWithAbbreviation(String sentenceMatch) throws Exception {
		// this expression might have to be updated to take care of special characters in names :(
		Matcher lastWordMatcher = LAST_WORD_ABBREVIATION_PATTERN.matcher(sentenceMatch);
		// find(), not matches(): the pattern is only anchored at the end and must locate the
		// last word inside a longer fragment.
		if (!lastWordMatcher.find()) {
			return false;
		}
		String before = JNinkaRegullarExpression.getGroupValue(lastWordMatcher, 1);
		String lastWord = JNinkaRegullarExpression.getGroupValue(lastWordMatcher, 2);

		if (lastWord.length() == 1) {
			// single character abbreviations are special...
			// we will assume they never split the sentence if they are capitalized.
			char c = lastWord.charAt(0);
			if ((c >= 'A') && (c <= 'Z')) {
				return true;
			}
			if (logger.isLoggable(Level.FINER)) {
				logger.finer("last word an abbrev " + sentenceMatch + " lastword [" + lastWord + "] before [" + before + "]");
			}

			// but some are lowercase!
			if ((c == 'e') || (c == 'i')) {
				return true;
			}
			if (logger.isLoggable(Level.FINER)) {
				logger.finer("2 last word an abbrev " + sentenceMatch + " lastword [" + lastWord + "] before [" + before + "]");
			}
			return false;
		}

		// only accept abbreviations if the previous char to the abbrev is space or
		// is empty (beginning of line). This avoids things like .c
		boolean standsAlone = "".equals(before) || " ".equals(before);
		// Locale.ENGLISH keeps the lookup independent of the JVM default locale.
		return standsAlone && this.abbreviations.contains(lastWord.toLowerCase(Locale.ENGLISH));
	}

	/**
	 * Placeholder for dictionary loading, which is currently disabled: the dictionary stream is
	 * stored by {@link #setDictionary(InputStream)} but never read. The previous (commented-out)
	 * implementation collected every line starting with an upper-case letter into a table of
	 * common terms.
	 */
	private void loadDictionary() {
		// intentionally empty
	}

	/**
	 * Reads the abbreviation stream line by line and stores each line, lower-cased, in
	 * {@code abbreviations}, replacing any previously loaded list. An {@link IOException} is
	 * logged and leaves the list with whatever was read up to that point.
	 */
	private void loadAbbreviations() {
		abbreviations = new ArrayList<String>();

		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(this.getAbbrvFile()));
			String line;
			while ((line = reader.readLine()) != null) {
				// lower-case so lookups can be case-insensitive
				abbreviations.add(line.toLowerCase(Locale.ENGLISH));
			}
		} catch (IOException e) {
			logger.log(Level.SEVERE, "cannot read abbreviations file " + this.getAbbrvFile() + ": " + e.getMessage(), e);
		} finally {
			JNinkaUtils.close(reader, logger);
		}
	}

	/**
	 * Removes comment decoration and pretty-printing characters and re-encodes line structure.
	 *
	 * <p>In the returned text a tab stands for a line break that was not followed by another
	 * line break, and a newline stands for a run of line breaks; one extra newline is appended
	 * at the end. The order of the replacements matters and must not be changed.</p>
	 *
	 * @param text the joined input lines
	 * @return the pre-processed text
	 */
	private String preProcessText(String text) {
		// horizontal rules made of 3 or more '-', '=', ':' or '*' become a single space
		text = JNinkaRegullarExpression.applyReplace(text, "\\+?\\-{3,1000}\\+?", " ", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "={3,1000}", " ", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, ":{3,1000}", " ", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "\\*{3,1000}", " ", Pattern.MULTILINE);

		// some characters are used for prettyprinting but never appear in sentences
		text = JNinkaRegullarExpression.applyReplace(text, "\\|+", " ", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "\\\\+", " ", Pattern.MULTILINE);

		// let us deal with /* and */ before we do anything; "//" is dropped unless it
		// follows a ':' (as in "http://")
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]*/\\*", "", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "\\*\\/[ \t]*$", "", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "([^:])//", "$1", Pattern.MULTILINE);

		// replace \r\n with \n only
		text = JNinkaRegullarExpression.applyReplace(text, "\r\n", "\n");

		// now, try to replace the leading/ending character of each line #/-, at most 3 heading
		// blanks and each repeated as many times as necessary
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]{0,3}[\\*\\#\\/\\;]+", "", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]{0,3}[\\-]+", "", Pattern.MULTILINE);

		text = JNinkaRegullarExpression.applyReplace(text, "[\\*\\#\\/]+[ \t]{0,3}$", "", Pattern.MULTILINE);
		text = JNinkaRegullarExpression.applyReplace(text, "[\\-]+[ \t]{0,3}$", "", Pattern.MULTILINE);
		// second pass over the leading decoration, after the leading '-' runs were removed
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]{0,3}[\\*\\#\\/\\;]+", "", Pattern.MULTILINE);

		// now, try to replace the ending character of each line if it is * or #
		text = JNinkaRegullarExpression.applyReplace(text, "[\\*\\#]+$", "", Pattern.MULTILINE);

		// at this point we have lines with nothing but spaces, let us get rid of them
		text = JNinkaRegullarExpression.applyReplace(text, "^[ \t]+$", "\n", Pattern.MULTILINE);

		// let us try the following trick
		// We first get rid of \t and replace it with ' '
		// we then use \t as a "single line separator" and \n as multiple line.
		// so we can match each with a single character.
		text = JNinkaRegullarExpression.applyReplace(text, "\t", " ");

		// a newline not followed by another newline becomes a tab
		// (flagged by the original author: "MIKL - some problem!!!")
		text = JNinkaRegullarExpression.applyReplace(text, "\n(?!\n)", "\t");
		text = JNinkaRegullarExpression.applyReplace(text, "\n\n+", "\n");

		text += "\n";

		return text;
	}

	/* --- Getters / Setters --- */

	/**
	 * Stores the dictionary stream and triggers dictionary loading (currently a no-op).
	 *
	 * @param lDictionary stream expected to hold one word per line, i.e. in the same form as
	 *                    /usr/dict/words, where upper-case words are assumed to be names
	 */
	public void setDictionary(InputStream lDictionary) {
		dictionary = lDictionary;
		loadDictionary();
	}

	/**
	 * Stores the abbreviation stream and immediately reads all abbreviations from it.
	 *
	 * @param lAbbrvFile stream holding one abbreviation per line
	 */
	public void setAbbrvFile(InputStream lAbbrvFile) {
		abbrvFile = lAbbrvFile;
		loadAbbreviations();
	}

	/** @return the dictionary stream passed to {@link #setDictionary(InputStream)} */
	public InputStream getDictionary() {
		return dictionary;
	}

	/** @return the abbreviation stream passed to {@link #setAbbrvFile(InputStream)} */
	public InputStream getAbbrvFile() {
		return abbrvFile;
	}

}