// jvntextpro.JVnTextPro Maven / Gradle / Ivy
//
// Go to download
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvntextpro;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Locale;

import jvnpostag.MaxentTagger;
import jvnsegmenter.CRFSegmenter;
import jvnsensegmenter.JVnSenSegmenter;
import jvntextpro.conversion.CompositeUnicode2Unicode;
import jvntextpro.util.VnSyllParser;
import jvntokenizer.PennTokenizer;

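/**
 * Entry point that chains the text-processing stages of this toolkit:
 * sentence segmentation, tokenization, word segmentation, post-processing
 * of segmented words and part-of-speech tagging.
 *
 * Every model-backed stage is optional: a stage whose init method has not
 * been called successfully passes its input through unchanged.
 */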
public class JVnTextPro {

	//==============================================
	// Instance Variables
	//==============================================
	/** Sentence segmenter; stays null until {@link #initSenSegmenter(String)} succeeds. */
	JVnSenSegmenter vnSenSegmenter = null;
	
	/** Word segmenter; stays null until {@link #initSegmenter(String)} succeeds. */
	CRFSegmenter vnSegmenter = null;
	
	/** POS tagger; stays null until {@link #initPosTagger(String)} succeeds. */
	MaxentTagger vnPosTagger = null;
	
	/** Whether {@link #senTokenize(String)} runs the tokenizer; enabled by {@link #initSenTokenization()}. */
	boolean isTokenization = false;
	
	/** Converter applied to the raw text as the first step of {@link #process(String)}. */
	public CompositeUnicode2Unicode convertor;
	
	//==============================================
	// Constructors
	//==============================================
	
	/**
	 * Creates a processor with only the unicode converter set up.
	 * All other stages are disabled until their init methods are called.
	 */
	public JVnTextPro(){
		convertor = new CompositeUnicode2Unicode();	
	}
	
	//==============================================
	// initial methods
	//==============================================	
	/**
	 * Initialize the sentence segmentation for Vietnamese.
	 * On failure the segmenter is reset to null, so {@link #senSegment(String)}
	 * falls back to returning its input trimmed.
	 *
	 * @param modelDir the model dir
	 * @return true if the initialization is successful and false otherwise
	 */
	public boolean initSenSegmenter(String modelDir){
		System.out.println("Initilize JVnSenSegmenter ...");
		
		vnSenSegmenter = new JVnSenSegmenter();		
		if (!vnSenSegmenter.init(modelDir)){
			System.out.println("Error while initilizing JVnSenSegmenter");
			vnSenSegmenter = null;
			return false;
		}
			
		return true;
	}
	
	/**
	 * Initialize the word segmentation for Vietnamese.
	 * On failure the segmenter is reset to null, so {@link #wordSegment(String)}
	 * returns its input unchanged.
	 *
	 * @param modelDir the model dir
	 * @return true if the initialization is successful and false otherwise
	 */
	public boolean initSegmenter(String modelDir){		
		System.out.println("Initilize JVnSegmenter ...");
		System.out.println(modelDir);
		vnSegmenter = new CRFSegmenter();
		
		try{
			vnSegmenter.init(modelDir);
		}
		catch (Exception e){
			System.out.println("Error while initializing JVnSegmenter");
			// keep the cause visible instead of discarding it
			e.printStackTrace();
			vnSegmenter = null;
			return false;
		}
		
		return true;		
	}
	
	/**
	 * Initialize the pos tagger for Vietnamese.
	 * On failure the tagger is reset to null, so {@link #posTagging(String)}
	 * returns its input unchanged.
	 *
	 * @param modelDir the model dir
	 * @return true if the initialization is successful and false otherwise
	 */
	public boolean initPosTagger(String modelDir){
		try{
			this.vnPosTagger = new MaxentTagger(modelDir);
		}
		catch (Exception e){
			System.out.println("Error while initializing POS TAgger");
			// keep the cause visible instead of discarding it
			e.printStackTrace();
			vnPosTagger = null;
			return false;
		}
		return true;
	}
	
	/**
	 * Enable the tokenization stage used by {@link #senTokenize(String)}.
	 */
	public void initSenTokenization(){
		isTokenization = true;
	}
	//==============================================
	// public methods
	//==============================================

	/**
	 * Process the text and return the processed text.
	 * Pipeline, in order: unicode conversion, sentence segmentation,
	 * tokenization, word segmentation, post-processing of segmented words,
	 * part of speech tagging. Stages that were not initialized are skipped.
	 *
	 * @param text text to be processed
	 * @return processed text
	 */
	public String process(String text){
		String ret = text;
		
		//Pipeline
		ret = convertor.convert(ret);
		ret = senSegment(ret);
		ret = senTokenize(ret);
		ret = wordSegment(ret);
		ret = postProcessing(ret);
		ret = posTagging(ret);
		return ret;
	}	
	
	/**
	 * Read a file as UTF-8 and run it through {@link #process(String)}.
	 * Lines are re-joined with '\n', one after every line read.
	 *
	 * @param infile data file
	 * @return processed text, or an empty string if reading or processing
	 *         throws (the error is printed)
	 */
	public String process(File infile){		
		try {
			StringBuilder data = new StringBuilder();
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(infile), "UTF-8"));
			
			try {
				String line;
				while((line = reader.readLine()) != null){
					data.append(line).append('\n');
				}
			}
			finally {
				// release the file handle even when readLine() fails
				reader.close();
			}
			
			return process(data.toString());
		}
		catch (Exception e){
			System.out.println(e.getMessage());
			e.printStackTrace();
			return "";
		}
	}
	
	/**
	 * Do sentence segmentation.
	 *
	 * @param text text to have sentences segmented
	 * @return the segmented text, trimmed; the trimmed input if the sentence
	 *         segmenter is not initialized
	 */
	public String senSegment(String text){
		String ret = text;
		
		//Segment sentences
		if (vnSenSegmenter != null){
			ret = vnSenSegmenter.senSegment(text);
		}
		
		return ret.trim();
	}
	
	/**
	 * Do sentence tokenization.
	 *
	 * @param text to be tokenized
	 * @return the tokenized text, trimmed; the trimmed input if tokenization
	 *         is not enabled
	 */
	public String senTokenize(String text){
		String ret = text;
		
		if (isTokenization){
			ret = PennTokenizer.tokenize(text);
		}
		
		return ret.trim();
	}
	
	/**
	 * Do word segmentation.
	 *
	 * @param text to be segmented by words
	 * @return text with words segmented, syllables in words are joined by '_';
	 *         the input unchanged if the word segmenter is not initialized
	 */
	public String wordSegment(String text){
		if (vnSegmenter == null) return text;
		return vnSegmenter.segmenting(text);
	}	

	/**
	 * Do pos tagging.
	 *
	 * @param text to be tagged with POS of speech (need to have words segmented)
	 * @return the tagged text; the input unchanged if the tagger is not initialized
	 */
	public String posTagging(String text){
		String ret = text;
		if (vnPosTagger != null){
			ret = vnPosTagger.tagging(text);			
		}
		
		return ret;
	}
	
	/**
	 * Do post processing for word segmentation: break not valid vietnamese words into single syllables.
	 * A word is split on '_'; if any syllable fails the validity check, the
	 * syllables are emitted separated by spaces, otherwise the word is kept
	 * as is. Each output line and the final result are trimmed.
	 *
	 * @param text the word-segmented text, lines separated by '\n'
	 * @return the text with invalid words broken into syllables
	 */
	public String postProcessing(String text){
		StringBuilder result = new StringBuilder();

		for (String line : text.split("\n")){
			StringBuilder rebuiltLine = new StringBuilder();
			
			for (String currentWord : line.split("[ \t]")){
				String [] syllables = currentWord.split("_");			
				
				if (containsInvalidSyllable(syllables)){
					StringBuilder broken = new StringBuilder();
					
					for (String syll : syllables){
						broken.append(syll).append(' ');
					}
					
					rebuiltLine.append(broken.toString().trim()).append(' ');
				}
				else {
					rebuiltLine.append(currentWord).append(' ');
				}
			}
			
			result.append(rebuiltLine.toString().trim()).append('\n');
		}		
		
		return result.toString().trim();		
	}
	
	/**
	 * Tell whether at least one syllable fails the Vietnamese syllable check.
	 *
	 * @param syllables the syllables of one word
	 * @return true as soon as one syllable is not valid, false otherwise
	 */
	private static boolean containsInvalidSyllable(String [] syllables){
		for (String syllable : syllables){
			// Locale.ROOT keeps lower-casing independent of the JVM default
			// locale (tr/az/lt locales map 'I' differently)
			VnSyllParser parser = new VnSyllParser(syllable.toLowerCase(Locale.ROOT));
			
			if (!parser.isValidVnSyllable()){
				return true;
			}
		}
		
		return false;
	}
}