All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jvntextpro.conversion.CompositeUnicode2Unicode Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvntextpro.conversion;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

// TODO: Auto-generated Javadoc
/**
 * This class provides functionality to convert from
 * a composite unicode string in vietnamese to a unicode string.
 *
 * @author TuNC
 */
public class CompositeUnicode2Unicode {
	
	/** The cps uni2 uni. */
	Map cpsUni2Uni;
	
	/** The Constant DEFAULT_MAP_RESOURCE. */
	private static final String DEFAULT_MAP_RESOURCE = "jvntextpro/conversion/Composite2Unicode.txt";
	
	//---------------------------------------------------------------
	//Constructor
	//----------------------------------------------------------------
	
	/**
	 * Instantiates a new composite unicode2 unicode.
	 */
	public CompositeUnicode2Unicode(){	
		try{
			cpsUni2Uni = new HashMap();
			
			URL url = CompositeUnicode2Unicode.class.getClassLoader().getResource(DEFAULT_MAP_RESOURCE);
			
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					url.openStream(), "UTF-8"));
			
			String line;
			while ((line = reader.readLine()) != null){
				String [] onemap = line.split("\t");
				
				if (onemap.length != 2) continue;
				cpsUni2Uni.put(onemap[0], onemap[1]);
			}
			
			reader.close();
		}
		catch (Exception e){
			System.err.println("Loading composite to unicode map fail: " + e.getMessage());
			cpsUni2Uni = null;
		}
	}
	
	//---------------------------------------------------------------
	//Public method
	//----------------------------------------------------------------

	/**
	 * Convert a vietnamese string with composite unicode encoding to unicode encoding.
	 *
	 * @param text string in vietnamese with composite unicode encoding
	 * @return string with unicode encoding
	 */
	public String convert(String text){
		String ret = text;
		
		if (cpsUni2Uni == null) return ret;
		
		Iterator it = cpsUni2Uni.keySet().iterator();
		while(it.hasNext()){
			String cpsChar = it.next();
			ret = ret.replaceAll(cpsChar, cpsUni2Uni.get(cpsChar));
		}
		
		return ret;
		
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy