jvnpostag.POSContextGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

package jvnpostag;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import jvntextpro.data.ContextGenerator;
import jvntextpro.data.Sentence;
import jvntextpro.util.StringUtils;
import jvntextpro.util.VnSyllParser;

public class POSContextGenerator extends ContextGenerator {
	
	//----------------------------------------------
	// Member variables
	//----------------------------------------------
	private static final String DEFAULT_E_DICT = "jvnpostag/ComputerDict.txt";
	Map word2dictags = new HashMap();
	Vector cpnames;
	Vector> paras;
	
	//----------------------------------------------
	// Constructor and Override methods
	//----------------------------------------------
	public POSContextGenerator(String featureTemplateFile){
		readDict();
		readFeatureTemplate(featureTemplateFile);
	}
	
	@Override
	public String[] getContext(Sentence sent, int pos) {
		// TODO Auto-generated method stub
		List cps = new ArrayList();
		
		for (int it = 0; it < cpnames.size(); ++it){			
			String cp = cpnames.get(it);
			Vector paras = this.paras.get(it);
			String cpvalue = "";
			if (cp.equals("w")){
				cpvalue = w(sent,pos,paras.get(0));
			}
			else if (cp.equals("wj")){
				cpvalue = wj(sent,pos,paras.get(0), paras.get(1));
			}
			else if (cp.equals("prf")){
				cpvalue = prf(sent,pos, paras.get(0));
			}
			else if (cp.equals("sff")){
				cpvalue = sff(sent,pos,paras.get(0));
			}
			else if (cp.equals("an")){
				cpvalue = an(sent,pos, paras.get(0));				
			}
			else if (cp.equals("hn")){
				cpvalue = hn(sent, pos, paras.get(0));
			}
			else if (cp.equals("hyph")){
				cpvalue = hyph(sent, pos, paras.get(0));
			}
			else if (cp.equals("slash")){
				cpvalue = slash(sent, pos, paras.get(0));
			}
			else if (cp.equals("com")){
				cpvalue = com(sent, pos, paras.get(0));
			}
			else if (cp.equals("ac")){
				cpvalue = ac(sent, pos, paras.get(0));				
			}
			else if (cp.equals("ic")){
				cpvalue = ic(sent, pos, paras.get(0));
			}
			else if (cp.equals("mk")){
				cpvalue = mk(sent, pos, paras.get(0));
			}
			else if (cp.equals("dict")){
				cps.add(dict(sent, pos, paras.get(0)));
			}
			else if (cp.equals("rr")){
				cpvalue = rr(sent, pos, paras.get(0));
			}
			if (!cpvalue.equals("")) cps.add(cpvalue);
		}
		String [] ret = new String[cps.size()];		
		return cps.toArray(ret);
	}
	
	//----------------------------------------------
	// IO methods
	//----------------------------------------------
	public boolean readDict(){
		try {
			URL url = POSContextGenerator.class.getClassLoader().getResource(DEFAULT_E_DICT);
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					url.openStream(), "UTF-8"));
			word2dictags.clear();
						
			String line, temp = null;
			while ((line = reader.readLine()) != null ){
				String [] tokens = line.split("\t");
		
				String word, tag;
				if (tokens == null)
					continue;
				
				if (tokens.length != 2){
					continue;					
				}
				else if (tokens.length == 2){
					if (tokens[0].equals("")){
						if (temp == null)
							continue;
						else {
							//System.out.println(temp);
							word = temp;
							tag = tokens[1];
						}
					}
					else{ 
						word = tokens[0].trim().toLowerCase();
						tag = tokens[1].trim();
						temp = word;
					}
				}
				else continue;
				
				word = word.replace(" ","_");
				//System.out.println(word);
				List dictags = (List) word2dictags.get(word);
				if (dictags == null){
					dictags = new ArrayList();
				}
				dictags.add(tag);
				word2dictags.put(word, dictags);
			}
			
			reader.close();
			return true;
		}
		
		catch (Exception e){
			System.out.println(e.getMessage());
			e.printStackTrace();
			return false;
		}
	}
	
	public boolean readFeatureTemplate(String file){
		try{
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
			DocumentBuilder builder = factory.newDocumentBuilder();
			
			InputStream stream = new FileInputStream(file);
			Document doc  = builder.parse(stream);
			
			Element root = doc.getDocumentElement();
			NodeList childrent = root.getChildNodes();
			cpnames = new Vector();
			paras = new Vector>();
			
			for (int i = 0; i < childrent.getLength(); i++)
				if (childrent.item(i) instanceof Element) {
					Element child = (Element) childrent.item(i);
					String value = child.getAttribute("value");
					
					//parse the value and get the parameters
					String [] parastr = value.split(":");
					Vector para = new Vector();
					for (int j = 1; j < parastr.length; ++j){
						para.add(Integer.parseInt(parastr[j]));
					}
					
					cpnames.add(parastr[0]);
					paras.add(para);
				}
			
		}
		catch (Exception e){
			System.out.println(e.getMessage());
			e.printStackTrace();
			return false;
		}
		return true;
	}
	
	//-----------------------------------------------
	// feature generating methods
	//-----------------------------------------------
		
	private String w(Sentence sent, int pos, int i){
		String cp = "w:" + Integer.toString(i) + ":"; 
		//if (pos + i == -1)
		//	cp += "BS";
		//else if (pos + i == sent.size())
		//	cp += "ES";
		if (0 <= (pos + i) && (pos + i) < sent.size())
			cp += sent.getWordAt(pos + i);
		else cp="";
		
		return cp;
	}
	
	private String wj(Sentence sent, int pos, int i, int j){
		String cp = "wj:" + Integer.toString(i) + ":" + Integer.toString(j) + ":";
		if ((pos + i) >= sent.size() || (pos + i) < 0 || 
				(pos + j) < 0 || (pos + j) >= sent.size())
			cp = "";
		else {
			cp += sent.getWordAt(pos + i) + ":" + sent.getWordAt(pos + j);
		}
		return cp;
	}
	
	private String prf(Sentence sent, int pos, int i){
		 String cp;
		 if (0 <= (pos + i) && (pos + i) < sent.size()){
			 cp = "prf:" + Integer.toString(i) + ":";
			 
			 String word = sent.getWordAt(pos + i);
			 String [] sylls = word.split("_");
			 if (sylls.length >= 2){
				 cp += sylls[0];
			 }
			 else cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String sff(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			 cp = "sff:" + Integer.toString(i) + ":";
			 
			 String word = sent.getWordAt(pos + i);
			 String [] sylls = word.split("_");
			 if (sylls.length >= 2){
				 cp += sylls[sylls.length - 1];
			 }
			 else cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String an(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			 cp = "an:" + Integer.toString(i);
			 
			String word = sent.getWordAt(pos + i);
			if (!StringUtils.isAllNumber(word))
				cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String hn(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "hn:" + Integer.toString(i);
			 
			String word = sent.getWordAt(pos + i);
			if (!StringUtils.containNumber(word))
				cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String hyph(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "hyph:" + Integer.toString(i);
			 
			String word = sent.getWordAt(pos + i);
			if (!word.contains("-"))
				cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String slash(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "hyph:" + Integer.toString(i);
			 
			String word = sent.getWordAt(pos + i);
			if (!word.contains("/"))
				cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String com(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "hyph:" + Integer.toString(i);
			 
			String word = sent.getWordAt(pos + i);
			if (!word.contains(":"))
				cp = "";
		 }
		 else cp = "";
		 
		 return cp;
	}
	
	private String ac(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "ac:" + Integer.toString(i);
			
			String word = sent.getWordAt(pos + i);
			boolean isAllCap = true;
			
			for (int j = 0 ; j < word.length(); ++j){
				if (word.charAt(j) == '_' || word.charAt(j) == '.') continue;
				
				if (!Character.isUpperCase(word.charAt(j))){
					isAllCap = false;
					break;
				}
			}
			
			if (!isAllCap)
				cp = "";
		}
		else cp = "";
		return cp;
	}
	
	private String ic(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "ic:" + Integer.toString(i);
			
			String word = sent.getWordAt(pos + i);
			if (!StringUtils.isFirstCap(word))
				cp = "";
		}
		else cp = "";
		
		return cp;
	}
	
	private String mk(Sentence sent, int pos, int i){
		String cp;
		if (0 <= (pos + i) && (pos + i) < sent.size()){
			cp = "mk:" + Integer.toString(i);
			String word = sent.getWordAt(pos + i);
			if (!StringUtils.isPunc(word))
				cp = "";
		}
		else cp = "";
		
		return cp;
	} 
	
	private String dict(Sentence sent, int pos, int i){
		String cp = "";
		
		if (0 <= (pos + i) && (pos + i) < sent.size()){			
			String word = sent.getWordAt(pos + i);
			if (word2dictags.containsKey(word)){
				List tags = (List) word2dictags.get(word);
				
				for (int j = 0; j < tags.size(); ++j){
					cp += "dict:" + Integer.toString(i) + ":" + tags.get(j) + " ";
				}
			}
		}
		
		return cp.trim();
	}
	
	private String rr(Sentence sent, int pos, int i){
		String cp = "";
		
		if (0 <= (pos + i) && (pos + i) < sent.size()){			
			String word = sent.getWordAt(pos + i);
			String [] sylls = word.split("_");
			
			if (sylls.length == 2){ //consider 2-syllable words
				VnSyllParser parser1 = new VnSyllParser(sylls[0]);
				VnSyllParser parser2 = new VnSyllParser(sylls[1]);
				
				if (parser1.isValidVnSyllable() && parser2.isValidVnSyllable()){
					if (parser1.getNonToneSyll().equalsIgnoreCase(parser2.getNonToneSyll())){
						cp += "fr:" + Integer.toString(i) + " ";
					}
					else if (parser1.getRhyme().equalsIgnoreCase(parser2.getRhyme())){
						cp += "pr:" + Integer.toString(i) + " ";
					}
				}
			}
		}
		
		
		return cp.trim();
	}
}