jvnpostag.POSContextGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnpostag;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import jvntextpro.data.ContextGenerator;
import jvntextpro.data.Sentence;
import jvntextpro.util.StringUtils;
import jvntextpro.util.VnSyllParser;
public class POSContextGenerator extends ContextGenerator {
//----------------------------------------------
// Member variables
//----------------------------------------------
private static final String DEFAULT_E_DICT = "jvnpostag/ComputerDict.txt";
Map word2dictags = new HashMap();
Vector cpnames;
Vector> paras;
//----------------------------------------------
// Constructor and Override methods
//----------------------------------------------
public POSContextGenerator(String featureTemplateFile){
readDict();
readFeatureTemplate(featureTemplateFile);
}
@Override
public String[] getContext(Sentence sent, int pos) {
// TODO Auto-generated method stub
List cps = new ArrayList();
for (int it = 0; it < cpnames.size(); ++it){
String cp = cpnames.get(it);
Vector paras = this.paras.get(it);
String cpvalue = "";
if (cp.equals("w")){
cpvalue = w(sent,pos,paras.get(0));
}
else if (cp.equals("wj")){
cpvalue = wj(sent,pos,paras.get(0), paras.get(1));
}
else if (cp.equals("prf")){
cpvalue = prf(sent,pos, paras.get(0));
}
else if (cp.equals("sff")){
cpvalue = sff(sent,pos,paras.get(0));
}
else if (cp.equals("an")){
cpvalue = an(sent,pos, paras.get(0));
}
else if (cp.equals("hn")){
cpvalue = hn(sent, pos, paras.get(0));
}
else if (cp.equals("hyph")){
cpvalue = hyph(sent, pos, paras.get(0));
}
else if (cp.equals("slash")){
cpvalue = slash(sent, pos, paras.get(0));
}
else if (cp.equals("com")){
cpvalue = com(sent, pos, paras.get(0));
}
else if (cp.equals("ac")){
cpvalue = ac(sent, pos, paras.get(0));
}
else if (cp.equals("ic")){
cpvalue = ic(sent, pos, paras.get(0));
}
else if (cp.equals("mk")){
cpvalue = mk(sent, pos, paras.get(0));
}
else if (cp.equals("dict")){
cps.add(dict(sent, pos, paras.get(0)));
}
else if (cp.equals("rr")){
cpvalue = rr(sent, pos, paras.get(0));
}
if (!cpvalue.equals("")) cps.add(cpvalue);
}
String [] ret = new String[cps.size()];
return cps.toArray(ret);
}
//----------------------------------------------
// IO methods
//----------------------------------------------
public boolean readDict(){
try {
URL url = POSContextGenerator.class.getClassLoader().getResource(DEFAULT_E_DICT);
BufferedReader reader = new BufferedReader(new InputStreamReader(
url.openStream(), "UTF-8"));
word2dictags.clear();
String line, temp = null;
while ((line = reader.readLine()) != null ){
String [] tokens = line.split("\t");
String word, tag;
if (tokens == null)
continue;
if (tokens.length != 2){
continue;
}
else if (tokens.length == 2){
if (tokens[0].equals("")){
if (temp == null)
continue;
else {
//System.out.println(temp);
word = temp;
tag = tokens[1];
}
}
else{
word = tokens[0].trim().toLowerCase();
tag = tokens[1].trim();
temp = word;
}
}
else continue;
word = word.replace(" ","_");
//System.out.println(word);
List dictags = (List) word2dictags.get(word);
if (dictags == null){
dictags = new ArrayList();
}
dictags.add(tag);
word2dictags.put(word, dictags);
}
reader.close();
return true;
}
catch (Exception e){
System.out.println(e.getMessage());
e.printStackTrace();
return false;
}
}
public boolean readFeatureTemplate(String file){
try{
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
InputStream stream = new FileInputStream(file);
Document doc = builder.parse(stream);
Element root = doc.getDocumentElement();
NodeList childrent = root.getChildNodes();
cpnames = new Vector();
paras = new Vector>();
for (int i = 0; i < childrent.getLength(); i++)
if (childrent.item(i) instanceof Element) {
Element child = (Element) childrent.item(i);
String value = child.getAttribute("value");
//parse the value and get the parameters
String [] parastr = value.split(":");
Vector para = new Vector();
for (int j = 1; j < parastr.length; ++j){
para.add(Integer.parseInt(parastr[j]));
}
cpnames.add(parastr[0]);
paras.add(para);
}
}
catch (Exception e){
System.out.println(e.getMessage());
e.printStackTrace();
return false;
}
return true;
}
//-----------------------------------------------
// feature generating methods
//-----------------------------------------------
private String w(Sentence sent, int pos, int i){
String cp = "w:" + Integer.toString(i) + ":";
//if (pos + i == -1)
// cp += "BS";
//else if (pos + i == sent.size())
// cp += "ES";
if (0 <= (pos + i) && (pos + i) < sent.size())
cp += sent.getWordAt(pos + i);
else cp="";
return cp;
}
private String wj(Sentence sent, int pos, int i, int j){
String cp = "wj:" + Integer.toString(i) + ":" + Integer.toString(j) + ":";
if ((pos + i) >= sent.size() || (pos + i) < 0 ||
(pos + j) < 0 || (pos + j) >= sent.size())
cp = "";
else {
cp += sent.getWordAt(pos + i) + ":" + sent.getWordAt(pos + j);
}
return cp;
}
private String prf(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "prf:" + Integer.toString(i) + ":";
String word = sent.getWordAt(pos + i);
String [] sylls = word.split("_");
if (sylls.length >= 2){
cp += sylls[0];
}
else cp = "";
}
else cp = "";
return cp;
}
private String sff(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "sff:" + Integer.toString(i) + ":";
String word = sent.getWordAt(pos + i);
String [] sylls = word.split("_");
if (sylls.length >= 2){
cp += sylls[sylls.length - 1];
}
else cp = "";
}
else cp = "";
return cp;
}
private String an(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "an:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!StringUtils.isAllNumber(word))
cp = "";
}
else cp = "";
return cp;
}
private String hn(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "hn:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!StringUtils.containNumber(word))
cp = "";
}
else cp = "";
return cp;
}
private String hyph(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "hyph:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!word.contains("-"))
cp = "";
}
else cp = "";
return cp;
}
private String slash(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "hyph:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!word.contains("/"))
cp = "";
}
else cp = "";
return cp;
}
private String com(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "hyph:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!word.contains(":"))
cp = "";
}
else cp = "";
return cp;
}
private String ac(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "ac:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
boolean isAllCap = true;
for (int j = 0 ; j < word.length(); ++j){
if (word.charAt(j) == '_' || word.charAt(j) == '.') continue;
if (!Character.isUpperCase(word.charAt(j))){
isAllCap = false;
break;
}
}
if (!isAllCap)
cp = "";
}
else cp = "";
return cp;
}
private String ic(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "ic:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!StringUtils.isFirstCap(word))
cp = "";
}
else cp = "";
return cp;
}
private String mk(Sentence sent, int pos, int i){
String cp;
if (0 <= (pos + i) && (pos + i) < sent.size()){
cp = "mk:" + Integer.toString(i);
String word = sent.getWordAt(pos + i);
if (!StringUtils.isPunc(word))
cp = "";
}
else cp = "";
return cp;
}
private String dict(Sentence sent, int pos, int i){
String cp = "";
if (0 <= (pos + i) && (pos + i) < sent.size()){
String word = sent.getWordAt(pos + i);
if (word2dictags.containsKey(word)){
List tags = (List) word2dictags.get(word);
for (int j = 0; j < tags.size(); ++j){
cp += "dict:" + Integer.toString(i) + ":" + tags.get(j) + " ";
}
}
}
return cp.trim();
}
private String rr(Sentence sent, int pos, int i){
String cp = "";
if (0 <= (pos + i) && (pos + i) < sent.size()){
String word = sent.getWordAt(pos + i);
String [] sylls = word.split("_");
if (sylls.length == 2){ //consider 2-syllable words
VnSyllParser parser1 = new VnSyllParser(sylls[0]);
VnSyllParser parser2 = new VnSyllParser(sylls[1]);
if (parser1.isValidVnSyllable() && parser2.isValidVnSyllable()){
if (parser1.getNonToneSyll().equalsIgnoreCase(parser2.getNonToneSyll())){
cp += "fr:" + Integer.toString(i) + " ";
}
else if (parser1.getRhyme().equalsIgnoreCase(parser2.getRhyme())){
cp += "pr:" + Integer.toString(i) + " ";
}
}
}
}
return cp.trim();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy