All downloads are free. Search and download functionality uses the official Maven repository.

jvnsegmenter.RegexContextGenerator Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvnsegmenter;

import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.w3c.dom.Element;

import jvntextpro.data.Sentence;

// TODO: Auto-generated Javadoc
/**
 * The Class RegexContextGenerator.
 */
public class RegexContextGenerator extends BasicContextGenerator {
	//----------------------------
	//variables
	//----------------------------
	// Regular Expression Pattern string
	/** The str number pattern. */
	private static String strNumberPattern = "[+-]?\\d+([,.]\\d+)*";
	
	/** The str short date pattern. */
	private static String strShortDatePattern = "\\d+[/-:]\\d+";
	
	/** The str long date pattern. */
	private static String strLongDatePattern = "\\d+[/-:]\\d+[/-:]\\d+";
	
	/** The str percentage pattern. */
	private static String strPercentagePattern = strNumberPattern + "%";
	
	/** The str currency pattern. */
	private static String strCurrencyPattern = "\\p{Sc}" + strNumberPattern;
	
	/** The str vi currency pattern. */
	private static String strViCurrencyPattern = strNumberPattern + "[ \t]*\\p{Sc}";

	// Regular Expression Pattern
	/** The ptn number. */
	private static Pattern ptnNumber;
	
	/** The ptn short date. */
	private static Pattern ptnShortDate;
	
	/** The ptn long date. */
	private static Pattern ptnLongDate;
	
	/** The ptn percentage. */
	private static Pattern ptnPercentage;
	
	/** The ptn currency. */
	private static Pattern ptnCurrency;
	
	/** The ptn vi currency. */
	private static Pattern ptnViCurrency;
	
	//----------------------------
	//methods
	//----------------------------
	/**
	 * Instantiates a new regex context generator.
	 *
	 * @param node the node
	 */
	public RegexContextGenerator(Element node){
		readFeatureParameters(node);
	}
	
	/* (non-Javadoc)
	 * @see jvntextpro.data.ContextGenerator#getContext(jvntextpro.data.Sentence, int)
	 */
	@Override
	public String[] getContext(Sentence sent, int pos) {
		// generate context predicates
		List cps = new ArrayList();
		
		
		// get the context information from sequence
		for (int it = 0; it < cpnames.size(); ++it){			
			String cp = cpnames.get(it);
			Vector paras = this.paras.get(it);
			String cpvalue = "";
			
			String suffix = "", regex = "";
			String word = "";
			boolean outOfArrayIndex = false;
			for (int i = 0; i < paras.size(); ++i) {
				if (pos + paras.get(i) < 0 || pos + paras.get(i)>= sent.size()){
					cpvalue = "";
					outOfArrayIndex = true;
					break;
				}
	
				suffix += paras.get(i) + ":";
				word += sent.getWordAt(pos + paras.get(i)) + " ";
			}
			if (outOfArrayIndex) continue;
			
			word = word.trim().toLowerCase();
			suffix = suffix.substring(0, suffix.length() - 1);
			suffix = ":" + suffix;
	
			// Match to a specific pattern
			regex = patternMatching(cp, word);
			if (!regex.equals("")) {
				cpvalue = "re" + suffix + regex;
			}
			
			if (!cpvalue.equals("")) cps.add(cpvalue);
		}
		String [] ret = new String[cps.size()];		
		return cps.toArray(ret);
	}

	//----------------------------
	// utility methods
	//----------------------------
	/**
	 * Pattern compile.
	 */
	private static void patternCompile() {
		try {
			ptnNumber = Pattern.compile(strNumberPattern);
			ptnShortDate = Pattern.compile(strShortDatePattern);
			ptnLongDate = Pattern.compile(strLongDatePattern);
			ptnPercentage = Pattern.compile(strPercentagePattern);
			ptnCurrency = Pattern.compile(strCurrencyPattern);
			ptnViCurrency = Pattern.compile(strViCurrencyPattern);
		} catch (PatternSyntaxException ex) {
			System.err.println(ex.getMessage());
			System.exit(1);
		}

	}

	/**
	 * Pattern matching.
	 *
	 * @param ptnName the ptn name
	 * @param input the input
	 * @return the string
	 */
	private static String patternMatching(String ptnName, String input) {
		String suffix = "";
		if (ptnNumber == null)
			patternCompile();

		Matcher matcher;
		if (ptnName.equals("number")) {
			matcher = ptnNumber.matcher(input);
			if (matcher.matches())
				suffix = ":number";
		} else if (ptnName.equals("short_date")) {
			matcher = ptnShortDate.matcher(input);
			if (matcher.matches())
				suffix = ":short-date";
		} else if (ptnName.equals("long_date")) {
			matcher = ptnLongDate.matcher(input);
			if (matcher.matches())
				suffix = ":long-date";
		} else if (ptnName.equals("percentage")) {
			matcher = ptnPercentage.matcher(input);
			if (matcher.matches())
				suffix = ":percentage";
		} else if (ptnName.equals("currency")) {
			matcher = ptnCurrency.matcher(input);
			if (matcher.matches())
				suffix = ":currency";
			else {
				matcher = ptnViCurrency.matcher(input);
				if (matcher.matches()) {
					suffix = ":currency";
				}
			}
		}
		return suffix;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy