jvnsegmenter.RegexContextGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsegmenter;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.w3c.dom.Element;
import jvntextpro.data.Sentence;
// TODO: Auto-generated Javadoc
/**
* The Class RegexContextGenerator.
*/
public class RegexContextGenerator extends BasicContextGenerator {
//----------------------------
//variables
//----------------------------
// Regular Expression Pattern string
/** The str number pattern. */
private static String strNumberPattern = "[+-]?\\d+([,.]\\d+)*";
/** The str short date pattern. */
private static String strShortDatePattern = "\\d+[/-:]\\d+";
/** The str long date pattern. */
private static String strLongDatePattern = "\\d+[/-:]\\d+[/-:]\\d+";
/** The str percentage pattern. */
private static String strPercentagePattern = strNumberPattern + "%";
/** The str currency pattern. */
private static String strCurrencyPattern = "\\p{Sc}" + strNumberPattern;
/** The str vi currency pattern. */
private static String strViCurrencyPattern = strNumberPattern + "[ \t]*\\p{Sc}";
// Regular Expression Pattern
/** The ptn number. */
private static Pattern ptnNumber;
/** The ptn short date. */
private static Pattern ptnShortDate;
/** The ptn long date. */
private static Pattern ptnLongDate;
/** The ptn percentage. */
private static Pattern ptnPercentage;
/** The ptn currency. */
private static Pattern ptnCurrency;
/** The ptn vi currency. */
private static Pattern ptnViCurrency;
//----------------------------
//methods
//----------------------------
/**
* Instantiates a new regex context generator.
*
* @param node the node
*/
public RegexContextGenerator(Element node){
readFeatureParameters(node);
}
/* (non-Javadoc)
* @see jvntextpro.data.ContextGenerator#getContext(jvntextpro.data.Sentence, int)
*/
@Override
public String[] getContext(Sentence sent, int pos) {
// generate context predicates
List cps = new ArrayList();
// get the context information from sequence
for (int it = 0; it < cpnames.size(); ++it){
String cp = cpnames.get(it);
Vector paras = this.paras.get(it);
String cpvalue = "";
String suffix = "", regex = "";
String word = "";
boolean outOfArrayIndex = false;
for (int i = 0; i < paras.size(); ++i) {
if (pos + paras.get(i) < 0 || pos + paras.get(i)>= sent.size()){
cpvalue = "";
outOfArrayIndex = true;
break;
}
suffix += paras.get(i) + ":";
word += sent.getWordAt(pos + paras.get(i)) + " ";
}
if (outOfArrayIndex) continue;
word = word.trim().toLowerCase();
suffix = suffix.substring(0, suffix.length() - 1);
suffix = ":" + suffix;
// Match to a specific pattern
regex = patternMatching(cp, word);
if (!regex.equals("")) {
cpvalue = "re" + suffix + regex;
}
if (!cpvalue.equals("")) cps.add(cpvalue);
}
String [] ret = new String[cps.size()];
return cps.toArray(ret);
}
//----------------------------
// utility methods
//----------------------------
/**
* Pattern compile.
*/
private static void patternCompile() {
try {
ptnNumber = Pattern.compile(strNumberPattern);
ptnShortDate = Pattern.compile(strShortDatePattern);
ptnLongDate = Pattern.compile(strLongDatePattern);
ptnPercentage = Pattern.compile(strPercentagePattern);
ptnCurrency = Pattern.compile(strCurrencyPattern);
ptnViCurrency = Pattern.compile(strViCurrencyPattern);
} catch (PatternSyntaxException ex) {
System.err.println(ex.getMessage());
System.exit(1);
}
}
/**
* Pattern matching.
*
* @param ptnName the ptn name
* @param input the input
* @return the string
*/
private static String patternMatching(String ptnName, String input) {
String suffix = "";
if (ptnNumber == null)
patternCompile();
Matcher matcher;
if (ptnName.equals("number")) {
matcher = ptnNumber.matcher(input);
if (matcher.matches())
suffix = ":number";
} else if (ptnName.equals("short_date")) {
matcher = ptnShortDate.matcher(input);
if (matcher.matches())
suffix = ":short-date";
} else if (ptnName.equals("long_date")) {
matcher = ptnLongDate.matcher(input);
if (matcher.matches())
suffix = ":long-date";
} else if (ptnName.equals("percentage")) {
matcher = ptnPercentage.matcher(input);
if (matcher.matches())
suffix = ":percentage";
} else if (ptnName.equals("currency")) {
matcher = ptnCurrency.matcher(input);
if (matcher.matches())
suffix = ":currency";
else {
matcher = ptnViCurrency.matcher(input);
if (matcher.matches()) {
suffix = ":currency";
}
}
}
return suffix;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy