jvnsegmenter.RegexContextGenerator Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsegmenter;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.w3c.dom.Element;
import jvntextpro.data.Sentence;
// TODO: Auto-generated Javadoc
/**
* The Class RegexContextGenerator.
*/
public class RegexContextGenerator extends BasicContextGenerator {
//----------------------------
//variables
//----------------------------
// Regular Expression Pattern string
/** The str number pattern. */
private static String strNumberPattern = "[+-]?\\d+([,.]\\d+)*";
/** The str short date pattern. */
private static String strShortDatePattern = "\\d+[/-:]\\d+";
/** The str long date pattern. */
private static String strLongDatePattern = "\\d+[/-:]\\d+[/-:]\\d+";
/** The str percentage pattern. */
private static String strPercentagePattern = strNumberPattern + "%";
/** The str currency pattern. */
private static String strCurrencyPattern = "\\p{Sc}" + strNumberPattern;
/** The str vi currency pattern. */
private static String strViCurrencyPattern = strNumberPattern + "[ \t]*\\p{Sc}";
// Regular Expression Pattern
/** The ptn number. */
private static Pattern ptnNumber;
/** The ptn short date. */
private static Pattern ptnShortDate;
/** The ptn long date. */
private static Pattern ptnLongDate;
/** The ptn percentage. */
private static Pattern ptnPercentage;
/** The ptn currency. */
private static Pattern ptnCurrency;
/** The ptn vi currency. */
private static Pattern ptnViCurrency;
//----------------------------
//methods
//----------------------------
/**
* Instantiates a new regex context generator.
*
* @param node the node
*/
public RegexContextGenerator(Element node){
readFeatureParameters(node);
}
/* (non-Javadoc)
* @see jvntextpro.data.ContextGenerator#getContext(jvntextpro.data.Sentence, int)
*/
@Override
public String[] getContext(Sentence sent, int pos) {
// generate context predicates
List cps = new ArrayList();
// get the context information from sequence
for (int it = 0; it < cpnames.size(); ++it){
String cp = cpnames.get(it);
Vector paras = this.paras.get(it);
String cpvalue = "";
String suffix = "", regex = "";
String word = "";
boolean outOfArrayIndex = false;
for (int i = 0; i < paras.size(); ++i) {
if (pos + paras.get(i) < 0 || pos + paras.get(i)>= sent.size()){
cpvalue = "";
outOfArrayIndex = true;
break;
}
suffix += paras.get(i) + ":";
word += sent.getWordAt(pos + paras.get(i)) + " ";
}
if (outOfArrayIndex) continue;
word = word.trim().toLowerCase();
suffix = suffix.substring(0, suffix.length() - 1);
suffix = ":" + suffix;
// Match to a specific pattern
regex = patternMatching(cp, word);
if (!regex.equals("")) {
cpvalue = "re" + suffix + regex;
}
if (!cpvalue.equals("")) cps.add(cpvalue);
}
String [] ret = new String[cps.size()];
return cps.toArray(ret);
}
//----------------------------
// utility methods
//----------------------------
/**
* Pattern compile.
*/
private static void patternCompile() {
try {
ptnNumber = Pattern.compile(strNumberPattern);
ptnShortDate = Pattern.compile(strShortDatePattern);
ptnLongDate = Pattern.compile(strLongDatePattern);
ptnPercentage = Pattern.compile(strPercentagePattern);
ptnCurrency = Pattern.compile(strCurrencyPattern);
ptnViCurrency = Pattern.compile(strViCurrencyPattern);
} catch (PatternSyntaxException ex) {
System.err.println(ex.getMessage());
System.exit(1);
}
}
/**
* Pattern matching.
*
* @param ptnName the ptn name
* @param input the input
* @return the string
*/
private static String patternMatching(String ptnName, String input) {
String suffix = "";
if (ptnNumber == null)
patternCompile();
Matcher matcher;
if (ptnName.equals("number")) {
matcher = ptnNumber.matcher(input);
if (matcher.matches())
suffix = ":number";
} else if (ptnName.equals("short_date")) {
matcher = ptnShortDate.matcher(input);
if (matcher.matches())
suffix = ":short-date";
} else if (ptnName.equals("long_date")) {
matcher = ptnLongDate.matcher(input);
if (matcher.matches())
suffix = ":long-date";
} else if (ptnName.equals("percentage")) {
matcher = ptnPercentage.matcher(input);
if (matcher.matches())
suffix = ":percentage";
} else if (ptnName.equals("currency")) {
matcher = ptnCurrency.matcher(input);
if (matcher.matches())
suffix = ":currency";
else {
matcher = ptnViCurrency.matcher(input);
if (matcher.matches()) {
suffix = ":currency";
}
}
}
return suffix;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy