jvntextpro.util.VnSyllParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvntextpro.util;
import java.util.*;
// TODO: Auto-generated Javadoc
/**
 * Tone mark of a Vietnamese syllable, modelled as a small closed set of
 * shared instances.
 *
 * The integer value of a tone equals the offset of the tone-marked character
 * inside each six-character vowel group of the parser's vowel table: offset 0
 * is the bare vowel, offsets 1 to 5 are the acute, grave ("ACCENT"), hook
 * ("QUESTION"), tilde and dot-below forms, in that order.
 */
class TONE {
    public static final TONE NO_TONE = new TONE(0);
    public static final TONE ACUTE = new TONE(1);
    public static final TONE ACCENT = new TONE(2);
    public static final TONE QUESTION = new TONE(3);
    public static final TONE TILDE = new TONE(4);
    public static final TONE DOT = new TONE(5);

    /** Numeric code of this tone (0 to 5). */
    private final int value;

    private TONE(int v) {
        value = v;
    }

    /**
     * Maps a numeric code to the shared tone instance.
     *
     * @param v the tone code
     * @return the matching tone, or {@link #NO_TONE} for any code outside 1..5
     */
    public static TONE getTone(int v) {
        switch (v) {
            case 1:
                return ACUTE;
            case 2:
                return ACCENT;
            case 3:
                return QUESTION;
            case 4:
                return TILDE;
            case 5:
                return DOT;
            case 0:
            default:
                return NO_TONE;
        }
    }

    /**
     * Gets the numeric code of this tone.
     *
     * @return the tone code
     */
    public int getValue() {
        return value;
    }
}

/**
 * Parses a single Vietnamese syllable into first consonant, secondary vowel,
 * main vowel, last consonant and tone.
 *
 * The parser only recognises the lower-case characters listed in its tables.
 * An instance may be reused by calling {@link #parseVnSyllable(String)}
 * again; every call starts from a clean state.
 */
public class VnSyllParser {
    // Member Data

    /** Initial consonants separated by '|'; longer alternatives come first so they win. */
    private static final String vnFirstConsonants = "ngh|ng|gh|ph|ch|tr|nh|kh|th|m|b|v|t|\u0111|n|x|s|l|h|r|d|gi|g|q|k|c";

    /** Final consonants (and final semi-vowels) separated by '|'. */
    private static final String vnLastConsonants = "ng|nh|ch|p|t|c|m|n|u|o|y|i";

    /** Main vowels (tone marks stripped) separated by '|'; longer alternatives come first. */
    private static final String vnMainVowels = "i\u00EA|y\u00EA|ia|ya|\u01B0\u01A1|\u01B0a|u\u00F4|ua|oo|\u00EA|e|a|\u01B0|\u0103|o|\u01A1|\u00E2|\u00F4|u|i|y|";

    /** Secondary vowels separated by '|'. */
    private static final String vnSecondaryVowels = "o|u";

    /** The empty component, used when a part of the syllable is absent. */
    public static final String ZERO = "";

    /**
     * Vowel table laid out in groups of six characters: the bare vowel followed
     * by its five tone-marked forms. For a character found at index i, the tone
     * code is i % 6 and the bare vowel sits at (i / 6) * 6.
     */
    private static final String vnVowels = "a\u00E1\u00E0\u1EA3\u00E3\u1EA1"
            + "\u0103\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7"
            + "\u00E2\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD"
            + "e\u00E9\u00E8\u1EBB\u1EBD\u1EB9"
            + "\u00EA\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7"
            + "i\u00ED\u00EC\u1EC9\u0129\u1ECB"
            + "o\u00F3\u00F2\u1ECF\u00F5\u1ECD"
            + "\u00F4\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9"
            + "\u01A1\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3"
            + "u\u00FA\u00F9\u1EE7\u0169\u1EE5"
            + "\u01B0\u1EE9\u1EEB\u1EED\u1EEF\u1EF1"
            + "y\u00FD\u1EF3\u1EF7\u1EF9\u1EF5";

    /** Tokenised form of {@link #vnFirstConsonants}; filled by {@link #init()}. */
    private static ArrayList alFirstConsonants;

    /** Tokenised form of {@link #vnLastConsonants}; filled by {@link #init()}. */
    private static ArrayList alLastConsonants;

    /** Tokenised form of {@link #vnMainVowels}; filled by {@link #init()}. */
    private static ArrayList alMainVowels;

    /** Tokenised form of {@link #vnSecondaryVowels}; filled by {@link #init()}. */
    private static ArrayList alSecondaryVowels;

    /** The syllable handed to the last parse call. */
    private String strSyllable;

    /** Main vowel of the parsed syllable, tone mark removed. */
    private String strMainVowel;

    /** Secondary vowel of the parsed syllable, tone mark removed. */
    private String strSecondaryVowel;

    /** First consonant of the parsed syllable. */
    private String strFirstConsonant;

    /** Last consonant of the parsed syllable. */
    private String strLastConsonant;

    /** Tone of the parsed syllable. */
    private TONE tone = TONE.NO_TONE;

    /** Index of the next unconsumed character of {@link #strSyllable}. */
    private int iCurPos;

    /** Whether the last parsed string was accepted as a Vietnamese syllable. */
    private boolean validViSyll;

    // Public Methods

    /**
     * Creates a parser and immediately parses the given syllable.
     *
     * @param syll the syllable to parse
     */
    public VnSyllParser(String syll) {
        init();
        parseVnSyllable(syll);
    }

    /**
     * Creates a parser without parsing anything yet.
     */
    public VnSyllParser() {
        init();
    }

    /**
     * Parses a syllable, replacing the result of any previous parse.
     *
     * @param syll the syllable to parse; {@code null} is reported as an invalid
     *             syllable instead of raising an exception
     */
    public void parseVnSyllable(String syll) {
        strSyllable = syll;
        strMainVowel = "";
        strSecondaryVowel = "";
        strFirstConsonant = "";
        strLastConsonant = "";
        iCurPos = 0;
        // The tone must be cleared as well: the vowel parsers only overwrite it
        // when they meet a tone mark, so a stale value from an earlier parse
        // would otherwise leak into an unmarked syllable.
        tone = TONE.NO_TONE;
        validViSyll = true;

        if (syll == null) {
            validViSyll = false;
            return;
        }

        parseFirstConsonant();
        parseSecondaryVowel();
        parseMainVowel();
        parseLastConsonant();
    }

    /**
     * Gets the first consonant.
     *
     * @return the first consonant, or the empty string when there is none
     */
    public String getFirstConsonant() {
        return strFirstConsonant;
    }

    /**
     * Gets the secondary vowel.
     *
     * @return the secondary vowel without tone mark, or the empty string
     */
    public String getSecondVowel() {
        return strSecondaryVowel;
    }

    /**
     * Gets the main vowel.
     *
     * @return the main vowel without tone mark
     */
    public String getMainVowel() {
        return strMainVowel;
    }

    /**
     * Gets the last consonant.
     *
     * @return the last consonant, or the empty string when there is none
     */
    public String getLastConsonant() {
        return strLastConsonant;
    }

    /**
     * Gets the tone.
     *
     * @return the tone of the last parsed syllable
     */
    public TONE getTone() {
        return tone;
    }

    /**
     * Gets the rhyme.
     *
     * @return secondary vowel, main vowel and last consonant concatenated
     */
    public String getRhyme() {
        return strSecondaryVowel + strMainVowel + strLastConsonant;
    }

    /**
     * Gets the syllable with its tone mark removed.
     *
     * @return all four parsed components concatenated
     */
    public String getNonToneSyll() {
        return strFirstConsonant + strSecondaryVowel + strMainVowel + strLastConsonant;
    }

    /**
     * Checks whether the last parsed string is a valid Vietnamese syllable.
     *
     * @return true, if the last parse succeeded
     */
    public boolean isValidVnSyllable() {
        return validViSyll;
    }

    // Private Methods

    /**
     * Consumes the first consonant: the first table entry that the syllable
     * starts with at the current position, or nothing when no entry matches.
     */
    private void parseFirstConsonant() {
        Iterator iter = alFirstConsonants.iterator();
        while (iter.hasNext()) {
            String strFirstCon = (String) iter.next();
            if (strSyllable.startsWith(strFirstCon, iCurPos)) {
                strFirstConsonant = strFirstCon;
                iCurPos += strFirstCon.length();
                return;
            }
        }
        strFirstConsonant = ZERO;
    }

    /**
     * Consumes an optional secondary vowel ('o' before 'a'/'e', or 'u' before
     * any vowel other than 'i') and records the tone carried by the current or
     * the following vowel character.
     */
    private void parseSecondaryVowel() {
        if (!validViSyll) {
            return;
        }
        if (iCurPos > strSyllable.length() - 1) {
            // nothing left after the first consonant
            validViSyll = false;
            return;
        }

        char curChar = strSyllable.charAt(iCurPos);
        // '$' is a sentinel for "end of syllable"; it is not in the vowel table
        char nextChar = (iCurPos == strSyllable.length() - 1)
                ? '$'
                : strSyllable.charAt(iCurPos + 1);

        int idx1 = vnVowels.indexOf(curChar);
        int idx2 = vnVowels.indexOf(nextChar);
        if (idx1 == -1) {
            return; // current char is not a vowel
        }

        // Store the tone in the field (not in a local): when the secondary
        // vowel is consumed below, the main-vowel scan never sees this
        // character again, so its tone mark would otherwise be lost.
        tone = TONE.getTone(idx1 % 6);
        curChar = vnVowels.charAt((idx1 / 6) * 6);

        if (idx2 == -1) { // next char is not a vowel
            strSecondaryVowel = ZERO;
            return;
        }

        nextChar = vnVowels.charAt((idx2 / 6) * 6);
        if (tone.getValue() == TONE.NO_TONE.getValue()) {
            tone = TONE.getTone(idx2 % 6);
        }

        // Check the secondary vowel
        if (curChar == 'o') {
            if (nextChar == 'a' || nextChar == 'e') {
                strSecondaryVowel += curChar;
                iCurPos++;
            } else {
                strSecondaryVowel = ZERO; // oo
            }
        } else if (curChar == 'u') {
            if (nextChar != 'i' && nextChar != '$') {
                strSecondaryVowel += curChar;
                iCurPos++;
            } else {
                strSecondaryVowel = ZERO;
            }
        }
    }

    /**
     * Consumes the main vowel. The run of vowel characters at the current
     * position is stripped of tone marks, and the first table entry that is a
     * prefix of that run becomes the main vowel. The first tone mark seen in
     * the run is recorded unless a tone is already known.
     */
    private void parseMainVowel() {
        if (!validViSyll) {
            return;
        }
        if (iCurPos > strSyllable.length() - 1) {
            validViSyll = false;
            return;
        }

        String strVowel = "";
        for (int i = iCurPos; i < strSyllable.length(); ++i) {
            int idx = vnVowels.indexOf(strSyllable.charAt(i));
            if (idx == -1) {
                break;
            }
            strVowel += vnVowels.charAt((idx / 6) * 6);
            if (tone.getValue() == TONE.NO_TONE.getValue()) {
                tone = TONE.getTone(idx % 6);
            }
        }

        Iterator iter = alMainVowels.iterator();
        while (iter.hasNext()) {
            String tempVowel = (String) iter.next();
            if (strVowel.startsWith(tempVowel)) {
                strMainVowel = tempVowel;
                iCurPos += tempVowel.length();
                return;
            }
        }
        validViSyll = false;
    }

    /**
     * Consumes the last consonant: whatever remains of the syllable must be
     * empty or exactly one entry of the last-consonant table, otherwise the
     * syllable is marked invalid.
     */
    private void parseLastConsonant() {
        if (!validViSyll) {
            return;
        }
        if (iCurPos >= strSyllable.length()) {
            // Nothing left: no last consonant, syllable stays valid. Returning
            // here also keeps substring() below from ever seeing an
            // out-of-range start index.
            strLastConsonant = ZERO;
            return;
        }

        String strCon = strSyllable.substring(iCurPos);
        if (strCon.length() > 3) {
            validViSyll = false;
            return;
        }

        Iterator iter = alLastConsonants.iterator();
        while (iter.hasNext()) {
            String tempLastCon = (String) iter.next();
            if (strCon.equals(tempLastCon)) {
                strLastConsonant = tempLastCon;
                iCurPos += strLastConsonant.length();
                return;
            }
        }

        // unparsed characters remain and they are not a known last consonant
        strLastConsonant = ZERO;
        validViSyll = false;
    }

    /**
     * Fills the shared lookup lists from the '|'-separated tables the first
     * time a parser is created. The null check is not synchronized.
     */
    private static void init() {
        if (alFirstConsonants == null) {
            alFirstConsonants = new ArrayList();
            alLastConsonants = new ArrayList();
            alMainVowels = new ArrayList();
            alSecondaryVowels = new ArrayList();
            initArrayList(alFirstConsonants, vnFirstConsonants);
            initArrayList(alLastConsonants, vnLastConsonants);
            initArrayList(alMainVowels, vnMainVowels);
            initArrayList(alSecondaryVowels, vnSecondaryVowels);
        }
    }

    /**
     * Appends every non-empty '|'-separated token of a table to a list,
     * keeping the table order.
     *
     * @param al the list to append to
     * @param str the '|'-separated table
     */
    private static void initArrayList(ArrayList al, String str) {
        StringTokenizer strTknr = new StringTokenizer(str, "|");
        while (strTknr.hasMoreTokens()) {
            al.add(strTknr.nextToken());
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy