// jvntextpro.util.VnSyllParser Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvntextpro.util;
import java.util.*;
// TODO: Auto-generated Javadoc
/**
 * Type-safe constant describing the tone mark of a Vietnamese syllable.
 *
 * <p>Exactly six instances exist, carrying the integer codes 0 to 5. The code
 * equals the offset of a toned letter inside its six-letter group of the vowel
 * table used by {@code VnSyllParser} (plain letter first, then the five toned
 * variants), which is how the parser converts a table index into a tone.
 */
class TONE {
    /** Integer code of this tone, in the range 0 to 5. */
    private final int value;

    /**
     * Creates one of the six constants; never called from outside this class.
     *
     * @param v the integer code of the tone
     */
    private TONE(int v) {
        value = v;
    }

    /** Code 0: the letter carries no tone mark. */
    public static final TONE NO_TONE = new TONE(0);
    /** Code 1. */
    public static final TONE ACUTE = new TONE(1);
    /** Code 2. */
    public static final TONE ACCENT = new TONE(2);
    /** Code 3. */
    public static final TONE QUESTION = new TONE(3);
    /** Code 4. */
    public static final TONE TILDE = new TONE(4);
    /** Code 5. */
    public static final TONE DOT = new TONE(5);

    /** All constants, positioned so that the entry at index i has code i. */
    private static final TONE[] BY_VALUE = {
        NO_TONE, ACUTE, ACCENT, QUESTION, TILDE, DOT
    };

    /**
     * Maps an integer code to its constant.
     *
     * @param v the integer code
     * @return the constant whose code is {@code v}, or {@link #NO_TONE} when
     *         {@code v} is outside the range 0 to 5
     */
    public static TONE getTone(int v) {
        boolean known = v >= 0 && v < BY_VALUE.length;
        return known ? BY_VALUE[v] : NO_TONE;
    }

    /**
     * Returns the integer code of this tone.
     *
     * @return the code, between 0 and 5
     */
    public int getValue() {
        return value;
    }
}
/*
 * This class parses a Vietnamese syllable in UTF-8 encoding.
 */
/**
 * Splits one Vietnamese syllable into first consonant, secondary vowel, main
 * vowel, last consonant and tone.
 *
 * <p>The four parts are parsed left to right in that order. Matching is
 * case-sensitive and all lookup tables contain lower-case letters only. The
 * parts returned by the getters carry no tone marks; the tone is reported
 * separately by {@link #getTone()}. For example {@code "toan"} yields first
 * consonant {@code "t"}, secondary vowel {@code "o"}, main vowel {@code "a"}
 * and last consonant {@code "n"}.
 *
 * <p>An instance holds the result of its most recent parse and may be reused by
 * calling {@link #parseVnSyllable(String)} again.
 */
public class VnSyllParser {
    // Member Data

    /**
     * Initial consonants separated by '|'. Multi-letter alternatives are listed
     * before their own one-letter prefixes so that the first match is the
     * longest one.
     */
    private static final String vnFirstConsonants = "ngh|ng|gh|ph|ch|tr|nh|kh|th|m|b|v|t|\u0111|n|x|s|l|h|r|d|gi|g|q|k|c";

    /** Final consonants and final semivowels separated by '|'. */
    private static final String vnLastConsonants = "ng|nh|ch|p|t|c|m|n|u|o|y|i";

    /**
     * Main vowels (without tone marks) separated by '|', two-letter vowels
     * before the single letters they start with.
     */
    private static final String vnMainVowels = "i\u00EA|y\u00EA|ia|ya|\u01B0\u01A1|\u01B0a|u\u00F4|ua|oo|\u00EA|e|a|\u01B0|\u0103|o|\u01A1|\u00E2|\u00F4|u|i|y|";

    /** The empty string, used for a syllable part that is absent. */
    public static final String ZERO = "";

    /** Number of entries per vowel group in {@link #vnVowels}. */
    private static final int TONE_VARIANTS = 6;

    /**
     * Every vowel letter in groups of six: the plain letter followed by its
     * five toned variants. For an index i into this string, i / 6 selects the
     * group and i % 6 is the tone code understood by {@code TONE.getTone}.
     */
    private static final String vnVowels = "a\u00E1\u00E0\u1EA3\u00E3\u1EA1"
            + "\u0103\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7"
            + "\u00E2\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD"
            + "e\u00E9\u00E8\u1EBB\u1EBD\u1EB9"
            + "\u00EA\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7"
            + "i\u00ED\u00EC\u1EC9\u0129\u1ECB"
            + "o\u00F3\u00F2\u1ECF\u00F5\u1ECD"
            + "\u00F4\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9"
            + "\u01A1\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3"
            + "u\u00FA\u00F9\u1EE7\u0169\u1EE5"
            + "\u01B0\u1EE9\u1EEB\u1EED\u1EEF\u1EF1"
            + "y\u00FD\u1EF3\u1EF7\u1EF9\u1EF5";

    // The tables are built once during class initialization and are read-only,
    // so constructing parsers never mutates shared state.

    /** Initial consonants in matching order. */
    private static final List<String> alFirstConsonants = splitAlternatives(vnFirstConsonants);

    /** Final consonants. */
    private static final List<String> alLastConsonants = splitAlternatives(vnLastConsonants);

    /** Main vowels in matching order. */
    private static final List<String> alMainVowels = splitAlternatives(vnMainVowels);

    /** The text handed to the most recent parse. */
    private String strSyllable = ZERO;

    /** Main vowel of the last parse, without tone mark. */
    private String strMainVowel = ZERO;

    /** Secondary vowel of the last parse ("o", "u" or empty). */
    private String strSecondaryVowel = ZERO;

    /** Initial consonant of the last parse, or empty. */
    private String strFirstConsonant = ZERO;

    /** Final consonant of the last parse, or empty. */
    private String strLastConsonant = ZERO;

    /** Tone found by the last parse. */
    private TONE tone = TONE.NO_TONE;

    /** Index in {@link #strSyllable} of the first character not yet consumed. */
    private int iCurPos;

    /** Whether the last parse recognised the whole input as one syllable. */
    private boolean validViSyll;

    // Public Methods

    /**
     * Creates a parser and immediately parses {@code syll}.
     *
     * @param syll the syllable to parse; {@code null} is treated as invalid
     */
    public VnSyllParser(String syll) {
        parseVnSyllable(syll);
    }

    /**
     * Creates a parser with no syllable. Until {@link #parseVnSyllable(String)}
     * is called every part is empty, the tone is {@code NO_TONE} and
     * {@link #isValidVnSyllable()} returns {@code false}.
     */
    public VnSyllParser() {
    }

    /**
     * Parses {@code syll}, replacing the result of any earlier parse.
     *
     * @param syll the syllable to parse; {@code null} leaves every part empty
     *             and marks the result as invalid
     */
    public void parseVnSyllable(String syll) {
        strMainVowel = ZERO;
        strSecondaryVowel = ZERO;
        strFirstConsonant = ZERO;
        strLastConsonant = ZERO;
        iCurPos = 0;
        // The tone must be cleared as well: the vowel parsers only record a
        // tone while none is set, so a stale value would survive into this parse.
        tone = TONE.NO_TONE;

        if (syll == null) {
            strSyllable = ZERO;
            validViSyll = false;
            return;
        }

        strSyllable = syll;
        validViSyll = true;
        parseFirstConsonant();
        parseSecondaryVowel();
        parseMainVowel();
        parseLastConsonant();
    }

    /**
     * Gets the first consonant.
     *
     * @return the initial consonant, or the empty string if there is none
     */
    public String getFirstConsonant() {
        return strFirstConsonant;
    }

    /**
     * Gets the secondary vowel.
     *
     * @return {@code "o"}, {@code "u"} or the empty string
     */
    public String getSecondVowel() {
        return strSecondaryVowel;
    }

    /**
     * Gets the main vowel.
     *
     * @return the main vowel without tone mark; empty if none was recognised
     */
    public String getMainVowel() {
        return strMainVowel;
    }

    /**
     * Gets the last consonant.
     *
     * @return the final consonant, or the empty string if there is none
     */
    public String getLastConsonant() {
        return strLastConsonant;
    }

    /**
     * Gets the tone.
     *
     * @return the tone of the first toned vowel met while parsing, or
     *         {@code TONE.NO_TONE} when no vowel carried a tone mark
     */
    public TONE getTone() {
        return tone;
    }

    /**
     * Gets the rhyme.
     *
     * @return secondary vowel, main vowel and last consonant concatenated
     */
    public String getRhyme() {
        return strSecondaryVowel + strMainVowel + strLastConsonant;
    }

    /**
     * Gets the syllable without its tone mark.
     *
     * @return first consonant, secondary vowel, main vowel and last consonant
     *         concatenated
     */
    public String getNonToneSyll() {
        return strFirstConsonant + strSecondaryVowel + strMainVowel + strLastConsonant;
    }

    /**
     * Checks if the last parsed text is a valid Vietnamese syllable.
     *
     * @return {@code true} if the last parse consumed the entire input;
     *         {@code false} otherwise, including before any parse
     */
    public boolean isValidVnSyllable() {
        return validViSyll;
    }

    // Private Methods

    /**
     * Consumes the initial consonant, if any: the first table entry that the
     * syllable starts with at the current position.
     */
    private void parseFirstConsonant() {
        for (String candidate : alFirstConsonants) {
            if (strSyllable.startsWith(candidate, iCurPos)) {
                strFirstConsonant = candidate;
                iCurPos += candidate.length();
                return;
            }
        }
        strFirstConsonant = ZERO;
    }

    /**
     * Consumes a secondary vowel when the current letter is 'o' followed by
     * 'a' or 'e', or 'u' followed by any vowel other than 'i' (all compared
     * without tone marks). Marks the syllable invalid if no input is left.
     */
    private void parseSecondaryVowel() {
        if (!validViSyll) {
            return;
        }
        int length = strSyllable.length();
        if (iCurPos >= length) {
            validViSyll = false;
            return;
        }

        int curIdx = vnVowels.indexOf(strSyllable.charAt(iCurPos));
        if (curIdx == -1) {
            return; // current char is not a vowel
        }
        if (iCurPos + 1 >= length) {
            return; // a lone vowel is the main vowel
        }
        int nextIdx = vnVowels.indexOf(strSyllable.charAt(iCurPos + 1));
        if (nextIdx == -1) {
            return; // next char is not a vowel
        }

        char curBase = baseVowel(curIdx);
        char nextBase = baseVowel(nextIdx);
        boolean isSecondary;
        if (curBase == 'o') {
            isSecondary = nextBase == 'a' || nextBase == 'e';
        } else if (curBase == 'u') {
            // NOTE(review): this also consumes the 'u' of "ua" and "u\u00F4",
            // so those main-vowel entries are rarely reachable - confirm intended.
            isSecondary = nextBase != 'i';
        } else {
            isSecondary = false;
        }
        if (!isSecondary) {
            return;
        }

        strSecondaryVowel = String.valueOf(curBase);
        iCurPos++;

        // The tone mark may sit on the secondary vowel. The main-vowel scan
        // starts after this letter, so the tone has to be recorded here or it
        // is lost.
        TONE found = TONE.getTone(curIdx % TONE_VARIANTS);
        if (found != TONE.NO_TONE) {
            tone = found;
        }
    }

    /**
     * Consumes the main vowel: the first table entry that prefixes the run of
     * vowel letters at the current position, compared without tone marks. The
     * first tone mark met in that run becomes the tone unless one is already
     * recorded. Marks the syllable invalid when no entry matches.
     */
    private void parseMainVowel() {
        if (!validViSyll) {
            return;
        }
        int length = strSyllable.length();
        if (iCurPos >= length) {
            validViSyll = false;
            return;
        }

        StringBuilder vowelRun = new StringBuilder();
        for (int i = iCurPos; i < length; i++) {
            int idx = vnVowels.indexOf(strSyllable.charAt(i));
            if (idx == -1) {
                break;
            }
            vowelRun.append(baseVowel(idx));
            if (tone == TONE.NO_TONE) {
                tone = TONE.getTone(idx % TONE_VARIANTS);
            }
        }

        String strVowel = vowelRun.toString();
        for (String candidate : alMainVowels) {
            if (strVowel.startsWith(candidate)) {
                strMainVowel = candidate;
                iCurPos += candidate.length();
                return;
            }
        }
        validViSyll = false;
    }

    /**
     * Consumes the last consonant. Everything left after the main vowel must
     * be empty or exactly one table entry; otherwise the syllable is invalid.
     */
    private void parseLastConsonant() {
        if (!validViSyll) {
            return;
        }
        if (iCurPos >= strSyllable.length()) {
            // Nothing left: an open syllable without a final consonant.
            strLastConsonant = ZERO;
            return;
        }

        String rest = strSyllable.substring(iCurPos);
        // No separate length check is needed: a remainder longer than every
        // table entry simply fails the lookup.
        if (alLastConsonants.contains(rest)) {
            strLastConsonant = rest;
            iCurPos += rest.length();
            return;
        }
        strLastConsonant = ZERO;
        validViSyll = false;
    }

    /**
     * Returns the plain (tone-free) letter of the vowel group containing the
     * given index.
     *
     * @param idx a valid index into {@link #vnVowels}
     * @return the first letter of that index's six-letter group
     */
    private static char baseVowel(int idx) {
        return vnVowels.charAt((idx / TONE_VARIANTS) * TONE_VARIANTS);
    }

    /**
     * Splits a '|'-separated table into a read-only list, keeping the order
     * and skipping empty entries.
     *
     * @param alternatives the '|'-separated table
     * @return an unmodifiable list of the non-empty entries
     */
    private static List<String> splitAlternatives(String alternatives) {
        List<String> items = new ArrayList<String>();
        StringTokenizer tokens = new StringTokenizer(alternatives, "|");
        while (tokens.hasMoreTokens()) {
            items.add(tokens.nextToken());
        }
        return Collections.unmodifiableList(items);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy