net.sourceforge.pinyin4j.PinyinFormatter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pinyin4j Show documentation
Show all versions of pinyin4j Show documentation
Supports converting Chinese characters (both Simplified and Traditional) to the most popular Pinyin systems, including
Hanyu Pinyin, Tongyong Pinyin, Wade-Giles, MPS2, Yale and Gwoyeu Romatzyh. Supports multiple pronunciations and
customized output.
The newest version!
/**
* This file is part of pinyin4j (http://sourceforge.net/projects/pinyin4j/) and distributed under
* GNU GENERAL PUBLIC LICENSE (GPL).
*
* pinyin4j is free software; you can redistribute it and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* pinyin4j is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with pinyin4j.
*/
package net.sourceforge.pinyin4j;
import java.util.Locale;
import java.util.regex.Pattern;

import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
/**
* Contains logic to format given Pinyin string
*
* @author Li Min ([email protected])
*
*/
class PinyinFormatter {

  /** Lower-case u with diaeresis; the Unicode rendering of the vowel spelled "u:" or "v". */
  private static final String U_UMLAUT = "\u00fc";

  /** Plain vowels in the row order of {@link #MARKED_VOWELS}; "v" stands for u with diaeresis. */
  private static final String UNMARKED_VOWELS = "aeiouv";

  /** Number of entries per vowel in {@link #MARKED_VOWELS}: tones 1 to 4 plus the neutral tone. */
  private static final int TONES_PER_VOWEL = 5;

  /**
   * Tone-marked vowels, one row of {@link #TONES_PER_VOWEL} characters per entry of
   * {@link #UNMARKED_VOWELS}; the column is the tone number minus one. The third tone uses the
   * caron, as Hanyu Pinyin requires, not the similar-looking breve. Written with Unicode escapes
   * so that the table does not depend on the encoding used to compile this file.
   */
  private static final String MARKED_VOWELS =
      "\u0101\u00e1\u01ce\u00e0a" // ā á ǎ à a
          + "\u0113\u00e9\u011b\u00e8e" // ē é ě è e
          + "\u012b\u00ed\u01d0\u00eci" // ī í ǐ ì i
          + "\u014d\u00f3\u01d2\u00f2o" // ō ó ǒ ò o
          + "\u016b\u00fa\u01d4\u00f9u" // ū ú ǔ ù u
          + "\u01d6\u01d8\u01da\u01dc\u00fc"; // ǖ ǘ ǚ ǜ ü

  /** A syllable made of ASCII lower-case letters with an optional trailing tone number 1 to 5. */
  private static final Pattern NUMBERED_SYLLABLE = Pattern.compile("[a-z]*[1-5]?");

  /**
   * Formats a Hanyu Pinyin string according to the requested tone, "u:" and case settings.
   *
   * @param pinyinStr
   *            unformatted Hanyu Pinyin string
   * @param outputFormat
   *            given format of Hanyu Pinyin
   * @return formatted Hanyu Pinyin string
   * @throws BadHanyuPinyinOutputFormatCombination
   *             if tone marks are requested together with the "v" or "u:" spelling, because a
   *             tone mark cannot be placed on either of those
   */
  static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
      throws BadHanyuPinyinOutputFormatCombination {
    final HanyuPinyinToneType toneType = outputFormat.getToneType();
    final HanyuPinyinVCharType vCharType = outputFormat.getVCharType();

    final boolean withToneMark = HanyuPinyinToneType.WITH_TONE_MARK == toneType;
    final boolean asciiVChar = (HanyuPinyinVCharType.WITH_V == vCharType)
        || (HanyuPinyinVCharType.WITH_U_AND_COLON == vCharType);
    if (withToneMark && asciiVChar) {
      throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");
    }

    if (HanyuPinyinToneType.WITHOUT_TONE == toneType) {
      pinyinStr = pinyinStr.replaceAll("[1-5]", "");
    } else if (withToneMark) {
      // The tone-mark converter only understands the single-letter "v" spelling.
      pinyinStr = pinyinStr.replace("u:", "v");
      pinyinStr = convertToneNumber2ToneMark(pinyinStr);
    }

    if (HanyuPinyinVCharType.WITH_V == vCharType) {
      pinyinStr = pinyinStr.replace("u:", "v");
    } else if (HanyuPinyinVCharType.WITH_U_UNICODE == vCharType) {
      pinyinStr = pinyinStr.replace("u:", U_UMLAUT);
    }

    if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
      // Locale.ROOT keeps the mapping stable; a Turkish default locale would turn "i" into "İ".
      pinyinStr = pinyinStr.toUpperCase(Locale.ROOT);
    }
    return pinyinStr;
  }

  /**
   * Converts a trailing tone number into a Unicode tone mark on the proper vowel.
   *
   * <p>
   * The vowel that carries the mark is chosen as follows:
   * <ol>
   * <li>If the syllable contains an "a" or an "e", that vowel takes the mark. No pinyin syllable
   * contains both.</li>
   * <li>Otherwise, if the syllable contains "ou", the "o" takes the mark.</li>
   * <li>Otherwise the last vowel in the syllable takes the mark.</li>
   * </ol>
   *
   * <p>
   * The input is lower-cased first. Every "v" in the result is rendered as u with diaeresis. A
   * string that is not ASCII letters followed by an optional tone number, or a numbered syllable
   * in which no vowel is found, is returned lower-cased and otherwise unchanged.
   *
   * @param pinyinStr
   *            the ASCII representation with tone numbers
   * @return the Unicode representation with tone marks
   */
  private static String convertToneNumber2ToneMark(final String pinyinStr) {
    // Locale.ROOT: a Turkish default locale would map "I" to dotless "ı" and fail the check below.
    final String syllable = pinyinStr.toLowerCase(Locale.ROOT);
    if (!NUMBERED_SYLLABLE.matcher(syllable).matches()) {
      // bad format
      return syllable;
    }

    final int lastIndex = syllable.length() - 1;
    final boolean hasToneNumber = lastIndex >= 0
        && syllable.charAt(lastIndex) >= '1'
        && syllable.charAt(lastIndex) <= '5';
    if (!hasToneNumber) {
      // nothing to mark; only render v as u with diaeresis
      return syllable.replace("v", U_UMLAUT);
    }

    final int vowelIndex = indexOfToneBearingVowel(syllable);
    if (vowelIndex < 0) {
      // no vowel to carry the mark; leave the tone number in place
      return syllable;
    }

    final int toneNumber = syllable.charAt(lastIndex) - '0';
    final int row = UNMARKED_VOWELS.indexOf(syllable.charAt(vowelIndex));
    final char markedVowel = MARKED_VOWELS.charAt(row * TONES_PER_VOWEL + (toneNumber - 1));

    // Rebuild the syllable around the marked vowel, dropping the trailing tone number.
    return syllable.substring(0, vowelIndex).replace("v", U_UMLAUT)
        + markedVowel
        + syllable.substring(vowelIndex + 1, lastIndex).replace("v", U_UMLAUT);
  }

  /**
   * Finds the vowel that carries the tone mark: the first "a", else the first "e", else the "o"
   * of the first "ou", else the last vowel.
   *
   * @param syllable
   *            lower-case syllable, optionally ending in a tone number
   * @return index of the vowel to mark, or -1 if the syllable contains no vowel
   */
  private static int indexOfToneBearingVowel(final String syllable) {
    int index = syllable.indexOf('a');
    if (index < 0) {
      index = syllable.indexOf('e');
    }
    if (index < 0) {
      index = syllable.indexOf("ou");
    }
    if (index >= 0) {
      return index;
    }
    for (int i = syllable.length() - 1; i >= 0; i--) {
      if (UNMARKED_VOWELS.indexOf(syllable.charAt(i)) >= 0) {
        return i;
      }
    }
    return -1;
  }
}