// net.sourceforge.pinyin4j.PinyinHelper (pinyin4j-multi artifact; Maven / Gradle / Ivy listing)
/**
* This file is part of pinyin4j (http://sourceforge.net/projects/pinyin4j/) and distributed under
* GNU GENERAL PUBLIC LICENSE (GPL).
*
* pinyin4j is free software; you can redistribute it and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* pinyin4j is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with pinyin4j.
*/
package net.sourceforge.pinyin4j;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;
/**
* A utility class providing functions to convert Chinese characters
* (both Simplified and Traditional) into various Chinese romanization
* representations.
*
* @author Li Min ([email protected])
*/
public class PinyinHelper {
  /** Shared "no result" array; it has length zero, so handing out the same instance is safe. */
  private static final String[] ARR_EMPTY = {};

  private static final String EMPTY = "";

  /**
   * Gets all unformatted Hanyu Pinyin presentations of a single Chinese
   * character (both Simplified and Traditional).
   *
   * <p>For example, if the input is '间', the return will be an array with two
   * Hanyu Pinyin strings: {@code "jian1"} and {@code "jian4"}. If the input is
   * '李', the return will be an array with a single Hanyu Pinyin string:
   * {@code "li3"}.
   *
   * <p>Special note (from the original documentation): if the return is
   * {@code "none0"}, the input character exists in the Unicode CJK table but
   * has no pronunciation in Chinese.
   *
   * @param ch the given Chinese character
   * @return the result of the underlying resource lookup, returned as-is: a
   *         String array containing all unformatted Hanyu Pinyin presentations
   *         with tone numbers. Unlike the other converters in this class, no
   *         null-to-empty normalization is applied here, so callers should be
   *         prepared for {@code null} when no record exists.
   */
  public static String[] toHanyuPinyinStringArray(char ch) {
    return getUnformattedHanyuPinyinStringArray(ch);
  }

  /**
   * Gets all Hanyu Pinyin presentations of a single Chinese character (both
   * Simplified and Traditional), rendered with the given output format.
   *
   * <p>For example, if the input is '间', the return will be an array with two
   * Hanyu Pinyin strings (such as {@code "jian1"} and {@code "jian4"}, subject
   * to the output format). If the input is '李', the return will be an array
   * with a single Hanyu Pinyin string (such as {@code "li3"}).
   *
   * <p>Special note (from the original documentation): if the return is
   * {@code "none0"}, the input character is in the Unicode CJK table but has
   * no pronunciation in Chinese.
   *
   * @param ch the given Chinese character
   * @param outputFormat describes the desired format of returned Hanyu Pinyin String
   * @return a String array containing all formatted Hanyu Pinyin presentations;
   *         an empty array when the lookup yields no record
   * @throws BadHanyuPinyinOutputFormatCombination if certain combination of output formats happens
   * @see HanyuPinyinOutputFormat
   * @see BadHanyuPinyinOutputFormatCombination
   */
  public static String[] toHanyuPinyinStringArray(char ch, HanyuPinyinOutputFormat outputFormat)
      throws BadHanyuPinyinOutputFormatCombination {
    return getFormattedHanyuPinyinStringArray(ch, outputFormat);
  }

  /**
   * Returns the formatted Hanyu Pinyin representations of the given Chinese
   * character (both Simplified and Traditional) in array format.
   *
   * @param ch the given Chinese character
   * @param outputFormat describes the desired format of returned Hanyu Pinyin string
   * @return the formatted Hanyu Pinyin representations in a newly allocated
   *         array; an empty array if the lookup returns {@code null}
   * @throws BadHanyuPinyinOutputFormatCombination if the formatter rejects the output format
   */
  private static String[] getFormattedHanyuPinyinStringArray(char ch,
      HanyuPinyinOutputFormat outputFormat) throws BadHanyuPinyinOutputFormatCombination {
    String[] unformatted = getUnformattedHanyuPinyinStringArray(ch);
    if (unformatted == null) {
      return ARR_EMPTY;
    }
    // Write into a fresh array (as the other converters in this class do) instead of
    // overwriting the array handed back by the resource lookup.
    String[] formatted = new String[unformatted.length];
    for (int i = 0; i < unformatted.length; i++) {
      formatted[i] = PinyinFormatter.formatHanyuPinyin(unformatted[i], outputFormat);
    }
    return formatted;
  }

  /**
   * Delegates the lookup to {@link ChineseToPinyinResource}.
   *
   * @param ch the given Chinese character
   * @return unformatted Hanyu Pinyin strings exactly as returned by the
   *         resource; callers in this class treat {@code null} as "no record"
   */
  private static String[] getUnformattedHanyuPinyinStringArray(char ch) {
    return ChineseToPinyinResource.getInstance().getHanyuPinyinStringArray(ch);
  }

  /**
   * Gets all unformatted Tongyong Pinyin presentations of a single Chinese
   * character (both Simplified and Traditional).
   *
   * @param ch the given Chinese character
   * @return a String array containing all unformatted Tongyong Pinyin
   *         presentations; an empty array when the lookup yields no record
   * @see #toHanyuPinyinStringArray(char)
   */
  public static String[] toTongyongPinyinStringArray(char ch) {
    return convertToTargetPinyinStringArray(ch, PinyinRomanizationType.TONGYONG_PINYIN);
  }

  /**
   * Gets all unformatted Wade-Giles presentations of a single Chinese
   * character (both Simplified and Traditional).
   *
   * @param ch the given Chinese character
   * @return a String array containing all unformatted Wade-Giles
   *         presentations; an empty array when the lookup yields no record
   * @see #toHanyuPinyinStringArray(char)
   */
  public static String[] toWadeGilesPinyinStringArray(char ch) {
    return convertToTargetPinyinStringArray(ch, PinyinRomanizationType.WADEGILES_PINYIN);
  }

  /**
   * Gets all unformatted MPS2 (Mandarin Phonetic Symbols 2) presentations of
   * a single Chinese character (both Simplified and Traditional).
   *
   * @param ch the given Chinese character
   * @return a String array containing all unformatted MPS2 presentations; an
   *         empty array when the lookup yields no record
   * @see #toHanyuPinyinStringArray(char)
   */
  public static String[] toMPS2PinyinStringArray(char ch) {
    return convertToTargetPinyinStringArray(ch, PinyinRomanizationType.MPS2_PINYIN);
  }

  /**
   * Gets all unformatted Yale Pinyin presentations of a single Chinese
   * character (both Simplified and Traditional).
   *
   * @param ch the given Chinese character
   * @return a String array containing all unformatted Yale Pinyin
   *         presentations; an empty array when the lookup yields no record
   * @see #toHanyuPinyinStringArray(char)
   */
  public static String[] toYalePinyinStringArray(char ch) {
    return convertToTargetPinyinStringArray(ch, PinyinRomanizationType.YALE_PINYIN);
  }

  /**
   * Converts every Hanyu Pinyin reading of a character into another
   * romanization system.
   *
   * @param ch the given Chinese character
   * @param targetPinyinSystem the romanization system to convert to
   * @return the converted readings in a newly allocated array, one entry per
   *         Hanyu Pinyin reading; an empty array if the lookup returns
   *         {@code null}
   * @see PinyinRomanizationType
   */
  private static String[] convertToTargetPinyinStringArray(char ch,
      PinyinRomanizationType targetPinyinSystem) {
    String[] hanyuPinyin = getUnformattedHanyuPinyinStringArray(ch);
    if (hanyuPinyin == null) {
      return ARR_EMPTY;
    }
    String[] converted = new String[hanyuPinyin.length];
    for (int i = 0; i < hanyuPinyin.length; i++) {
      converted[i] = PinyinRomanizationTranslator.convertRomanizationSystem(
          hanyuPinyin[i], PinyinRomanizationType.HANYU_PINYIN, targetPinyinSystem);
    }
    return converted;
  }

  /**
   * Gets all unformatted Gwoyeu Romatzyh presentations of a single Chinese
   * character (both Simplified and Traditional).
   *
   * @param ch the given Chinese character
   * @return a String array containing all unformatted Gwoyeu Romatzyh
   *         presentations; an empty array when the lookup yields no record
   * @see #toHanyuPinyinStringArray(char)
   */
  public static String[] toGwoyeuRomatzyhStringArray(char ch) {
    return convertToGwoyeuRomatzyhStringArray(ch);
  }

  /**
   * Converts every Hanyu Pinyin reading of a character into Gwoyeu Romatzyh.
   *
   * @param ch the given Chinese character
   * @return the converted readings in a newly allocated array, one entry per
   *         Hanyu Pinyin reading; an empty array if the lookup returns
   *         {@code null}
   */
  private static String[] convertToGwoyeuRomatzyhStringArray(char ch) {
    String[] hanyuPinyin = getUnformattedHanyuPinyinStringArray(ch);
    if (hanyuPinyin == null) {
      return ARR_EMPTY;
    }
    String[] converted = new String[hanyuPinyin.length];
    for (int i = 0; i < hanyuPinyin.length; i++) {
      converted[i] = GwoyeuRomatzyhTranslator.convertHanyuPinyinToGwoyeuRomatzyh(hanyuPinyin[i]);
    }
    return converted;
  }

  /**
   * Gets a string in which all recognizable Chinese characters are replaced
   * by their Hanyu Pinyin representation.
   *
   * <p>The (trimmed) input is scanned left to right. At each position the
   * longest run of characters that has a pinyin entry in the resource's trie
   * is matched. When the match is a single character only its first parsed
   * reading is emitted; when the match spans several characters every parsed
   * entry is emitted. Positions with no match are either copied through or
   * dropped, depending on {@code retain}.
   *
   * <p>Note on separators: {@code separate} is appended after every emitted
   * pinyin entry and also after every unmatched position (even when the
   * unmatched character itself is dropped). The single trailing separator is
   * removed before returning.
   *
   * <p>Special note (from the original documentation): if the return contains
   * {@code "none0"}, that Chinese character is in the Unicode CJK table but
   * has no pronunciation in Chinese. The original documentation also states
   * that this interface will be removed in the next release.
   *
   * @param str a given string containing Chinese characters; {@code null},
   *        empty, or whitespace-only input yields an empty string
   * @param outputFormat describes the desired format of returned Hanyu Pinyin string
   * @param separate the separator string; {@code null} is treated as empty
   * @param retain whether characters that cannot be converted into pinyin are
   *        kept in the output
   * @param firstLetter when {@code true}, only the first character of each
   *        formatted pinyin entry is emitted
   * @return the converted string
   * @throws BadHanyuPinyinOutputFormatCombination if certain combination of output formats happens
   */
  public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat,
      String separate, boolean retain, boolean firstLetter)
      throws BadHanyuPinyinOutputFormatCombination {
    if (str == null) {
      return EMPTY;
    }
    String text = str.trim();
    if (text.length() == 0) {
      return EMPTY;
    }
    String separator = (separate == null) ? EMPTY : separate;

    ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
    StringBuilder out = new StringBuilder();
    char[] chars = text.toCharArray();

    for (int i = 0; i < chars.length; i++) {
      String matchedPinyin = null; // pinyin of the longest match found so far
      int matchEnd = i; // index of the last character covered by that match

      // Walk the trie one character at a time; keys are the upper-case hex code of the char.
      Trie node = resource.getUnicodeToHanyuPinyinTable();
      for (int cursor = i; node != null && cursor < chars.length; cursor++) {
        node = node.get(Integer.toHexString(chars[cursor]).toUpperCase());
        if (node != null) {
          String pinyin = node.getPinyin();
          if (pinyin != null) {
            matchedPinyin = pinyin;
            matchEnd = cursor;
          }
          node = node.getNextTire();
        }
      }

      if (matchedPinyin == null) {
        // No trie entry: the character cannot be converted, so copy it through or drop it.
        if (retain) {
          out.append(chars[i]);
        }
        out.append(separator);
      } else {
        String[] entries = resource.parsePinyinString(matchedPinyin);
        if (entries != null) {
          boolean singleCharacterMatch = (matchEnd == i);
          for (String entry : entries) {
            String formatted = PinyinFormatter.formatHanyuPinyin(entry, outputFormat);
            // Guard against an empty formatted entry so charAt(0) cannot throw.
            if (firstLetter && formatted.length() > 0) {
              out.append(formatted.charAt(0));
            } else {
              out.append(formatted);
            }
            out.append(separator);
            if (singleCharacterMatch) {
              break; // a single character contributes only its first reading
            }
          }
        }
      }
      // Skip past the characters consumed by a multi-character match.
      i = matchEnd;
    }

    // Drop the trailing separator. The buffer can be empty (nothing was appended at all),
    // in which case there is nothing to strip and setLength must not get a negative value.
    int lengthWithoutTrailingSeparator = out.length() - separator.length();
    if (lengthWithoutTrailingSeparator >= 0) {
      out.setLength(lengthWithoutTrailingSeparator);
    }
    return out.toString();
  }

  /**
   * Converts a string like
   * {@link #toHanYuPinyinString(String, HanyuPinyinOutputFormat, String, boolean, boolean)}
   * with {@code firstLetter} set to {@code true}, i.e. only the first
   * character of each formatted pinyin entry is emitted.
   *
   * @param str a given string containing Chinese characters
   * @param outputFormat describes the desired format of returned Hanyu Pinyin string
   * @param separate the separator string; {@code null} is treated as empty
   * @param retain whether characters that cannot be converted into pinyin are kept
   * @return the converted string
   * @throws BadHanyuPinyinOutputFormatCombination if certain combination of output formats happens
   */
  public static String toHanYuPinyinStringFirstLetter(String str,
      HanyuPinyinOutputFormat outputFormat, String separate, boolean retain)
      throws BadHanyuPinyinOutputFormatCombination {
    return PinyinHelper.toHanYuPinyinString(str, outputFormat, separate, retain, true);
  }

  /**
   * Converts a string like
   * {@link #toHanYuPinyinString(String, HanyuPinyinOutputFormat, String, boolean, boolean)}
   * with {@code firstLetter} set to {@code false}, i.e. full pinyin entries
   * are emitted.
   *
   * @param str a given string containing Chinese characters
   * @param outputFormat describes the desired format of returned Hanyu Pinyin string
   * @param separate the separator string; {@code null} is treated as empty
   * @param retain whether characters that cannot be converted into pinyin are kept
   * @return the converted string
   * @throws BadHanyuPinyinOutputFormatCombination if certain combination of output formats happens
   */
  public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat,
      String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
    return PinyinHelper.toHanYuPinyinString(str, outputFormat, separate, retain, false);
  }

  /**
   * Tells whether the given character is one of the 52 English letters
   * {@code A-Z} / {@code a-z}.
   *
   * <p>The check is a plain ASCII range test, so cased letters of other
   * scripts (for example 'é', 'Ω' or full-width Latin letters) are not
   * reported as English letters.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is in {@code A-Z} or {@code a-z}
   */
  public static boolean isEnglishLetter(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  }

  // ! Hidden constructor: this class only exposes static helpers.
  private PinyinHelper() {}
}