com.wl4g.component.common.nlp.PingyUtils Maven / Gradle / Ivy
/*
* Copyright 2017 ~ 2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.wl4g.component.common.nlp;
import java.util.ArrayList;
import java.util.List;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
/**
* Chinese processing conversion recognition tools.
*
* @author Wangl.sir <[email protected], [email protected]>
* @version 2017-03-29
* @sine v1.0.0
* @see
*/
public abstract class PingyUtils {
/**
* Gets Chinese spelling
*
* @param cnStrring
* Target Chinese string
* @return
*/
public static List getPingyin(String cnStrring) {
List pingys = null;
try {
char[] chArr = cnStrring.toCharArray();
String[] strArr = new String[chArr.length];
HanyuPinyinOutputFormat t3 = new HanyuPinyinOutputFormat();
t3.setCaseType(HanyuPinyinCaseType.LOWERCASE);
t3.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
t3.setVCharType(HanyuPinyinVCharType.WITH_V);
pingys = new ArrayList(chArr.length);
for (int i = 0; i < chArr.length; i++) {
// 判断是否为汉字字符
if (java.lang.Character.toString(chArr[i]).matches("[\\u4E00-\\u9FA5]+")) {
strArr = PinyinHelper.toHanyuPinyinStringArray(chArr[i], t3);
pingys.add(strArr[0]);
} else {
pingys.add(Character.toString(chArr[i]));
}
}
return pingys;
} catch (BadHanyuPinyinOutputFormatCombination e1) {
e1.printStackTrace();
}
return pingys;
}
/**
* Gets Chinese initials
*
* @param cnStrring
* Target Chinese string
* @return
*/
public static List getPinyinHeadChar(String cnStrring) {
List pingys = new ArrayList<>(cnStrring.length());
for (int j = 0; j < cnStrring.length(); j++) {
char word = cnStrring.charAt(j);
String[] pinyArr = PinyinHelper.toHanyuPinyinStringArray(word);
if (pinyArr != null) {
pingys.add(pinyArr[0].charAt(0));
} else {
pingys.add(word);
}
}
return pingys;
}
}