All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.wl4g.infra.common.nlp.PingyUtils Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2017 ~ 2025 the original author or authors. James Wong 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.wl4g.infra.common.nlp;

import java.util.ArrayList;
import java.util.List;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Utilities for converting Chinese text to its pinyin romanization.
 * 
 * @author James Wong <[email protected]>
 * @version 2017-03-29
 * @since v2.0.0
 */
public abstract class PingyUtils {

    /** First character (inclusive, U+4E00) of the range treated as Chinese. */
    private static final char CHINESE_RANGE_START = '\u4E00';

    /** Last character (inclusive, U+9FA5) of the range treated as Chinese. */
    private static final char CHINESE_RANGE_END = '\u9FA5';

    /**
     * Converts each character of the given string to its pinyin spelling.
     * <p>
     * Characters in the range U+4E00..U+9FA5 are replaced by the first pinyin
     * reading reported by pinyin4j (lower case, without tone, using 'v' for
     * u-umlaut). Every other character, and any character in that range for
     * which pinyin4j reports no reading, is copied through unchanged as a
     * one-character string.
     * 
     * @param cnStrring
     *            Target Chinese string, may be {@code null}
     * @return a mutable list of {@link String}, one element per input
     *         character, in input order; empty when the input is
     *         {@code null} or empty
     * @throws IllegalStateException
     *             if pinyin4j rejects the output format configured here
     */
    @SuppressWarnings("rawtypes") // raw return type kept so existing callers keep compiling
    public static List getPingyin(String cnStrring) {
        if (cnStrring == null) {
            return new ArrayList<String>(0);
        }

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);

        List<String> pingys = new ArrayList<>(cnStrring.length());
        try {
            for (int i = 0; i < cnStrring.length(); i++) {
                char ch = cnStrring.charAt(i);
                String pinyin = null;
                // Only characters in the Chinese range are looked up.
                if (isChinese(ch)) {
                    pinyin = firstOrNull(PinyinHelper.toHanyuPinyinStringArray(ch, format));
                }
                // No reading available: keep the original character rather
                // than failing on a missing lookup result.
                pingys.add(pinyin != null ? pinyin : Character.toString(ch));
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // The format above is fixed, so this indicates a programming
            // error; surface it instead of returning a half-built list.
            throw new IllegalStateException("Invalid pinyin output format combination", e);
        }

        return pingys;
    }

    /**
     * Gets the first letter of the pinyin spelling of each character.
     * <p>
     * For every character that pinyin4j reports a reading for, the first
     * character of the first reading is added; any other character is added
     * unchanged.
     * 
     * @param cnStrring
     *            Target Chinese string, may be {@code null}
     * @return a mutable list of {@link Character}, one element per input
     *         character, in input order; empty when the input is
     *         {@code null} or empty
     */
    @SuppressWarnings("rawtypes") // raw return type kept so existing callers keep compiling
    public static List getPinyinHeadChar(String cnStrring) {
        if (cnStrring == null) {
            return new ArrayList<Character>(0);
        }

        List<Character> pingys = new ArrayList<>(cnStrring.length());
        for (int j = 0; j < cnStrring.length(); j++) {
            char word = cnStrring.charAt(j);
            String pinyin = firstOrNull(PinyinHelper.toHanyuPinyinStringArray(word));
            // Fall back to the character itself when there is no usable reading.
            pingys.add(pinyin != null && !pinyin.isEmpty() ? pinyin.charAt(0) : word);
        }

        return pingys;
    }

    /** Tells whether the character lies in the range U+4E00..U+9FA5. */
    private static boolean isChinese(char ch) {
        return ch >= CHINESE_RANGE_START && ch <= CHINESE_RANGE_END;
    }

    /** Returns the first element of the array, or null if the array is null or empty. */
    private static String firstOrNull(String[] values) {
        return (values == null || values.length == 0) ? null : values[0];
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy