// All downloads are free. The search and download functionality uses the official Maven repository.

// org.rx.util.pinyin.Engine (Maven / Gradle / Ivy)

package org.rx.util.pinyin;


import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;

import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;

/**
 * String-to-pinyin conversion engine with support for word dictionaries and a
 * {@link SegmentationSelector}.
 *
 * Created by guyacong on 2016/12/23.
 */

final class Engine {

    /** Shared comparator used to order dictionary hits before they are consumed. */
    static final EmitComparator EMIT_COMPARATOR = new EmitComparator();

    private Engine() {
        //no instance
    }

    /**
     * Converts {@code inputStr} to pinyin using the dictionaries and selector held by
     * {@code config}.
     *
     * @param inputStr  text to convert; {@code null} or empty input is returned unchanged
     * @param config    source of the pinyin dictionaries and the segmentation selector
     * @param separator string inserted between consecutive pinyin tokens
     * @return the pinyin representation of {@code inputStr}
     */
    static String toPinyin(final String inputStr, Pinyin.Config config, String separator) {
        // Read the dictionaries once so the trie and the lookup list are derived from the same list.
        List<PinyinDict> configuredDicts = config.getPinyinDicts();
        List<PinyinDict> pinyinDicts = Collections.unmodifiableList(configuredDicts);
        Trie trie = Utils.dictsToTrie(configuredDicts);
        SegmentationSelector selector = config.getSelector();

        return toPinyin(inputStr, trie, pinyinDicts, separator, selector);
    }

    /**
     * Converts {@code inputStr} to pinyin.
     *
     * <p>When either {@code trie} or {@code selector} is {@code null}, every character is
     * converted on its own via {@code Pinyin.toPinyin(char)}. Otherwise the trie is used to find
     * dictionary words in the input, the selector picks which hits to use, and each selected word
     * is replaced by the upper-cased pinyin supplied by the dictionaries; characters not covered
     * by a selected hit are converted individually.
     *
     * @param inputStr       text to convert; {@code null} or empty input is returned unchanged
     * @param trie           trie used to locate dictionary words, may be {@code null}
     * @param pinyinDictList dictionaries consulted for the pinyin of each matched word
     * @param separator      string inserted between consecutive pinyin tokens
     * @param selector       chooses which trie hits are used, may be {@code null}
     * @return the pinyin representation of {@code inputStr}
     * @throws IllegalArgumentException if a selected word is not contained in any dictionary
     */
    static String toPinyin(final String inputStr, final Trie trie, final List<PinyinDict> pinyinDictList,
            final String separator, final SegmentationSelector selector) {
        if (inputStr == null || inputStr.length() == 0) {
            return inputStr;
        }

        final int length = inputStr.length();
        // Method-local buffer: no synchronization needed, so StringBuilder is sufficient.
        final StringBuilder resultPinyinStrBuf = new StringBuilder();

        if (trie == null || selector == null) {
            // No dictionary or no selector available: convert character by character.
            for (int i = 0; i < length; i++) {
                resultPinyinStrBuf.append(Pinyin.toPinyin(inputStr.charAt(i)));
                if (i != length - 1) {
                    resultPinyinStrBuf.append(separator);
                }
            }
            return resultPinyinStrBuf.toString();
        }

        List<Emit> selectedEmits = selector.select(trie.parseText(inputStr));

        // Sorted by start offset so the hits can be consumed with a single forward cursor.
        Collections.sort(selectedEmits, EMIT_COMPARATOR);

        int nextHitIndex = 0;

        for (int i = 0; i < length;) {
            // Drop hits that begin before the current position (overlapping or duplicate-start
            // hits). Without this the cursor would stall on such a hit and every later hit
            // would be ignored.
            while (nextHitIndex < selectedEmits.size()
                    && selectedEmits.get(nextHitIndex).getStart() < i) {
                nextHitIndex++;
            }

            // First check whether a hit begins at character i.
            if (nextHitIndex < selectedEmits.size() && i == selectedEmits.get(nextHitIndex).getStart()) {
                // A hit begins at character i: emit the dictionary pinyin for the whole word.
                Emit hit = selectedEmits.get(nextHitIndex);
                String[] fromDicts = pinyinFromDict(hit.getKeyword(), pinyinDictList);
                for (int j = 0; j < fromDicts.length; j++) {
                    // Locale.ROOT keeps the result independent of the default locale
                    // (e.g. Turkish would otherwise upper-case "i" to a dotted capital I).
                    resultPinyinStrBuf.append(fromDicts[j].toUpperCase(Locale.ROOT));
                    if (j != fromDicts.length - 1) {
                        resultPinyinStrBuf.append(separator);
                    }
                }

                // Continue after the matched word.
                i = i + hit.size();
                nextHitIndex++;
            } else {
                // No hit here: convert character i on its own.
                resultPinyinStrBuf.append(Pinyin.toPinyin(inputStr.charAt(i)));
                i++;
            }

            // Separate this token from the next one unless the input is exhausted.
            if (i != length) {
                resultPinyinStrBuf.append(separator);
            }
        }

        return resultPinyinStrBuf.toString();
    }

    /**
     * Looks up the pinyin of {@code wordInDict} in the first dictionary whose word set contains it.
     *
     * @param wordInDict    word to look up
     * @param pinyinDictSet dictionaries to search in order; {@code null} entries and entries
     *                      whose {@code words()} is {@code null} are skipped
     * @return the pinyin array returned by the first dictionary that contains the word
     * @throws IllegalArgumentException if {@code pinyinDictSet} is {@code null} or no dictionary
     *                                  contains the word
     */
    static String[] pinyinFromDict(String wordInDict, List<PinyinDict> pinyinDictSet) {
        if (pinyinDictSet != null) {
            for (PinyinDict dict : pinyinDictSet) {
                if (dict != null && dict.words() != null
                        && dict.words().contains(wordInDict)) {
                    return dict.toPinyin(wordInDict);
                }
            }
        }
        throw new IllegalArgumentException("No pinyin dict contains word: " + wordInDict);
    }

    /**
     * Orders hits by ascending start offset; among hits with the same start, the longer hit
     * comes first.
     */
    static final class EmitComparator implements Comparator<Emit> {

        @Override
        public int compare(Emit o1, Emit o2) {
            if (o1.getStart() == o2.getStart()) {
                // Same start: the longer hit sorts first (descending size).
                return Integer.compare(o2.size(), o1.size());
            }
            // Otherwise the hit with the smaller start sorts first.
            return Integer.compare(o1.getStart(), o2.getStart());
        }
    }

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy