All downloads are free. Search and download functionalities use the official Maven repository.

com.hankcs.hanlp.seg.Segment Maven / Gradle / Ivy

There is a newer version: portable-1.8.5
Show newest version
/*
 * 
 * He Han
 * [email protected]
 * 2014/10/29 14:53
 *
 * 
 * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
 * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
 * 
 */
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.SentencesUtil;
import com.hankcs.hanlp.utility.TextUtility;

import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
 * 分词器(分词服务)
* 是所有分词器的基类(Abstract)
* 分词器的分词方法是线程安全的,但配置方法则不保证 * * @author hankcs */ public abstract class Segment { /** * 分词器配置 */ protected Config config; /** * 构造一个分词器 */ public Segment() { config = new Config(); } /** * 原子分词 * * @param charArray * @param start 从start开始(包含) * @param end 到end结束(不包含end) * @return 一个列表,代表从start到from的所有字构成的原子节点 */ protected static List atomSegment(char[] charArray, int start, int end) { List atomSegment = new ArrayList(); int pCur = start, nCurType, nNextType; StringBuilder sb = new StringBuilder(); char c; int[] charTypeArray = new int[end - start]; // 生成对应单个汉字的字符类型数组 for (int i = 0; i < charTypeArray.length; ++i) { c = charArray[i + start]; charTypeArray[i] = CharType.get(c); if (c == '.' && i + start < (charArray.length - 1) && CharType.get(charArray[i + start + 1]) == CharType.CT_NUM) charTypeArray[i] = CharType.CT_NUM; else if (c == '.' && i + start < (charArray.length - 1) && charArray[i + start + 1] >= '0' && charArray[i + start + 1] <= '9') charTypeArray[i] = CharType.CT_SINGLE; else if (charTypeArray[i] == CharType.CT_LETTER) charTypeArray[i] = CharType.CT_SINGLE; } // 根据字符类型数组中的内容完成原子切割 while (pCur < end) { nCurType = charTypeArray[pCur - start]; if (nCurType == CharType.CT_CHINESE || nCurType == CharType.CT_INDEX || nCurType == CharType.CT_DELIMITER || nCurType == CharType.CT_OTHER) { String single = String.valueOf(charArray[pCur]); if (single.length() != 0) atomSegment.add(new AtomNode(single, nCurType)); pCur++; } //如果是字符、数字或者后面跟随了数字的小数点“.”则一直取下去。 else if (pCur < end - 1 && ((nCurType == CharType.CT_SINGLE) || nCurType == CharType.CT_NUM)) { sb.delete(0, sb.length()); sb.append(charArray[pCur]); boolean reachEnd = true; while (pCur < end - 1) { nNextType = charTypeArray[++pCur - start]; if (nNextType == nCurType) sb.append(charArray[pCur]); else { reachEnd = false; break; } } atomSegment.add(new AtomNode(sb.toString(), nCurType)); if (reachEnd) pCur++; } // 对于所有其它情况 else { atomSegment.add(new AtomNode(charArray[pCur], nCurType)); pCur++; } } return 
atomSegment; } /** * 简易原子分词,将所有字放到一起作为一个词 * * @param charArray * @param start * @param end * @return */ protected static List simpleAtomSegment(char[] charArray, int start, int end) { List atomNodeList = new LinkedList(); atomNodeList.add(new AtomNode(new String(charArray, start, end - start), CharType.CT_LETTER)); return atomNodeList; } /** * 快速原子分词,希望用这个方法替换掉原来缓慢的方法 * * @param charArray * @param start * @param end * @return */ protected static List quickAtomSegment(char[] charArray, int start, int end) { List atomNodeList = new LinkedList(); int offsetAtom = start; int preType = CharType.get(charArray[offsetAtom]); int curType; while (++offsetAtom < end) { curType = CharType.get(charArray[offsetAtom]); if (curType != preType) { // 浮点数识别 if ((charArray[offsetAtom] == '.' || charArray[offsetAtom] == '.') && preType == CharType.CT_NUM) { if (offsetAtom+1 < end) { int nextType = CharType.get(charArray[offsetAtom+1]); if (nextType == CharType.CT_NUM) { continue; } } } atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); start = offsetAtom; } preType = curType; } if (offsetAtom == end) atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); return atomNodeList; } /** * 使用用户词典合并粗分结果 * @param vertexList 粗分结果 * @return 合并后的结果 */ protected static List combineByCustomDictionary(List vertexList) { Vertex[] wordNet = new Vertex[vertexList.size()]; vertexList.toArray(wordNet); // DAT合并 DoubleArrayTrie dat = CustomDictionary.dat; for (int i = 0; i < wordNet.length; ++i) { int state = 1; state = dat.transition(wordNet[i].realWord, state); if (state > 0) { int to = i + 1; int end = to; CoreDictionary.Attribute value = dat.output(state); for (; to < wordNet.length; ++to) { state = dat.transition(wordNet[to].realWord, state); if (state < 0) break; CoreDictionary.Attribute output = dat.output(state); if (output != null) { value = output; end = to + 1; } } if (value != null) { combineWords(wordNet, i, end, 
value); i = end - 1; } } } // BinTrie合并 if (CustomDictionary.trie != null) { for (int i = 0; i < wordNet.length; ++i) { if (wordNet[i] == null) continue; BaseNode state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0); if (state != null) { int to = i + 1; int end = to; CoreDictionary.Attribute value = state.getValue(); for (; to < wordNet.length; ++to) { if (wordNet[to] == null) continue; state = state.transition(wordNet[to].realWord.toCharArray(), 0); if (state == null) break; if (state.getValue() != null) { value = state.getValue(); end = to + 1; } } if (value != null) { combineWords(wordNet, i, end, value); i = end - 1; } } } } vertexList.clear(); for (Vertex vertex : wordNet) { if (vertex != null) vertexList.add(vertex); } return vertexList; } /** * 使用用户词典合并粗分结果,并将用户词语收集到全词图中 * @param vertexList 粗分结果 * @param wordNetAll 收集用户词语到全词图中 * @return 合并后的结果 */ protected static List combineByCustomDictionary(List vertexList, final WordNet wordNetAll) { List outputList = combineByCustomDictionary(vertexList); int line = 0; for (final Vertex vertex : outputList) { final int parentLength = vertex.realWord.length(); final int currentLine = line; if (parentLength >= 3) { CustomDictionary.parseText(vertex.realWord, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) { if (end - begin == parentLength) return; wordNetAll.add(currentLine + begin, new Vertex(vertex.realWord.substring(begin, end), value)); } }); } line += parentLength; } return outputList; } /** * 将连续的词语合并为一个 * @param wordNet 词图 * @param start 起始下标(包含) * @param end 结束下标(不包含) * @param value 新的属性 */ private static void combineWords(Vertex[] wordNet, int start, int end, CoreDictionary.Attribute value) { if (start + 1 == end) // 小优化,如果只有一个词,那就不需要合并,直接应用新属性 { wordNet[start].attribute = value; } else { StringBuilder sbTerm = new StringBuilder(); for (int j = start; j < end; ++j) { if (wordNet[j] == null) continue; String realWord = 
wordNet[j].realWord; sbTerm.append(realWord); wordNet[j] = null; } wordNet[start] = new Vertex(sbTerm.toString(), value); } } /** * 合并数字 * @param termList */ protected void mergeNumberQuantifier(List termList, WordNet wordNetAll, Config config) { if (termList.size() < 4) return; StringBuilder sbQuantifier = new StringBuilder(); ListIterator iterator = termList.listIterator(); iterator.next(); int line = 1; while (iterator.hasNext()) { Vertex pre = iterator.next(); if (pre.hasNature(Nature.m)) { sbQuantifier.append(pre.realWord); Vertex cur = null; while (iterator.hasNext() && (cur = iterator.next()).hasNature(Nature.m)) { sbQuantifier.append(cur.realWord); iterator.remove(); removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length()); } if (cur != null) { if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt))) { if (config.indexMode) { wordNetAll.add(line, new Vertex(sbQuantifier.toString(), new CoreDictionary.Attribute(Nature.m))); } sbQuantifier.append(cur.realWord); iterator.remove(); removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length()); } else { line += cur.realWord.length(); // (cur = iterator.next()).hasNature(Nature.m) 最后一个next可能不含q词性 } } if (sbQuantifier.length() != pre.realWord.length()) { for (Vertex vertex : wordNetAll.get(line + pre.realWord.length())) { vertex.from = null; } pre.realWord = sbQuantifier.toString(); pre.word = Predefine.TAG_NUMBER; pre.attribute = new CoreDictionary.Attribute(Nature.mq); pre.wordID = CoreDictionary.M_WORD_ID; sbQuantifier.setLength(0); } } sbQuantifier.setLength(0); line += pre.realWord.length(); } // System.out.println(wordNetAll); } /** * 将一个词语从词网中彻底抹除 * @param cur 词语 * @param wordNetAll 词网 * @param line 当前扫描的行数 * @param length 当前缓冲区的长度 */ private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length) { LinkedList[] vertexes = wordNetAll.getVertexes(); // 将其从wordNet中删除 for (Vertex vertex : vertexes[line + length]) { if (vertex.from == cur) 
vertex.from = null; } ListIterator iterator = vertexes[line + length - cur.realWord.length()].listIterator(); while (iterator.hasNext()) { Vertex vertex = iterator.next(); if (vertex == cur) iterator.remove(); } } /** * 分词
* 此方法是线程安全的 * * @param text 待分词文本 * @return 单词列表 */ public List seg(String text) { char[] charArray = text.toCharArray(); if (HanLP.Config.Normalization) { CharTable.normalization(charArray); } if (config.threadNumber > 1 && charArray.length > 10000) // 小文本多线程没意义,反而变慢了 { List sentenceList = SentencesUtil.toSentenceList(charArray); String[] sentenceArray = new String[sentenceList.size()]; sentenceList.toArray(sentenceArray); //noinspection unchecked List[] termListArray = new List[sentenceArray.length]; final int per = sentenceArray.length / config.threadNumber; WorkThread[] threadArray = new WorkThread[config.threadNumber]; for (int i = 0; i < config.threadNumber - 1; ++i) { int from = i * per; threadArray[i] = new WorkThread(sentenceArray, termListArray, from, from + per); threadArray[i].start(); } threadArray[config.threadNumber - 1] = new WorkThread(sentenceArray, termListArray, (config.threadNumber - 1) * per, sentenceArray.length); threadArray[config.threadNumber - 1].start(); try { for (WorkThread thread : threadArray) { thread.join(); } } catch (InterruptedException e) { logger.severe("线程同步异常:" + TextUtility.exceptionToString(e)); return Collections.emptyList(); } List termList = new LinkedList(); if (config.offset || config.indexMode) // 由于分割了句子,所以需要重新校正offset { int sentenceOffset = 0; for (int i = 0; i < sentenceArray.length; ++i) { for (Term term : termListArray[i]) { term.offset += sentenceOffset; termList.add(term); } sentenceOffset += sentenceArray[i].length(); } } else { for (List list : termListArray) { termList.addAll(list); } } return termList; } // if (text.length() > 10000) // 针对大文本,先拆成句子,后分词,避免内存峰值太大 // { // List termList = new LinkedList(); // if (config.offset || config.indexMode) // { // int sentenceOffset = 0; // for (String sentence : SentencesUtil.toSentenceList(charArray)) // { // List termOfSentence = segSentence(sentence.toCharArray()); // for (Term term : termOfSentence) // { // term.offset += sentenceOffset; // termList.add(term); // 
} // sentenceOffset += sentence.length(); // } // } // else // { // for (String sentence : SentencesUtil.toSentenceList(charArray)) // { // termList.addAll(segSentence(sentence.toCharArray())); // } // } // // return termList; // } return segSentence(charArray); } /** * 分词 * * @param text 待分词文本 * @return 单词列表 */ public List seg(char[] text) { assert text != null; if (HanLP.Config.Normalization) { CharTable.normalization(text); } return segSentence(text); } /** * 分词断句 输出句子形式 * * @param text 待分词句子 * @return 句子列表,每个句子由一个单词列表组成 */ public List> seg2sentence(String text) { List> resultList = new LinkedList>(); { for (String sentence : SentencesUtil.toSentenceList(text)) { resultList.add(segSentence(sentence.toCharArray())); } } return resultList; } /** * 给一个句子分词 * * @param sentence 待分词句子 * @return 单词列表 */ protected abstract List segSentence(char[] sentence); /** * 设为索引模式 * * @return */ public Segment enableIndexMode(boolean enable) { config.indexMode = enable; return this; } /** * 开启词性标注 * * @param enable * @return */ public Segment enablePartOfSpeechTagging(boolean enable) { config.speechTagging = enable; return this; } /** * 开启人名识别 * * @param enable * @return */ public Segment enableNameRecognize(boolean enable) { config.nameRecognize = enable; config.updateNerConfig(); return this; } /** * 开启地名识别 * * @param enable * @return */ public Segment enablePlaceRecognize(boolean enable) { config.placeRecognize = enable; config.updateNerConfig(); return this; } /** * 开启机构名识别 * * @param enable * @return */ public Segment enableOrganizationRecognize(boolean enable) { config.organizationRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用用户词典 * * @param enable */ public Segment enableCustomDictionary(boolean enable) { config.useCustomDictionary = enable; return this; } /** * 是否尽可能强制使用用户词典(使用户词典的优先级尽可能高)
* 警告:具体实现由各子类决定,可能会破坏分词器的统计特性(例如,如果用户词典 * 含有“和服”,则“商品和服务”的分词结果可能会被用户词典的高优先级影响)。 * @param enable * @return 分词器本身 * * @since 1.3.5 */ public Segment enableCustomDictionaryForcing(boolean enable) { if (enable) { enableCustomDictionary(true); } config.forceCustomDictionary = enable; return this; } /** * 是否启用音译人名识别 * * @param enable */ public Segment enableTranslatedNameRecognize(boolean enable) { config.translatedNameRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用日本人名识别 * * @param enable */ public Segment enableJapaneseNameRecognize(boolean enable) { config.japaneseNameRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用偏移量计算(开启后Term.offset才会被计算) * * @param enable * @return */ public Segment enableOffset(boolean enable) { config.offset = enable; return this; } /** * 是否启用数词和数量词识别
* 即[二, 十, 一] => [二十一],[十, 九, 元] => [十九元] * @param enable * @return */ public Segment enableNumberQuantifierRecognize(boolean enable) { config.numberQuantifierRecognize = enable; return this; } /** * 是否启用所有的命名实体识别 * * @param enable * @return */ public Segment enableAllNamedEntityRecognize(boolean enable) { config.nameRecognize = enable; config.japaneseNameRecognize = enable; config.translatedNameRecognize = enable; config.placeRecognize = enable; config.organizationRecognize = enable; config.updateNerConfig(); return this; } class WorkThread extends Thread { String[] sentenceArray; List[] termListArray; int from; int to; public WorkThread(String[] sentenceArray, List[] termListArray, int from, int to) { this.sentenceArray = sentenceArray; this.termListArray = termListArray; this.from = from; this.to = to; } @Override public void run() { for (int i = from; i < to; ++i) { termListArray[i] = segSentence(sentenceArray[i].toCharArray()); } } } /** * 开启多线程 * @param enable true表示开启4个线程,false表示单线程 * @return */ public Segment enableMultithreading(boolean enable) { if (enable) config.threadNumber = 4; else config.threadNumber = 1; return this; } /** * 开启多线程 * @param threadNumber 线程数量 * @return */ public Segment enableMultithreading(int threadNumber) { config.threadNumber = threadNumber; return this; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy