All downloads are free. Search and download functionality uses the official Maven repository.

com.hankcs.hanlp.dictionary.ns.PlaceDictionary Maven / Gradle / Ivy

There is a newer version: portable-1.8.5
Show newest version
/*
 * 
 * He Han
 * [email protected]
 * 2014/9/10 14:47
 *
 * 
 * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
 * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
 * 
 */
package com.hankcs.hanlp.dictionary.ns;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.corpus.dictionary.item.EnumItem;
import com.hankcs.hanlp.corpus.tag.NS;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.TransformMatrixDictionary;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.Predefine;

import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
 * Dictionary used for place-name recognition; in practice a wrapper around two dictionaries
 * (the place-name dictionary and its transition-matrix dictionary) plus a small pattern trie.
 *
 * @author hankcs
 */
public class PlaceDictionary
{
    /**
     * Place-name dictionary.
     */
    public static NSDictionary dictionary;
    /**
     * Transition-matrix dictionary.
     */
    public static TransformMatrixDictionary<NS> transformMatrixDictionary;
    /**
     * Trie used by the Aho-Corasick algorithm to match role-tag patterns.
     */
    public static AhoCorasickDoubleArrayTrie<String> trie;

    /**
     * ID of the word this dictionary focuses on (the core-dictionary entry for {@link Predefine#TAG_PLACE}).
     */
    static final int WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE);
    /**
     * Attribute of the word this dictionary focuses on.
     */
    static final CoreDictionary.Attribute ATTRIBUTE = CoreDictionary.get(WORD_ID);

    static
    {
        long start = System.currentTimeMillis();
        dictionary = new NSDictionary();
        dictionary.load(HanLP.Config.PlaceDictionaryPath);
        logger.info(HanLP.Config.PlaceDictionaryPath + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms");
        transformMatrixDictionary = new TransformMatrixDictionary<NS>(NS.class);
        transformMatrixDictionary.load(HanLP.Config.PlaceDictionaryTrPath);
        trie = new AhoCorasickDoubleArrayTrie<String>();
        // Each key is a sequence of role tags that is accepted as a place name; the value mirrors the key.
        TreeMap<String, String> patternMap = new TreeMap<String, String>();
        patternMap.put("CH", "CH");
        patternMap.put("CDH", "CDH");
        patternMap.put("CDEH", "CDEH");
        patternMap.put("GH", "GH");
        trie.build(patternMap);
    }

    /**
     * Pattern matching: concatenates the role tags into a string, searches it for the known
     * place-name patterns and inserts a merged place-name vertex into the graph for every hit
     * that is not a recorded bad case.
     *
     * @param nsList         the determined role-tag sequence
     * @param vertexList     the original sequence without role tags; presumably aligned
     *                       one-to-one with {@code nsList} (verify against caller)
     * @param wordNetOptimum the graph to be optimized; recognized place names are inserted here
     * @param wordNetAll     passed through to {@code WordNet.insert} unchanged
     */
    public static void parsePattern(List<NS> nsList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll)
    {
        StringBuilder sbPattern = new StringBuilder(nsList.size());
        for (NS ns : nsList)
        {
            sbPattern.append(ns.toString());
        }
        // NOTE(review): hit offsets are used directly as vertex indices below, which assumes
        // every NS label renders as exactly one character - confirm against the NS enum.
        String pattern = sbPattern.toString();
        final Vertex[] wordArray = vertexList.toArray(new Vertex[0]);
        trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<String>()
        {
            @Override
            public void hit(int begin, int end, String value)
            {
                // Join the surface forms of the matched vertices into the candidate name.
                StringBuilder sbName = new StringBuilder();
                for (int i = begin; i < end; ++i)
                {
                    sbName.append(wordArray[i].realWord);
                }
                String name = sbName.toString();
                // Skip candidates that the dictionary marks as bad cases.
                if (isBadCase(name))
                {
                    return;
                }

                // From here on the candidate is accepted as a place name.
                if (HanLP.Config.DEBUG)
                {
                    System.out.printf("识别出地名:%s %s\n", name, value);
                }
                // Character offset of the match = total length of all words before it.
                int offset = 0;
                for (int i = 0; i < begin; ++i)
                {
                    offset += wordArray[i].realWord.length();
                }
                wordNetOptimum.insert(offset, new Vertex(Predefine.TAG_PLACE, name, ATTRIBUTE, WORD_ID), wordNetAll);
            }
        });
    }

    /**
     * Tells whether a candidate name is a known bad case. No algorithm solves 100% of the
     * problems, so known false positives are recorded in the dictionary; here an entry counts
     * as a bad case when it carries the {@code NS.Z} label.
     *
     * @param name the candidate place name
     * @return {@code true} if the dictionary contains {@code name} with the {@code NS.Z} label,
     *         {@code false} if the name is absent or lacks that label
     */
    static boolean isBadCase(String name)
    {
        EnumItem<NS> nrEnumItem = dictionary.get(name);
        if (nrEnumItem == null)
        {
            return false;
        }
        return nrEnumItem.containsLabel(NS.Z);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy