// com.hankcs.hanlp.dictionary.ns.PlaceDictionary (Maven / Gradle / Ivy artifact source listing)
/*
*
* He Han
* [email protected]
* 2014/9/10 14:47
*
*
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
*
*/
package com.hankcs.hanlp.dictionary.ns;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.corpus.dictionary.item.EnumItem;
import com.hankcs.hanlp.corpus.tag.NS;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.TransformMatrixDictionary;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.Predefine;
import java.util.*;
import static com.hankcs.hanlp.utility.Predefine.logger;
/**
* 地名识别用的词典,实际上是对两个词典的包装
*
* @author hankcs
*/
/**
 * Dictionary used for place-name recognition. In practice it wraps two dictionaries
 * (the place-name dictionary and its transition matrix) together with an
 * Aho-Corasick trie holding the role-tag patterns that denote a place name.
 * <p>
 * All state is loaded once in the static initializer.
 *
 * @author hankcs
 */
public class PlaceDictionary
{
    /**
     * Place-name dictionary.
     */
    public static NSDictionary dictionary;
    /**
     * Transition matrix dictionary.
     */
    public static TransformMatrixDictionary<NS> transformMatrixDictionary;
    /**
     * Trie used by the Aho-Corasick algorithm; it maps each role-tag pattern to itself.
     */
    public static AhoCorasickDoubleArrayTrie<String> trie;
    /**
     * ID of the word this dictionary focuses on (the place tag in the core dictionary).
     */
    static final int WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE);
    /**
     * Attribute of the word this dictionary focuses on.
     */
    static final CoreDictionary.Attribute ATTRIBUTE = CoreDictionary.get(WORD_ID);

    static
    {
        long start = System.currentTimeMillis();
        dictionary = new NSDictionary();
        dictionary.load(HanLP.Config.PlaceDictionaryPath);
        // NOTE(review): the message below is logged unconditionally; whatever load()
        // reports about success or failure is not inspected here.
        logger.info(HanLP.Config.PlaceDictionaryPath + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms");
        transformMatrixDictionary = new TransformMatrixDictionary<NS>(NS.class);
        transformMatrixDictionary.load(HanLP.Config.PlaceDictionaryTrPath);
        trie = new AhoCorasickDoubleArrayTrie<String>();
        // Role-tag sequences that are treated as a place name; key and value are identical
        // so that a hit reports the matched pattern itself.
        TreeMap<String, String> patternMap = new TreeMap<String, String>();
        patternMap.put("CH", "CH");
        patternMap.put("CDH", "CDH");
        patternMap.put("CDEH", "CDEH");
        patternMap.put("GH", "GH");
        trie.build(patternMap);
    }

    /**
     * Pattern matching: concatenates the role tags into one string, searches it for the
     * known place-name patterns, and inserts a merged place vertex for every match that
     * is not a registered bad case.
     *
     * @param nsList         the determined role-tag sequence
     * @param vertexList     the original sequence of vertices, without role tags
     * @param wordNetOptimum the word net to be optimized; recognized places are inserted into it
     * @param wordNetAll     the full word net, passed through to {@code wordNetOptimum.insert}
     */
    public static void parsePattern(List<NS> nsList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll)
    {
        StringBuilder sbPattern = new StringBuilder(nsList.size());
        for (NS ns : nsList)
        {
            sbPattern.append(ns.toString());
        }
        String pattern = sbPattern.toString();
        // The hit positions are used directly as indices into wordArray.
        // Assumes every NS tag renders as exactly one character so that pattern
        // positions line up with vertex indices -- TODO confirm against the NS enum.
        final Vertex[] wordArray = vertexList.toArray(new Vertex[0]);
        trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<String>()
        {
            @Override
            public void hit(int begin, int end, String value)
            {
                // Join the surface forms of the matched vertices into the candidate name.
                StringBuilder sbName = new StringBuilder();
                for (int i = begin; i < end; ++i)
                {
                    sbName.append(wordArray[i].realWord);
                }
                String name = sbName.toString();
                // Skip candidates that the dictionary marks as bad cases.
                if (isBadCase(name)) return;
                // From here on the candidate is accepted as a place name.
                if (HanLP.Config.DEBUG)
                {
                    System.out.printf("识别出地名:%s %s\n", name, value);
                }
                // The insertion offset is the total length of all words before the match.
                int offset = 0;
                for (int i = 0; i < begin; ++i)
                {
                    offset += wordArray[i].realWord.length();
                }
                wordNetOptimum.insert(offset, new Vertex(Predefine.TAG_PLACE, name, ATTRIBUTE, WORD_ID), wordNetAll);
            }
        });
    }

    /**
     * No algorithm solves 100% of the cases, so some bad cases always remain; they are
     * added to the dictionary as ordinary entries (the original note gives the form
     * "盖公章 A 1"). This method reports whether the given name is such a bad case,
     * which the code decides by checking for the {@code NS.Z} label.
     *
     * @param name the candidate place name
     * @return {@code true} if the dictionary has an entry for {@code name} carrying the
     *         {@code NS.Z} label; {@code false} if there is no entry or no such label
     */
    static boolean isBadCase(String name)
    {
        EnumItem<NS> nsEnumItem = dictionary.get(name);
        if (nsEnumItem == null) return false;
        return nsEnumItem.containsLabel(NS.Z);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy