All downloads are free. Search and download functionalities use the official Maven repository.

com.hankcs.hanlp.seg.Segment Maven / Gradle / Ivy

There is a newer version: portable-1.8.5
Show newest version
/*
 * 
 * He Han
 * [email protected]
 * 2014/10/29 14:53
 *
 * 
 * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
 * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
 * 
 */
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.SentencesUtil;
import com.hankcs.hanlp.utility.TextUtility;

import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
 * 分词器(分词服务)
* 是所有分词器的基类(Abstract)
* 分词器的分词方法是线程安全的,但配置方法则不保证 * * @author hankcs */ public abstract class Segment { /** * 分词器配置 */ protected Config config; /** * 构造一个分词器 */ public Segment() { config = new Config(); } /** * 原子分词 * * @param charArray * @param start 从start开始(包含) * @param end 到end结束(不包含end) * @return 一个列表,代表从start到from的所有字构成的原子节点 */ protected static List atomSegment(char[] charArray, int start, int end) { List atomSegment = new ArrayList(); int pCur = start, nCurType, nNextType; StringBuilder sb = new StringBuilder(); char c; int[] charTypeArray = new int[end - start]; // 生成对应单个汉字的字符类型数组 for (int i = 0; i < charTypeArray.length; ++i) { c = charArray[i + start]; charTypeArray[i] = CharType.get(c); if (c == '.' && i + start < (charArray.length - 1) && CharType.get(charArray[i + start + 1]) == CharType.CT_NUM) charTypeArray[i] = CharType.CT_NUM; else if (c == '.' && i + start < (charArray.length - 1) && charArray[i + start + 1] >= '0' && charArray[i + start + 1] <= '9') charTypeArray[i] = CharType.CT_SINGLE; else if (charTypeArray[i] == CharType.CT_LETTER) charTypeArray[i] = CharType.CT_SINGLE; } // 根据字符类型数组中的内容完成原子切割 while (pCur < end) { nCurType = charTypeArray[pCur - start]; if (nCurType == CharType.CT_CHINESE || nCurType == CharType.CT_INDEX || nCurType == CharType.CT_DELIMITER || nCurType == CharType.CT_OTHER) { String single = String.valueOf(charArray[pCur]); if (single.length() != 0) atomSegment.add(new AtomNode(single, nCurType)); pCur++; } //如果是字符、数字或者后面跟随了数字的小数点“.”则一直取下去。 else if (pCur < end - 1 && ((nCurType == CharType.CT_SINGLE) || nCurType == CharType.CT_NUM)) { sb.delete(0, sb.length()); sb.append(charArray[pCur]); boolean reachEnd = true; while (pCur < end - 1) { nNextType = charTypeArray[++pCur - start]; if (nNextType == nCurType) sb.append(charArray[pCur]); else { reachEnd = false; break; } } atomSegment.add(new AtomNode(sb.toString(), nCurType)); if (reachEnd) pCur++; } // 对于所有其它情况 else { atomSegment.add(new AtomNode(charArray[pCur], nCurType)); pCur++; } } return 
atomSegment; } /** * 简易原子分词,将所有字放到一起作为一个词 * * @param charArray * @param start * @param end * @return */ protected static List simpleAtomSegment(char[] charArray, int start, int end) { List atomNodeList = new LinkedList(); atomNodeList.add(new AtomNode(new String(charArray, start, end - start), CharType.CT_LETTER)); return atomNodeList; } /** * 快速原子分词,希望用这个方法替换掉原来缓慢的方法 * * @param charArray * @param start * @param end * @return */ protected static List quickAtomSegment(char[] charArray, int start, int end) { List atomNodeList = new LinkedList(); int offsetAtom = start; int preType = CharType.get(charArray[offsetAtom]); int curType; while (++offsetAtom < end) { curType = CharType.get(charArray[offsetAtom]); if (curType != preType) { // 浮点数识别 if ((charArray[offsetAtom] == '.' || charArray[offsetAtom] == '.') && preType == CharType.CT_NUM) { if (offsetAtom+1 < end) { int nextType = CharType.get(charArray[offsetAtom+1]); if (nextType == CharType.CT_NUM) { continue; } } } atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); start = offsetAtom; } preType = curType; } if (offsetAtom == end) atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); return atomNodeList; } /** * 使用用户词典合并粗分结果 * @param vertexList 粗分结果 * @return 合并后的结果 */ protected static List combineByCustomDictionary(List vertexList) { Vertex[] wordNet = new Vertex[vertexList.size()]; vertexList.toArray(wordNet); // DAT合并 DoubleArrayTrie dat = CustomDictionary.dat; for (int i = 0; i < wordNet.length; ++i) { int state = 1; state = dat.transition(wordNet[i].realWord, state); if (state > 0) { int to = i + 1; int end = to; CoreDictionary.Attribute value = dat.output(state); for (; to < wordNet.length; ++to) { state = dat.transition(wordNet[to].realWord, state); if (state < 0) break; CoreDictionary.Attribute output = dat.output(state); if (output != null) { value = output; end = to + 1; } } if (value != null) { combineWords(wordNet, i, end, 
value); i = end - 1; } } } // BinTrie合并 if (CustomDictionary.trie != null) { for (int i = 0; i < wordNet.length; ++i) { if (wordNet[i] == null) continue; BaseNode state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0); if (state != null) { int to = i + 1; int end = to; CoreDictionary.Attribute value = state.getValue(); for (; to < wordNet.length; ++to) { if (wordNet[to] == null) continue; state = state.transition(wordNet[to].realWord.toCharArray(), 0); if (state == null) break; if (state.getValue() != null) { value = state.getValue(); end = to + 1; } } if (value != null) { combineWords(wordNet, i, end, value); i = end - 1; } } } } vertexList.clear(); for (Vertex vertex : wordNet) { if (vertex != null) vertexList.add(vertex); } return vertexList; } /** * 使用用户词典合并粗分结果,并将用户词语收集到全词图中 * @param vertexList 粗分结果 * @param wordNetAll 收集用户词语到全词图中 * @return 合并后的结果 */ protected static List combineByCustomDictionary(List vertexList, final WordNet wordNetAll) { List outputList = combineByCustomDictionary(vertexList); int line = 0; for (final Vertex vertex : outputList) { final int parentLength = vertex.realWord.length(); final int currentLine = line; if (parentLength >= 3) { CustomDictionary.parseText(vertex.realWord, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) { if (end - begin == parentLength) return; wordNetAll.add(currentLine + begin, new Vertex(vertex.realWord.substring(begin, end), value)); } }); } line += parentLength; } return outputList; } /** * 将连续的词语合并为一个 * @param wordNet 词图 * @param start 起始下标(包含) * @param end 结束下标(不包含) * @param value 新的属性 */ private static void combineWords(Vertex[] wordNet, int start, int end, CoreDictionary.Attribute value) { if (start + 1 == end) // 小优化,如果只有一个词,那就不需要合并,直接应用新属性 { wordNet[start].attribute = value; } else { StringBuilder sbTerm = new StringBuilder(); for (int j = start; j < end; ++j) { if (wordNet[j] == null) continue; String realWord = 
wordNet[j].realWord; sbTerm.append(realWord); wordNet[j] = null; } wordNet[start] = new Vertex(sbTerm.toString(), value); } } /** * 合并数字 * @param termList */ protected void mergeNumberQuantifier(List termList, WordNet wordNetAll, Config config) { if (termList.size() < 4) return; StringBuilder sbQuantifier = new StringBuilder(); ListIterator iterator = termList.listIterator(); iterator.next(); int line = 1; while (iterator.hasNext()) { Vertex pre = iterator.next(); if (pre.hasNature(Nature.m)) { sbQuantifier.append(pre.realWord); Vertex cur = null; while (iterator.hasNext() && (cur = iterator.next()).hasNature(Nature.m)) { sbQuantifier.append(cur.realWord); iterator.remove(); removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length()); } if (cur != null) { if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt))) { if (config.indexMode) { wordNetAll.add(line, new Vertex(sbQuantifier.toString(), new CoreDictionary.Attribute(Nature.m))); } sbQuantifier.append(cur.realWord); iterator.remove(); removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length()); } else { line += cur.realWord.length(); // (cur = iterator.next()).hasNature(Nature.m) 最后一个next可能不含q词性 } } if (sbQuantifier.length() != pre.realWord.length()) { for (Vertex vertex : wordNetAll.get(line + pre.realWord.length())) { vertex.from = null; } pre.realWord = sbQuantifier.toString(); pre.word = Predefine.TAG_NUMBER; pre.attribute = new CoreDictionary.Attribute(Nature.mq); pre.wordID = CoreDictionary.M_WORD_ID; sbQuantifier.setLength(0); } } sbQuantifier.setLength(0); line += pre.realWord.length(); } // System.out.println(wordNetAll); } /** * 将一个词语从词网中彻底抹除 * @param cur 词语 * @param wordNetAll 词网 * @param line 当前扫描的行数 * @param length 当前缓冲区的长度 */ private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length) { LinkedList[] vertexes = wordNetAll.getVertexes(); // 将其从wordNet中删除 for (Vertex vertex : vertexes[line + length]) { if (vertex.from == cur) 
vertex.from = null; } ListIterator iterator = vertexes[line + length - cur.realWord.length()].listIterator(); while (iterator.hasNext()) { Vertex vertex = iterator.next(); if (vertex == cur) iterator.remove(); } } /** * 分词
* 此方法是线程安全的 * * @param text 待分词文本 * @return 单词列表 */ public List seg(String text) { char[] charArray = text.toCharArray(); if (HanLP.Config.Normalization) { CharTable.normalization(charArray); } if (config.threadNumber > 1 && charArray.length > 10000) // 小文本多线程没意义,反而变慢了 { List sentenceList = SentencesUtil.toSentenceList(charArray); String[] sentenceArray = new String[sentenceList.size()]; sentenceList.toArray(sentenceArray); //noinspection unchecked List[] termListArray = new List[sentenceArray.length]; final int per = sentenceArray.length / config.threadNumber; WorkThread[] threadArray = new WorkThread[config.threadNumber]; for (int i = 0; i < config.threadNumber - 1; ++i) { int from = i * per; threadArray[i] = new WorkThread(sentenceArray, termListArray, from, from + per); threadArray[i].start(); } threadArray[config.threadNumber - 1] = new WorkThread(sentenceArray, termListArray, (config.threadNumber - 1) * per, sentenceArray.length); threadArray[config.threadNumber - 1].start(); try { for (WorkThread thread : threadArray) { thread.join(); } } catch (InterruptedException e) { logger.severe("线程同步异常:" + TextUtility.exceptionToString(e)); return Collections.emptyList(); } List termList = new LinkedList(); if (config.offset || config.indexMode) // 由于分割了句子,所以需要重新校正offset { int sentenceOffset = 0; for (int i = 0; i < sentenceArray.length; ++i) { for (Term term : termListArray[i]) { term.offset += sentenceOffset; termList.add(term); } sentenceOffset += sentenceArray[i].length(); } } else { for (List list : termListArray) { termList.addAll(list); } } return termList; } // if (text.length() > 10000) // 针对大文本,先拆成句子,后分词,避免内存峰值太大 // { // List termList = new LinkedList(); // if (config.offset || config.indexMode) // { // int sentenceOffset = 0; // for (String sentence : SentencesUtil.toSentenceList(charArray)) // { // List termOfSentence = segSentence(sentence.toCharArray()); // for (Term term : termOfSentence) // { // term.offset += sentenceOffset; // termList.add(term); // 
} // sentenceOffset += sentence.length(); // } // } // else // { // for (String sentence : SentencesUtil.toSentenceList(charArray)) // { // termList.addAll(segSentence(sentence.toCharArray())); // } // } // // return termList; // } return segSentence(charArray); } /** * 分词 * * @param text 待分词文本 * @return 单词列表 */ public List seg(char[] text) { assert text != null; if (HanLP.Config.Normalization) { CharTable.normalization(text); } return segSentence(text); } /** * 分词断句 输出句子形式 * * @param text 待分词句子 * @return 句子列表,每个句子由一个单词列表组成 */ public List> seg2sentence(String text) { List> resultList = new LinkedList>(); { for (String sentence : SentencesUtil.toSentenceList(text)) { resultList.add(segSentence(sentence.toCharArray())); } } return resultList; } /** * 给一个句子分词 * * @param sentence 待分词句子 * @return 单词列表 */ protected abstract List segSentence(char[] sentence); /** * 设为索引模式 * * @return */ public Segment enableIndexMode(boolean enable) { config.indexMode = enable; return this; } /** * 开启词性标注 * * @param enable * @return */ public Segment enablePartOfSpeechTagging(boolean enable) { config.speechTagging = enable; return this; } /** * 开启人名识别 * * @param enable * @return */ public Segment enableNameRecognize(boolean enable) { config.nameRecognize = enable; config.updateNerConfig(); return this; } /** * 开启地名识别 * * @param enable * @return */ public Segment enablePlaceRecognize(boolean enable) { config.placeRecognize = enable; config.updateNerConfig(); return this; } /** * 开启机构名识别 * * @param enable * @return */ public Segment enableOrganizationRecognize(boolean enable) { config.organizationRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用用户词典 * * @param enable */ public Segment enableCustomDictionary(boolean enable) { config.useCustomDictionary = enable; return this; } /** * 是否尽可能强制使用用户词典(使用户词典的优先级尽可能高)
* 警告:具体实现由各子类决定,可能会破坏分词器的统计特性(例如,如果用户词典 * 含有“和服”,则“商品和服务”的分词结果可能会被用户词典的高优先级影响)。 * @param enable * @return 分词器本身 * * @since 1.3.5 */ public Segment enableCustomDictionaryForcing(boolean enable) { if (enable) { enableCustomDictionary(true); } config.forceCustomDictionary = enable; return this; } /** * 是否启用音译人名识别 * * @param enable */ public Segment enableTranslatedNameRecognize(boolean enable) { config.translatedNameRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用日本人名识别 * * @param enable */ public Segment enableJapaneseNameRecognize(boolean enable) { config.japaneseNameRecognize = enable; config.updateNerConfig(); return this; } /** * 是否启用偏移量计算(开启后Term.offset才会被计算) * * @param enable * @return */ public Segment enableOffset(boolean enable) { config.offset = enable; return this; } /** * 是否启用数词和数量词识别
* 即[二, 十, 一] => [二十一],[十, 九, 元] => [十九元] * @param enable * @return */ public Segment enableNumberQuantifierRecognize(boolean enable) { config.numberQuantifierRecognize = enable; return this; } /** * 是否启用所有的命名实体识别 * * @param enable * @return */ public Segment enableAllNamedEntityRecognize(boolean enable) { config.nameRecognize = enable; config.japaneseNameRecognize = enable; config.translatedNameRecognize = enable; config.placeRecognize = enable; config.organizationRecognize = enable; config.updateNerConfig(); return this; } class WorkThread extends Thread { String[] sentenceArray; List[] termListArray; int from; int to; public WorkThread(String[] sentenceArray, List[] termListArray, int from, int to) { this.sentenceArray = sentenceArray; this.termListArray = termListArray; this.from = from; this.to = to; } @Override public void run() { for (int i = from; i < to; ++i) { termListArray[i] = segSentence(sentenceArray[i].toCharArray()); } } } /** * 开启多线程 * @param enable true表示开启4个线程,false表示单线程 * @return */ public Segment enableMultithreading(boolean enable) { if (enable) config.threadNumber = 4; else config.threadNumber = 1; return this; } /** * 开启多线程 * @param threadNumber 线程数量 * @return */ public Segment enableMultithreading(int threadNumber) { config.threadNumber = threadNumber; return this; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy