All downloads are free. The search and download features use the official Maven repository.

org.wltea.analyzer.core.IKSegmenter Maven / Gradle / Ivy

The newest version!
package org.wltea.analyzer.core;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * IK分词器主类
 */
public final class IKSegmenter {

	//字符窜reader
	private Reader input;
	//分词器配置项
	private final Configuration cfg;
	//分词器上下文
	private AnalyzeContext context;
	//分词处理器列表
	private List segmenters;
	//分词歧义裁决器
	private IKArbitrator arbitrator;

	/**
	 * IK分词器构造函数
	 *
	 * @param input    input
	 * @param useSmart 为true,使用智能分词策略
	 *                 

* 非智能分词:细粒度输出所有可能的切分结果 * 智能分词: 合并数词和量词,对分词结果进行歧义判断 */ public IKSegmenter(Reader input, boolean useSmart) { this.input = input; this.cfg = DefaultConfig.getInstance(); this.cfg.setUseSmart(useSmart); this.init(); } /** * IK分词器构造函数 * * @param input input * @param cfg 使用自定义的Configuration构造分词器 */ public IKSegmenter(Reader input, Configuration cfg) { this.input = input; this.cfg = cfg; this.init(); } /** * 初始化 */ private void init() { //初始化词典单例 Dictionary.initial(this.cfg); //初始化分词上下文 this.context = new AnalyzeContext(this.cfg); //加载子分词器 this.segmenters = this.loadSegmenters(); //加载歧义裁决器 this.arbitrator = new IKArbitrator(); } /** * 初始化词典,加载子分词器实现 * * @return List */ private List loadSegmenters() { List segmenters = new ArrayList<>(4); //处理字母的子分词器 segmenters.add(new LetterSegmenter()); //处理中文数量词的子分词器 segmenters.add(new CN_QuantifierSegmenter()); //处理中文词的子分词器 segmenters.add(new CJKSegmenter()); return segmenters; } /** * 分词,获取下一个词元 * * @return Lexeme 词元对象 * @throws IOException IOException */ public synchronized Lexeme next() throws IOException { Lexeme l; while ((l = context.getNextLexeme()) == null) { /* * 从reader中读取数据,填充buffer * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 * 移位处理上次读入的但未处理的数据 */ int available = context.fillBuffer(this.input); if (available <= 0) { //reader已经读完 context.reset(); return null; } else { //初始化指针 context.initCursor(); do { //遍历子分词器 for (ISegmenter segmenter : segmenters) { segmenter.analyze(context); } //字符缓冲区接近读完,需要读入新的字符 if (context.needRefillBuffer()) { break; } //向前移动指针 } while (context.moveCursor()); //重置子分词器,为下轮循环进行初始化 segmenters.forEach(ISegmenter::reset); } //对分词进行歧义处理 this.arbitrator.process(context, this.cfg.useSmart()); //将分词结果输出到结果集,并处理未切分的单个CJK字符 context.outputToResult(); //记录本次分词的缓冲区位移 context.markBufferOffset(); } return l; } /** * 重置分词器到初始状态 * * @param input input */ public synchronized void reset(Reader input) { this.input = input; context.reset(); segmenters.forEach(ISegmenter::reset); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy