org.apdplat.word.segmentation.SegmentationContrast Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of word Show documentation
Show all versions of word Show documentation
word分词是一个Java实现的中文分词组件,提供了多种基于词典的分词算法,并利用ngram模型来消除歧义。
能准确识别英文、数字,以及日期、时间等数量词,能识别人名、地名、组织机构名等未登录词。
同时提供了Lucene、Solr、ElasticSearch插件。
The newest version!
/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, [email protected]
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*
*/
package org.apdplat.word.segmentation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
/**
* 对比各种分词算法的分词结果
* @author 杨尚川
*/
public class SegmentationContrast {
public static Map seg(String text) {
Map results = new HashMap<>();
for(SegmentationAlgorithm segmentationAlgorithm : SegmentationAlgorithm.values()){
String result = SegmentationFactory.getSegmentation(segmentationAlgorithm).seg(text).toString();
results.put(segmentationAlgorithm.getDes(), result);
}
return results;
}
public static void dump(Map map){
System.out.println("***************************************************");
System.out.println("切分效果对比:");
System.out.println("***************************************************");
map.keySet().stream().sorted().forEach(sa -> System.out.println(sa + " : " + map.get(sa)));
System.out.println("***************************************************");
}
public static void run(String encoding) {
try(BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, encoding))){
String line = null;
while((line = reader.readLine()) != null){
if("exit".equals(line)){
System.exit(0);
return;
}
if(line.trim().equals("")){
continue;
}
dump(seg(line));
showUsage();
}
} catch (IOException ex) {
ex.printStackTrace();
}
}
public static void showUsage(){
System.out.println("输入exit退出程序");
System.out.println("输入要分词的文本后回车确认:");
}
public static void main(String[] args) {
dump(seg("独立自主和平等互利的原则"));
String encoding = "utf-8";
if(args==null || args.length == 0){
showUsage();
run(encoding);
}else if(Charset.isSupported(args[0])){
showUsage();
run(args[0]);
}
}
}