org.apdplat.word.util.Utils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of word Show documentation
Show all versions of word Show documentation
word分词是一个Java实现的中文分词组件,提供了多种基于词典的分词算法,并利用ngram模型来消除歧义。
能准确识别英文、数字,以及日期、时间等数量词,能识别人名、地名、组织机构名等未登录词。
同时提供了Lucene、Solr、ElasticSearch插件。
/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, [email protected]
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.word.util;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
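/**
* General-purpose static helpers: Chinese-character checks on strings,
* recursive directory deletion, and sorting of map entries by numeric value.
* @author 杨尚川
*/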
public class Utils {
    // One or more characters, all in the CJK range U+4E00..U+9FA5.
    // No ^/$ anchors are needed: callers use Matcher.matches(), which already
    // requires the whole input to match.
    private static final Pattern PATTERN_ONE = Pattern.compile("[\\u4e00-\\u9fa5]+");
    // Two or more characters, all in the CJK range U+4E00..U+9FA5.
    private static final Pattern PATTERN_TWO = Pattern.compile("[\\u4e00-\\u9fa5]{2,}");

    /**
     * Tells whether the given string is non-empty and consists only of Chinese
     * characters (code points U+4E00 to U+9FA5).
     *
     * @param word the string to test; may be {@code null}
     * @return {@code true} if every character of {@code word} is in the range and
     *         there is at least one; {@code false} otherwise, including for
     *         {@code null}
     */
    public static boolean isChineseCharAndLengthAtLeastOne(String word) {
        // matches() instead of find(): with find(), "$" also matches just before a
        // trailing line terminator, so an input ending in "\n" would be accepted.
        return word != null && PATTERN_ONE.matcher(word).matches();
    }

    /**
     * Tells whether the given string has at least two characters and consists only
     * of Chinese characters (code points U+4E00 to U+9FA5).
     *
     * @param word the string to test; may be {@code null}
     * @return {@code true} if every character of {@code word} is in the range and
     *         there are at least two; {@code false} otherwise, including for
     *         {@code null}
     */
    public static boolean isChineseCharAndLengthAtLeastTwo(String word) {
        return word != null && PATTERN_TWO.matcher(word).matches();
    }

    /**
     * Deletes a file, or a directory together with everything beneath it.
     * <p>
     * Deletion stops at the first entry that cannot be removed, so a {@code false}
     * result may leave the tree partially deleted. Directories are detected with
     * {@link File#isDirectory()}, so a symbolic link to a directory is descended
     * into like a regular directory.
     *
     * @param dir the file or directory to delete
     * @return {@code true} if {@code dir} and all of its descendants were deleted;
     *         {@code false} if any deletion failed or {@code dir} does not exist
     */
    public static boolean deleteDir(File dir) {
        if (dir.isDirectory()) {
            // listFiles() returns null when the directory cannot be read; in that
            // case skip the recursion and let delete() below report the failure.
            File[] children = dir.listFiles();
            if (children != null) {
                for (File child : children) {
                    if (!deleteDir(child)) {
                        return false;
                    }
                }
            }
        }
        return dir.delete();
    }

    /**
     * Returns the entries of a map ordered by value, largest value first.
     * <p>
     * The returned list is a new {@link ArrayList}; the map itself is not
     * modified. The list holds the entry objects obtained from
     * {@link Map#entrySet()}, not copies of them. Entries with equal values keep
     * the relative order in which the map's entry set iterates them.
     * <p>
     * Values of type {@code Integer}, {@code Long}, {@code Short} and {@code Byte}
     * are compared exactly as {@code long}s. Any other combination of
     * {@link Number} types is compared through {@link Number#doubleValue()}, which
     * may lose precision for very large values. {@code null} values are placed
     * after all non-null values.
     *
     * @param <K> the key type
     * @param <V> the value type
     * @param map the map whose entries are to be sorted; must not be {@code null}
     * @return the map's entries sorted by value in descending order
     */
    public static <K, V extends Number> List<Map.Entry<K, V>> getSortedMapByValue(Map<K, V> map) {
        List<Map.Entry<K, V>> list = new ArrayList<>(map.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                return compareDescending(o1.getValue(), o2.getValue());
            }
        });
        return list;
    }

    /** Compares two numbers so that the larger one sorts first; nulls sort last. */
    private static int compareDescending(Number left, Number right) {
        if (left == null) {
            return right == null ? 0 : 1;
        }
        if (right == null) {
            return -1;
        }
        if (isIntegral(left) && isIntegral(right)) {
            // Long.compare rather than subtraction: a difference can overflow int
            // (or be truncated when a long difference is cast to int).
            return Long.compare(right.longValue(), left.longValue());
        }
        // Double.compare defines a total order (NaN included), which keeps the
        // comparator consistent as Collections.sort requires.
        return Double.compare(right.doubleValue(), left.doubleValue());
    }

    /** Tells whether the number is a boxed integral primitive that fits in a long. */
    private static boolean isIntegral(Number value) {
        return value instanceof Integer
                || value instanceof Long
                || value instanceof Short
                || value instanceof Byte;
    }
}