org.ansj.util.Counter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ansj_seg Show documentation
Show all versions of ansj_seg Show documentation
best java chinese word seg !
The newest version!
package org.ansj.util;
/**
* Created by Ansj on 01/11/2017.
*/
import org.ansj.domain.Term;
import org.ansj.exception.LibraryException;
import org.ansj.library.StopLibrary;
import org.ansj.recognition.Recognition;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.*;
import org.nlpcn.commons.lang.util.CollectionUtil;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 词频统计
*/
public class Counter {
private static final Log LOG = LogFactory.getLog(Counter.class);
public static int SIZE = 100000;
public static float FACTOR = 0.2F;
/**
* @param file 需要统计词频的文件。如果是文件夹。则此文件夹下递归目录的所有txt
* @param encoding 文件的编码
* @param analysis 分词器构造
* @param size 统计的最大数字。超过数字做搜索 size*20% -》 size
* @param recognitions 过滤条件
* @return 分词后的统计结果
*/
public static Result count(File file, String encoding, Analysis analysis, int size, Recognition... recognitions) {
if (recognitions == null || recognitions.length < 1) {
LOG.warn("Maybe you should add a `Recognition`");
}
if (encoding == null) {
LOG.warn("encoding is null! Then use UTF-8 encoding!");
encoding = IOUtil.UTF8;
}
LOG.info("start to count[file=" + file + ", encoding=" + encoding + ", analysis=" + analysis.getClass().getName() + ", size=" + size + ", recognitions=" + Arrays.toString(recognitions) + "]");
AtomicInteger ai = new AtomicInteger(0);
long t = System.currentTimeMillis();
Map hm = new HashMap();
if (!file.isHidden() && file.exists() && file.canRead()) {
try {
if (file.isDirectory()) {
// TODO: 只遍历当前目录下的所有文件
File[] files = file.listFiles();
if (files != null) {
for (File f : files) {
if (f.isFile() && !f.isHidden() && f.exists() && f.canRead()) {
visitFile(ai, hm, f, encoding, analysis, recognitions);
}
}
}
} else {
visitFile(ai, hm, file, encoding, analysis, recognitions);
}
} catch (Exception e) {
throw new LibraryException(e);
}
}
LOG.info("count " + ai.get() + " files, " + "took " + (System.currentTimeMillis() - t) + "ms");
// 按词频倒序排, 取前size个
List> items = CollectionUtil.sortMapByValue(hm, 1);
Result ret = new Result();
ret.map = new HashMap(size);
for (Map.Entry entry : items) {
if (ret.map.size() == size) {
break;
}
ret.map.put(entry.getKey(), entry.getValue());
}
return ret;
}
private static void visitFile(AtomicInteger ai, Map hm, File f, String encoding, Analysis analysis, Recognition... recognitions) throws IOException {
String name = f.getName().trim().toLowerCase();
if (name.endsWith(".txt") || name.endsWith(".dic")) {
try (BufferedReader br = IOUtil.getReader(f, encoding)) {
String line;
org.ansj.domain.Result result;
while ((line = br.readLine()) != null) {
result = analysis.parseStr(line);
if (recognitions != null) {
for (Recognition r : recognitions) {
if (r != null) {
result.recognition(r);
}
}
}
for (Term term : result) {
addTerm(term.getRealName(), hm);
}
}
}
ai.incrementAndGet();
if (LOG.isDebugEnabled()) {
LOG.debug("count file: " + f);
}
}
}
private static void addTerm(String term, Map termMap) {
if (termMap.containsKey(term)) {
termMap.put(term, termMap.get(term) + 1);
return;
}
// 超过容量
int capacity = (int) (SIZE * (1 + FACTOR));
if (capacity <= termMap.size()) {
LOG.info("map size over capacity[" + capacity + "], then start to remove low frequency terms");
List> items = CollectionUtil.sortMapByValue(termMap, 1);
for (Map.Entry item : items.subList(SIZE, items.size())) {
termMap.remove(item.getKey());
if (LOG.isDebugEnabled()) {
LOG.debug("term[`" + item.getKey() + "`-" + item.getValue() + "] removed");
}
}
}
//
termMap.put(term, 1);
}
/**
* @param file 需要统计词频的文件。如果是文件夹。则此文件夹下递归目录的所有txt
* @param analysis 分词器构造
* @param size 统计的最大数字。超过数字做搜索 size*20% -》 size
* @return 分词后的统计结果
*/
public static Result count(File file, Analysis analysis, int size) {
return count(file, null, analysis, size);
}
public static Result count(File file, String encoding, Analysis analysis, int size) {
return count(file, encoding, analysis, size, (Recognition) null);
}
/**
* @param file 需要统计词频的文件。如果是文件夹。则此文件夹下递归目录的所有txt
* @param analysis 分词器构造
* @return 分词后的统计结果
*/
public static Result count(File file, Analysis analysis) {
return count(file, null, analysis);
}
public static Result count(File file, String encoding, Analysis analysis) {
return count(file, encoding, analysis, SIZE, (Recognition) null);
}
/**
* @param file 需要统计词频的文件。如果是文件夹。则此文件夹下递归目录的所有txt
* @return 分词后的统计结果
*/
public static Result count(File file) {
return count(file, (String) null);
}
public static Result count(File file, String encoding) {
return count(file, encoding, new ToAnalysis(), SIZE, (Recognition) null);
}
public static class Result implements Iterable> {
public Map map;
public void writeToFile(String path, String encoding) throws IOException {
IOUtil.writeMap(map, path, encoding);
}
@Override
public Iterator> iterator() {
return map.entrySet().iterator();
}
}
public static void main(String[] args) throws IOException {
File f = null;
Analysis analysis = new ToAnalysis();
int size = SIZE;
StopRecognition sr = null;
String[] dim;
String encoding = null, output = null;
for (String arg : args) {
if (arg.startsWith("--") && arg.contains("=")) {
dim = arg.split("=");
if (dim.length >= 2) {
switch (dim[0]) {
case "--path":
f = new File(dim[1]);
break;
case "--encoding":
encoding = dim[1];
break;
case "--analysis":
switch (dim[1]) {
case "base":
case "base_ansj":
analysis = new BaseAnalysis();
break;
case "index":
case "index_ansj":
analysis = new IndexAnalysis();
break;
case "dic":
case "dic_ansj":
analysis = new DicAnalysis();
break;
case "nlp":
case "nlp_ansj":
analysis = new NlpAnalysis();
break;
default:
LOG.warn("invalid analysis[" + dim[1] + "], then use `ToAnalysis`");
break;
}
break;
case "--size":
size = Integer.parseInt(dim[1]);
break;
case "--stop":
sr = StopLibrary.get(dim[1]);
break;
case "--output":
output = dim[1];
break;
default:
LOG.warn("invalid argument[" + dim[0] + "], allowed arguments: --path, --encoding, --analysis, --size, --stop, --output");
break;
}
}
}
}
if (f == null) {
LOG.error("source path not found! You must add argument[--path]!");
System.exit(1);
}
if (output == null) {
LOG.warn("output file not found! You should add argument[--output]!");
}
Result result = count(f, encoding, analysis, size, sr);
if (output != null) {
result.writeToFile(output, IOUtil.UTF8);
LOG.info("results have been writen to file[" + output + "] in UTF-8 format");
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy