All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.nlpcn.commons.lang.util.WordWeight Maven / Gradle / Ivy

package org.nlpcn.commons.lang.util;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * 计算词语的权重,词频统计等
 * 
 * @author ansj
 *
 */
public class WordWeight {

	private MapCount mc = new MapCount(); // 词频统计

	private HashMap> x2mat = new HashMap>();

	private MapCount x2mc = new MapCount();

	private Integer maxCount;

	private Integer recyclingCount;

	private double allFreq;

	public WordWeight() {
	};

	/**
	 * 新的个数 = maxCount - recyclingCount; recyclingCount< maxCount
	 * 
	 * @param maxCount
	 *            最大值,当超过这个值后进行回收
	 * @param recyclingCount
	 *            回收个数
	 */
	public WordWeight(Integer maxCount, Integer recyclingCount) {
		this.maxCount = maxCount;
		this.recyclingCount = recyclingCount;
	}

	public void add(String word) {
		add(word, 1);
	}

	public void add(String word, double weight) {
		allFreq += weight;
		mc.add(word, weight);
		if (maxCount != null && recyclingCount != null && mc.get().size() >= maxCount) {
			recycling();
		}
	}

	public void add(String word, String target) {
		add(word, target, 1);
	}

	public void add(String word, String target, double weight) {
		if (x2mat.containsKey(target)) {
			x2mat.get(target).add(word, weight);
		} else {
			x2mat.put(target, new MapCount());
			x2mat.get(target).add(word, weight);
		}
		x2mc.add(target, 1);
		add(word, weight);
	}

	/**
	 * 导出词频统计结果
	 * 
	 * @return
	 */
	public Map export() {
		Map result = new HashMap();
		result.putAll(mc.get());
		return result;
	}

	/**
	 * 导出IDF统计结果
	 * 
	 * @return
	 */
	public Map exportIDF() {

		Map result = new HashMap();

		for (Entry entry : mc.get().entrySet()) {
			result.put(entry.getKey(), Math.log(allFreq / entry.getValue()));
		}

		return result;
	}

	public HashMap> exportChiSquare() {

		HashMap> x2final = new HashMap>();

		double sum = allFreq;

		Double a, b, c, d;

		for (Entry> iter1 : x2mat.entrySet()) {
			String target = iter1.getKey();
			for (Entry iter2 : iter1.getValue().get().entrySet()) {
				String name = iter2.getKey();
				a = iter2.getValue();
				b = x2mc.get().get(target) - a;
				c = mc.get().get(name) - a;
				d = sum - b - c + a;
				Double x2stat = Math.pow(a * d - b * c, 2) / (a + c) / (b + d);
				if (x2final.get(target) != null) {
					x2final.get(target).add(name, x2stat);
				} else {
					x2final.put(target, new MapCount());
					x2final.get(target).add(name, x2stat);
				}
			}
		}

		return x2final;

	}

	/**
	 * 回收
	 */
	private void recycling() {
		List> list = CollectionUtil.sortMapByValue(mc.get(), -1);
		Set targetSet = x2mat.keySet();
		String word;
		for (int i = 0; i < recyclingCount; i++) {
			word = list.get(i).getKey();
			allFreq -= mc.get().remove(word); // 从全局中移除数字
			for (String target : targetSet) {
				Double r2 = x2mat.get(target).get().remove(word);
				if (r2 != null) {
					x2mc.add(target, -r2);
				}
			}
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy