All downloads are free. The search and download features use the official Maven repository.

org.nlpcn.commons.lang.finger.SimHashService Maven / Gradle / Ivy

package org.nlpcn.commons.lang.finger;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.util.MurmurHash;

/**
 * SimHash fingerprinting service.
 *
 * <p>
 * Produces a 64-bit fingerprint for a piece of text by hashing every token
 * returned by {@link #analysis(String)} and letting the tokens vote on each
 * bit. Two fingerprints are compared by their Hamming distance. The inner
 * {@link Index} class stores fingerprints in buckets keyed by their 8-bit
 * segments so that near neighbours can be looked up without scanning every
 * stored fingerprint.
 */
public class SimHashService extends AbsService {

	/** Number of bits in a fingerprint (the name is historical; this is a bit count). */
	private static final int BYTE_LEN = 64;

	/** Width in bits of one index segment. */
	private static final int SEGMENT_BITS = 8;

	/** Number of distinct values one segment can take (256). */
	private static final int SEGMENT_SIZE = 1 << SEGMENT_BITS;

	/** Mask selecting the low {@link #SEGMENT_BITS} bits of a value. */
	private static final long SEGMENT_MASK = SEGMENT_SIZE - 1;

	/** Number of segments a 64-bit fingerprint is split into (8). */
	private static final int SEGMENT_COUNT = BYTE_LEN / SEGMENT_BITS;

	/** {@code BITS[i]} has only bit {@code i} set; {@code BITS[63]} is the sign bit. */
	private static final long[] BITS = new long[BYTE_LEN];

	static {
		for (int i = 0; i < BITS.length; i++) {
			BITS[i] = 1L << i;
		}
	}

	/**
	 * Computes the Hamming distance between two fingerprints.
	 *
	 * @param a
	 *            first fingerprint
	 * @param b
	 *            second fingerprint
	 * @return the number of bit positions (0..64) in which {@code a} and
	 *         {@code b} differ
	 */
	public int hmDistance(long a, long b) {
		// XOR leaves a 1 in every position where the inputs differ.
		return Long.bitCount(a ^ b);
	}

	/**
	 * Computes the Hamming distance between the fingerprints of two texts.
	 *
	 * @param c1
	 *            first text
	 * @param c2
	 *            second text
	 * @return the Hamming distance between {@code fingerprint(c1)} and
	 *         {@code fingerprint(c2)}
	 */
	public int hmDistance(String c1, String c2) {
		return hmDistance(fingerprint(c1), fingerprint(c2));
	}

	/**
	 * Computes the SimHash fingerprint of a text.
	 *
	 * <p>
	 * Every token from {@link #analysis(String)} is hashed with
	 * {@link #hash(String)}. For each of the 64 bit positions a token votes +1
	 * when its hash has that bit set and -1 otherwise. A bit of the result is
	 * set only when its vote total is strictly positive, so ties and texts that
	 * yield no tokens produce 0 bits.
	 *
	 * @param content
	 *            the text to fingerprint
	 * @return the 64-bit fingerprint
	 */
	public long fingerprint(String content) {
		int[] votes = new int[BYTE_LEN];

		for (String word : analysis(content)) {
			long hashCode = hash(word);
			for (int i = 0; i < BYTE_LEN; i++) {
				if ((hashCode & BITS[i]) != 0) {
					votes[i]++;
				} else {
					votes[i]--;
				}
			}
		}

		long result = 0;
		for (int i = 0; i < BYTE_LEN; i++) {
			if (votes[i] > 0) {
				result |= BITS[i];
			}
		}
		return result;
	}

	/**
	 * Splits the text into tokens. Override this method to plug in a different
	 * tokenizer.
	 *
	 * <p>
	 * The default implementation asks {@code forest} (a field not declared in
	 * this class; presumably inherited from {@code AbsService}) for a
	 * {@code GetWord} and collects every value returned by
	 * {@code getFrontWords()} until it returns {@code null}.
	 *
	 * @param content
	 *            the text to tokenize
	 * @return the tokens in the order they were produced; never {@code null}
	 */
	public List<String> analysis(String content) {
		GetWord word = forest.getWord(content);

		List<String> all = new ArrayList<String>();

		String temp;
		while ((temp = word.getFrontWords()) != null) {
			all.add(temp);
		}

		return all;
	}

	/**
	 * Hashes a single token to 64 bits. The default implementation delegates to
	 * {@code MurmurHash.hash64}; override this method to use another hash.
	 *
	 * @param word
	 *            the token to hash
	 * @return the 64-bit hash of {@code word}
	 */
	public long hash(String word) {
		return MurmurHash.hash64(word);
	}

	/**
	 * Creates a new, empty fingerprint index bound to this service.
	 *
	 * @return a new {@link Index}
	 */
	public Index createIndex() {
		return new Index();
	}

	/**
	 * In-memory index of fingerprints.
	 *
	 * <p>
	 * A fingerprint is split into eight 8-bit segments. Segment {@code i} with
	 * value {@code v} maps to bucket {@code i * 256 + v}, and the fingerprint is
	 * stored in all eight of its buckets. A lookup only examines fingerprints
	 * that share at least one segment with the query. Eight segments cannot all
	 * differ when at most seven bits differ, so every stored fingerprint within
	 * Hamming distance 7 of the query is always examined.
	 */
	public class Index {

		/** Buckets, one per (segment position, segment value) pair; created lazily. */
		@SuppressWarnings("unchecked")
		List<Long>[] lists = new List[SEGMENT_COUNT * SEGMENT_SIZE];

		private Index() {
		}

		/**
		 * Adds a fingerprint to the index.
		 *
		 * @param hash
		 *            the fingerprint to store
		 */
		public void addHashCode(long hash) {
			for (int idx : makeCodeIndex(hash)) {
				if (lists[idx] == null) {
					lists[idx] = new ArrayList<Long>();
				}
				lists[idx].add(hash);
			}
		}

		/**
		 * Computes the eight bucket indexes of a fingerprint.
		 *
		 * @param hashCode
		 *            the fingerprint
		 * @return an array whose element {@code i} is
		 *         {@code ((hashCode >>> 8*i) & 0xFF) + 256*i}
		 */
		private int[] makeCodeIndex(long hashCode) {
			int[] indexes = new int[SEGMENT_COUNT];
			for (int i = 0; i < SEGMENT_COUNT; i++) {
				// Mask first, then add the offset. '+' binds tighter than '&',
				// so the parentheses around the mask operation are required.
				int segment = (int) ((hashCode >>> (i * SEGMENT_BITS)) & SEGMENT_MASK);
				indexes[i] = segment + i * SEGMENT_SIZE;
			}
			return indexes;
		}

		/**
		 * Fingerprints a text and adds the fingerprint to the index.
		 *
		 * @param content
		 *            the text to index
		 */
		public void add(String content) {
			addHashCode(fingerprint(content));
		}

		/**
		 * Returns the smallest Hamming distance between the query and the
		 * stored fingerprints that share at least one 8-bit segment with it.
		 *
		 * @param hashCode
		 *            the query fingerprint
		 * @return the smallest distance found among the candidates, or 64 when
		 *         no stored fingerprint shares a segment with the query
		 */
		public int nearest(long hashCode) {
			// A fingerprint may sit in several of the query's buckets; the set
			// makes sure each candidate is compared only once.
			Set<Long> candidates = new HashSet<Long>();
			for (int idx : makeCodeIndex(hashCode)) {
				List<Long> list = lists[idx];
				if (list != null) {
					candidates.addAll(list);
				}
			}

			int hmDistance = BYTE_LEN;
			for (long candidate : candidates) {
				hmDistance = Math.min(hmDistance(hashCode, candidate), hmDistance);
				if (hmDistance == 0) {
					// Exact match; nothing can be closer.
					return hmDistance;
				}
			}

			return hmDistance;
		}

		/**
		 * Fingerprints a text and returns the result of {@link #nearest(long)}
		 * for that fingerprint.
		 *
		 * @param content
		 *            the query text
		 * @return the smallest distance found, or 64 when there is no candidate
		 */
		public int nearest(String content) {
			return nearest(fingerprint(content));
		}

		/**
		 * Looks up the nearest distance for a fingerprint and then adds the
		 * fingerprint to the index, unless the lookup found an exact match
		 * (distance 0).
		 *
		 * @param hashCode
		 *            the fingerprint to look up and store
		 * @return the distance reported by {@link #nearest(long)} before the
		 *         fingerprint was added
		 */
		public int nearestAndAdd(long hashCode) {
			int hmDistance = nearest(hashCode);
			if (hmDistance > 0) {
				addHashCode(hashCode);
			}
			return hmDistance;
		}

		/**
		 * Fingerprints a text, then behaves like {@link #nearestAndAdd(long)}.
		 *
		 * @param content
		 *            the text to look up and store
		 * @return the distance reported before the fingerprint was added
		 */
		public int nearestAndAdd(String content) {
			return nearestAndAdd(fingerprint(content));
		}

		/**
		 * Collects every distinct fingerprint stored in the index.
		 *
		 * @return a new set containing all stored fingerprints
		 */
		public Set<Long> allHashCode() {
			Set<Long> hs = new HashSet<Long>();

			for (List<Long> list : lists) {
				if (list != null) {
					hs.addAll(list);
				}
			}

			return hs;
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy