// com.dahuatech.hutool.core.text.Simhash (Maven / Gradle / Ivy)
package com.dahuatech.hutool.core.text;
import com.dahuatech.hutool.core.lang.MurmurHash;
import java.math.BigInteger;
import java.util.*;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
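/**
* Simhash is a locality-sensitive hash used to deduplicate very large volumes of text.
* The algorithm implementation is taken from: https://github.com/xlturing/Simhash4J
*
* <p>Definition of a locality-sensitive hash: if two strings are similar to some degree and that
* similarity is still preserved after hashing, the hash is called locality-sensitive.
*
* @author Looly, litaoxiao
* @since 4.3.3
*/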
public class Simhash {
private final int bitNum = 64;
/** Number of storage segments; by default the simhash is stored in 4 segments. */
private final int fracCount;
private final int fracBitNum;
/** Hamming-distance threshold; a distance below this value means the texts are considered similar. */
private final int hammingThresh;
/** Simhashes are stored by segment, which makes lookups faster. */
private List