All Downloads are FREE. Search and download functionality uses the official Maven repository.

brickhouse.udf.sketch.SetSimilarityUDF Maven / Gradle / Ivy

package brickhouse.udf.sketch;

import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

import brickhouse.analytics.uniques.SketchSet;

/**
 * Compute the Jaccard similarity of two sketch sets.
 *
 * Jaccard Similarity is defined as the size of the intersection of two sets divided by the
 *   size of the union of the sets. Since sketches are only approximate measures, this
 *   calculation only makes sense when the sets are roughly the same size.
 *
 */
@Description(name="set_similarity",
value = "_FUNC_(a,b) - Compute the Jaccard set similarity of two sketch sets. "
)
public class SetSimilarityUDF extends UDF {

	/**
	 * Estimates the Jaccard similarity of the two given lists of items.
	 *
	 * Three sketches are built: one from {@code a}, one from {@code b}, and one from
	 * both (the union). The intersection size is derived by inclusion-exclusion from
	 * the three reach estimates and then divided by the union estimate.
	 *
	 * @param a items of the first set; may be null
	 * @param b items of the second set; may be null
	 * @return {@code null} if either argument is null, {@code 0.0} if either list is
	 *         empty, otherwise the estimated intersection size divided by the
	 *         estimated union size
	 */
	public Double evaluate( List<String> a, List<String> b) {
		if( a == null || b == null ) {
			return null;
		}
		if( a.isEmpty() || b.isEmpty() ) {
			return 0.0;
		}
		/// For now, assume min sketch size is 5000...
		/// otherwise it is better to use array_intersect
		/// XXX TODO convert to GenericUDF, so that it can be passed in
		///  as an argument
		// Size every sketch to the larger input, but never below the default.
		int sketchSize = Math.max( a.size() , b.size() );
		if( sketchSize < SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE) {
			sketchSize = SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE;
		}

		SketchSet sketchA = new SketchSet(sketchSize);
		SketchSet sketchB = new SketchSet(sketchSize);
		SketchSet sketchAUB = new SketchSet(sketchSize);

		// Every item goes into its own sketch and into the shared union sketch.
		for(String aStr : a) {
			sketchA.addItem( aStr);
			sketchAUB.addItem( aStr);
		}
		for(String bStr : b) {
			sketchB.addItem( bStr);
			sketchAUB.addItem( bStr);
		}

		double aEst = sketchA.estimateReach();
		double bEst = sketchB.estimateReach();
		double aubEst = sketchAUB.estimateReach();

		/// Intersection is |A| + |B| - |A u B| (inclusion-exclusion)
		// NOTE(review): because these are estimates, the result could presumably fall
		// slightly outside [0, 1] -- confirm whether callers expect it to be clamped.
		double ainterb =  aEst + bEst - aubEst;
		double sim = ainterb/aubEst;

		return sim;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy