org.apache.mahout.math.stats.LogLikelihood Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mahout-math Show documentation
Show all versions of mahout-math Show documentation
High performance scientific and technical computing data structures and methods,
mostly based on CERN's
Colt Java API
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.stats;
import com.google.common.base.Preconditions;
import com.google.common.collect.Multiset;
import com.google.common.collect.Ordering;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
/**
* Utility methods for working with log-likelihood
*/
public final class LogLikelihood {
private LogLikelihood() {
}
/**
* Calculates the unnormalized Shannon entropy. This is
*
* -sum x_i log x_i / N = -N sum x_i/N log x_i/N
*
* where N = sum x_i
*
* If the x's sum to 1, then this is the same as the normal
* expression. Leaving this un-normalized makes working with
* counts and computing the LLR easier.
*
* @return The entropy value for the elements
*/
public static double entropy(long... elements) {
long sum = 0;
double result = 0.0;
for (long element : elements) {
Preconditions.checkArgument(element >= 0);
result += xLogX(element);
sum += element;
}
return xLogX(sum) - result;
}
private static double xLogX(long x) {
return x == 0 ? 0.0 : x * Math.log(x);
}
/**
* Merely an optimization for the common two argument case of {@link #entropy(long...)}
* @see #logLikelihoodRatio(long, long, long, long)
*/
private static double entropy(long a, long b) {
return xLogX(a + b) - xLogX(a) - xLogX(b);
}
/**
* Merely an optimization for the common four argument case of {@link #entropy(long...)}
* @see #logLikelihoodRatio(long, long, long, long)
*/
private static double entropy(long a, long b, long c, long d) {
return xLogX(a + b + c + d) - xLogX(a) - xLogX(b) - xLogX(c) - xLogX(d);
}
/**
* Calculates the Raw Log-likelihood ratio for two events, call them A and B. Then we have:
*
*
* Event A Everything but A
* Event B A and B together (k_11) B, but not A (k_12)
* Everything but B A without B (k_21) Neither A nor B (k_22)
*
*
* @param k11 The number of times the two events occurred together
* @param k12 The number of times the second event occurred WITHOUT the first event
* @param k21 The number of times the first event occurred WITHOUT the second event
* @param k22 The number of times something else occurred (i.e. was neither of these events
* @return The raw log-likelihood ratio
*
*
* Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
*/
public static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
Preconditions.checkArgument(k11 >= 0 && k12 >= 0 && k21 >= 0 && k22 >= 0);
// note that we have counts here, not probabilities, and that the entropy is not normalized.
double rowEntropy = entropy(k11 + k12, k21 + k22);
double columnEntropy = entropy(k11 + k21, k12 + k22);
double matrixEntropy = entropy(k11, k12, k21, k22);
if (rowEntropy + columnEntropy < matrixEntropy) {
// round off error
return 0.0;
}
return 2.0 * (rowEntropy + columnEntropy - matrixEntropy);
}
/**
* Calculates the root log-likelihood ratio for two events.
* See {@link #logLikelihoodRatio(long, long, long, long)}.
* @param k11 The number of times the two events occurred together
* @param k12 The number of times the second event occurred WITHOUT the first event
* @param k21 The number of times the first event occurred WITHOUT the second event
* @param k22 The number of times something else occurred (i.e. was neither of these events
* @return The root log-likelihood ratio
*
*
* There is some more discussion here: http://s.apache.org/CGL
*
* And see the response to Wataru's comment here:
* http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
*/
public static double rootLogLikelihoodRatio(long k11, long k12, long k21, long k22) {
double llr = logLikelihoodRatio(k11, k12, k21, k22);
double sqrt = Math.sqrt(llr);
if ((double) k11 / (k11 + k12) < (double) k21 / (k21 + k22)) {
sqrt = -sqrt;
}
return sqrt;
}
/**
* Compares two sets of counts to see which items are interestingly over-represented in the first
* set.
* @param a The first counts.
* @param b The reference counts.
* @param maxReturn The maximum number of items to return. Use maxReturn >= a.elementSet.size() to return all
* scores above the threshold.
* @param threshold The minimum score for items to be returned. Use 0 to return all items more common
* in a than b. Use -Double.MAX_VALUE (not Double.MIN_VALUE !) to not use a threshold.
* @return A list of scored items with their scores.
*/
public static List> compareFrequencies(Multiset a,
Multiset b,
int maxReturn,
double threshold) {
int totalA = a.size();
int totalB = b.size();
Ordering> byScoreAscending = new Ordering>() {
@Override
public int compare(ScoredItem tScoredItem, ScoredItem tScoredItem1) {
return Double.compare(tScoredItem.score, tScoredItem1.score);
}
};
Queue> best = new PriorityQueue<>(maxReturn + 1, byScoreAscending);
for (T t : a.elementSet()) {
compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
}
// if threshold >= 0 we only iterate through a because anything not there can't be as or more common than in b.
if (threshold < 0) {
for (T t : b.elementSet()) {
// only items missing from a need be scored
if (a.count(t) == 0) {
compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
}
}
}
List> r = new ArrayList<>(best);
Collections.sort(r, byScoreAscending.reverse());
return r;
}
private static void compareAndAdd(Multiset a,
Multiset b,
int maxReturn,
double threshold,
int totalA,
int totalB,
Queue> best,
T t) {
int kA = a.count(t);
int kB = b.count(t);
double score = rootLogLikelihoodRatio(kA, totalA - kA, kB, totalB - kB);
if (score >= threshold) {
ScoredItem x = new ScoredItem<>(t, score);
best.add(x);
while (best.size() > maxReturn) {
best.poll();
}
}
}
public static final class ScoredItem {
private final T item;
private final double score;
public ScoredItem(T item, double score) {
this.item = item;
this.score = score;
}
public double getScore() {
return score;
}
public T getItem() {
return item;
}
}
}