// org.apache.mahout.classifier.evaluation.Auc Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier.evaluation;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.list.DoubleArrayList;
import com.google.common.base.Preconditions;
import java.util.Random;
/**
 * Computes AUC and a few other accuracy statistics without storing huge amounts of data. This is
 * done by keeping uniform samples of the positive and negative scores. Then, when AUC is to be
 * computed, the retained scores are sorted and a rank-sum statistic is used to compute the AUC.
 * Since AUC is invariant with respect to down-sampling of either positives or negatives, this is
 * close to correct and is exactly correct if maxBufferSize or fewer positive and negative scores
 * are examined.
 */
public class Auc {

  /** Maximum number of scores retained per class before sampling kicks in. */
  private int maxBufferSize = 10000;

  /** Retained scores, indexed by true class: [0] negatives, [1] positives. */
  private final DoubleArrayList[] scores = {new DoubleArrayList(), new DoubleArrayList()};

  /** Number of scored examples seen so far for each true class; drives the per-class sampling. */
  private final int[] seen = new int[2];

  private final Random rand;

  /** Total number of scored examples seen so far, over both classes. */
  private int samples;

  private final double threshold;

  /** Counts indexed as (trueValue, predictedClass). */
  private final Matrix confusion;

  private final Matrix entropy;

  private boolean probabilityScore = true;

  /** True if the most recent add() supplied a score, false if it supplied a hard class. */
  private boolean hasScore;

  /**
   * Allocates a new data-structure for accumulating information about AUC and a few other accuracy
   * measures.
   *
   * @param threshold The threshold to use in computing the confusion matrix. Scores strictly
   *                  greater than the threshold are counted as predictions of class 1.
   */
  public Auc(double threshold) {
    confusion = new DenseMatrix(2, 2);
    entropy = new DenseMatrix(2, 2);
    this.rand = RandomUtils.getRandom();
    this.threshold = threshold;
  }

  /**
   * Allocates a new accumulator that uses a threshold of 0.5 for the confusion matrix.
   */
  public Auc() {
    this(0.5);
  }

  /**
   * Adds a score to the AUC buffers.
   *
   * @param trueValue Whether this score is for a true-positive or a true-negative example.
   * @param score     The score for this example.
   * @throws IllegalArgumentException if trueValue is neither 0 nor 1.
   */
  public void add(int trueValue, double score) {
    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
    hasScore = true;

    int predictedClass = score > threshold ? 1 : 0;
    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);

    samples++;
    if (isProbabilityScore()) {
      // clamp away from 0 and 1 so that neither logarithm below is infinite
      double limited = Math.max(1.0e-20, Math.min(score, 1 - 1.0e-20));
      // incremental update of the row for this true value; the step size is 1 / samples,
      // where samples counts every scored example regardless of class
      double v0 = entropy.get(trueValue, 0);
      entropy.set(trueValue, 0, (Math.log1p(-limited) - v0) / samples + v0);

      double v1 = entropy.get(trueValue, 1);
      entropy.set(trueValue, 1, (Math.log(limited) - v1) / samples + v1);
    }

    // add to buffers
    seen[trueValue]++;
    DoubleArrayList buf = scores[trueValue];
    if (buf.size() >= maxBufferSize) {
      // The buffer is full, so the new score replaces a random retained score with probability
      // buf.size() / seen[trueValue] and is dropped otherwise. The draw has to range over the
      // number of examples seen for THIS class: each buffer is a sample of its own class only,
      // and drawing over the combined count of both classes would under-represent later scores.
      // The buffer is never shuffled, so its order is not random, but its contents are a uniform
      // sample without replacement. Order is irrelevant because the data is sorted later.
      int index = rand.nextInt(seen[trueValue]);
      if (index < buf.size()) {
        buf.set(index, score);
      }
    } else {
      // for small buffers, we collect all points without permuting
      buf.add(score);
    }
  }

  /**
   * Records a hard classification decision for which no score is available. Only the confusion
   * matrix is updated; afterwards {@link #auc()} refuses to run until a scored example is added.
   *
   * @param trueValue      The actual class of the example, 0 or 1.
   * @param predictedClass The class chosen by the classifier, used as a column index into the
   *                       2x2 confusion matrix.
   * @throws IllegalArgumentException if trueValue is neither 0 nor 1.
   */
  public void add(int trueValue, int predictedClass) {
    // validate and update the matrix first so that a rejected call leaves the state untouched
    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
    hasScore = false;
  }

  /**
   * Computes the AUC of points seen so far. This can be moderately expensive since it requires
   * that all points that have been retained be sorted.
   *
   * @return The value of the Area Under the receiver operating Curve, or 0.5 if no positive or
   *         no negative scores have been retained.
   * @throws IllegalArgumentException if the most recent example was added without a score.
   */
  public double auc() {
    Preconditions.checkArgument(hasScore, "Can't compute AUC for classifier without a score");

    DoubleArrayList negatives = scores[0];
    DoubleArrayList positives = scores[1];
    negatives.sort();
    positives.sort();

    int n0 = negatives.size();
    int n1 = positives.size();
    if (n0 == 0 || n1 == 0) {
      return 0.5;
    }

    // merge-scan both sorted lists, accumulating the ranks of the positive scores
    int i0 = 0;
    int i1 = 0;
    int rank = 1;
    double rankSum = 0;
    while (i0 < n0 && i1 < n1) {
      double v0 = negatives.get(i0);
      double v1 = positives.get(i1);

      if (v0 < v1) {
        i0++;
        rank++;
      } else if (v1 < v0) {
        i1++;
        rankSum += rank;
        rank++;
      } else {
        // ties have to be handled delicately. This branch is also reached when either value is
        // NaN; sameScore() treats NaN as tied with NaN so that at least the negative at i0 is
        // always consumed and the scan is guaranteed to make progress.
        double tieScore = v0;

        // how many negatives are tied?
        int k0 = 0;
        while (i0 < n0 && sameScore(negatives.get(i0), tieScore)) {
          k0++;
          i0++;
        }

        // and how many positives
        int k1 = 0;
        while (i1 < n1 && sameScore(positives.get(i1), tieScore)) {
          k1++;
          i1++;
        }

        // we found k0 + k1 tied values which have
        // ranks in the half open interval [rank, rank + k0 + k1)
        // the average rank is assigned to all
        rankSum += (rank + (k0 + k1 - 1) / 2.0) * k1;
        rank += k0 + k1;
      }
    }

    if (i1 < n1) {
      // the leftover positives occupy the consecutive ranks rank .. rank + remaining - 1
      int remaining = n1 - i1;
      rankSum += (rank + (remaining - 1) / 2.0) * remaining;
    }

    // rank-sum statistic normalized by the number of positive/negative pairs
    return (rankSum / n1 - (n1 + 1) / 2.0) / n0;
  }

  /**
   * Tells whether two scores count as tied. Identical to {@code a == b} except that two NaN
   * values are considered tied with each other.
   */
  private static boolean sameScore(double a, double b) {
    return a == b || (Double.isNaN(a) && Double.isNaN(b));
  }

  /**
   * Returns the confusion matrix for the classifier supposing that we were to use a particular
   * threshold. Rows are indexed by true value and columns by predicted class. The returned
   * matrix is the internal instance, not a copy.
   *
   * @return The confusion matrix.
   */
  public Matrix confusion() {
    return confusion;
  }

  /**
   * Returns a matrix related to the confusion matrix and to the log-likelihood. For a
   * pretty accurate classifier, N + entropy is nearly the same as the confusion matrix
   * because log(1-eps) \approx -eps if eps is small.
   *
   * For lower accuracy classifiers, this measure will give us a better picture of how
   * things work out.
   *
   * Also, by definition, log-likelihood = sum(diag(entropy))
   *
   * If the most recent example was added without a score, the cells are recomputed from the
   * confusion matrix on every call. The returned matrix is the internal instance, not a copy.
   *
   * @return Returns a cell by cell break-down of the log-likelihood
   */
  public Matrix entropy() {
    if (!hasScore) {
      // find a constant score that would optimize log-likelihood, but use a dash of Bayesian
      // conservatism to avoid dividing by zero or taking log(0)
      double p = (0.5 + confusion.get(1, 1)) / (1 + confusion.get(0, 0) + confusion.get(1, 1));
      entropy.set(0, 0, confusion.get(0, 0) * Math.log1p(-p));
      entropy.set(0, 1, confusion.get(0, 1) * Math.log(p));
      entropy.set(1, 0, confusion.get(1, 0) * Math.log1p(-p));
      entropy.set(1, 1, confusion.get(1, 1) * Math.log(p));
    }
    return entropy;
  }

  /**
   * Sets the maximum number of scores retained per class. Scores already retained are not
   * trimmed; the new limit is consulted by subsequent calls to {@link #add(int, double)}.
   *
   * @param maxBufferSize The new per-class buffer limit.
   */
  public void setMaxBufferSize(int maxBufferSize) {
    this.maxBufferSize = maxBufferSize;
  }

  /**
   * @return true if scores are interpreted as probabilities, in which case the entropy matrix
   *         is updated whenever a score is added.
   */
  public boolean isProbabilityScore() {
    return probabilityScore;
  }

  /**
   * Controls whether scores are interpreted as probabilities.
   *
   * @param probabilityScore false to skip the entropy update when scores are added.
   */
  public void setProbabilityScore(boolean probabilityScore) {
    this.probabilityScore = probabilityScore;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy