/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.evaluation;

import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.list.DoubleArrayList;

import com.google.common.base.Preconditions;

import java.util.Random;

/**
 * Computes AUC and a few other accuracy statistics without storing huge amounts of data.  This is
 * done by keeping uniform samples of the positive and negative scores.  Then, when AUC is to be
 * computed, the retained scores are sorted and a rank-sum statistic is used to compute the AUC.
 * Since AUC is invariant with respect to down-sampling of either positives or negatives, this is
 * close to correct and is exactly correct if maxBufferSize or fewer positive and negative scores
 * are examined.
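 *
 * <p>A minimal usage sketch (the labels and scores below are made up for
 * illustration only):
 * <pre>{@code
 * Auc auc = new Auc(0.5);
 * auc.add(1, 0.9);   // a positive example scored 0.9
 * auc.add(0, 0.2);   // a negative example scored 0.2
 * auc.add(1, 0.6);
 * auc.add(0, 0.4);
 * double area = auc.auc();          // rank-sum estimate of the AUC
 * Matrix counts = auc.confusion();  // 2x2 confusion matrix at threshold 0.5
 * }</pre>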
 */
public class Auc {

  private int maxBufferSize = 10000;
  // scores[0] holds a sample of the true negatives' scores, scores[1] of the true positives'
  private final DoubleArrayList[] scores = {new DoubleArrayList(), new DoubleArrayList()};
  private final Random rand;
  private int samples;
  private final double threshold;
  private final Matrix confusion;
  private final DenseMatrix entropy;

  private boolean probabilityScore = true;

  private boolean hasScore;

  /**
   * Allocates a new data-structure for accumulating information about AUC and a few other accuracy
   * measures.
   * @param threshold The threshold to use in computing the confusion matrix.
   */
  public Auc(double threshold) {
    confusion = new DenseMatrix(2, 2);
    entropy = new DenseMatrix(2, 2);
    this.rand = RandomUtils.getRandom();
    this.threshold = threshold;
  }

  public Auc() {
    this(0.5);
  }

  /**
   * Adds a score to the AUC buffers.
   *
   * @param trueValue The true label of the example: 1 for a positive example, 0 for a negative.
   * @param score     The score for this example.
   */
  public void add(int trueValue, double score) {
    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
    hasScore = true;

    int predictedClass = score > threshold ? 1 : 0;
    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);

    samples++;
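    // when scores are probabilities, incrementally average log(p) and log(1-p)
    // into the entropy row for this true label (note: the update divides by the
    // overall sample count, not the per-class count)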
    if (isProbabilityScore()) {
      double limited = Math.max(1.0e-20, Math.min(score, 1 - 1.0e-20));
      double v0 = entropy.get(trueValue, 0);
      entropy.set(trueValue, 0, (Math.log1p(-limited) - v0) / samples + v0);

      double v1 = entropy.get(trueValue, 1);
      entropy.set(trueValue, 1, (Math.log(limited) - v1) / samples + v1);
    }

    // add to buffers
    DoubleArrayList buf = scores[trueValue];
    if (buf.size() >= maxBufferSize) {
      // once the buffer is full, we overwrite a random slot instead.  The
      // random index is drawn from [0, samples), so it may fall outside the
      // buffer, in which case the new score is simply discarded.  This is
      // reservoir sampling (a special case of Knuth's shuffle): the order of
      // the first maxBufferSize samples is never permuted, but the contents
      // of the buffer remain a uniform sample of maxBufferSize scores drawn
      // without replacement from all scores seen so far
      int index = rand.nextInt(samples);
      if (index < buf.size()) {
        buf.set(index, score);
      }
    } else {
      // while the buffer has room, we just collect every point; since the
      // data is sorted later anyway, permuting now would be pointless
      buf.add(score);
    }
  }

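  /**
   * Adds a hard classification without a score.  Only the confusion matrix is
   * updated; AUC and entropy require scores and become unavailable.
   *
   * @param trueValue      The true label, 0 or 1.
   * @param predictedClass The predicted label, 0 or 1.
   */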
  public void add(int trueValue, int predictedClass) {
    hasScore = false;
    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
    Preconditions.checkArgument(predictedClass == 0 || predictedClass == 1,
        "Predicted class must be 0 or 1");
    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
  }

  /**
   * Computes the AUC of points seen so far.  This can be moderately expensive since it requires
   * that all points that have been retained be sorted.
   *
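   * <p>The estimate is the Mann-Whitney rank-sum statistic: with {@code n0} retained
   * negatives, {@code n1} retained positives, and {@code R1} the sum of the ranks of
   * the positive scores in the pooled sorted list,
   * <pre>
   *   AUC = (R1 - n1 * (n1 + 1) / 2) / (n0 * n1)
   * </pre>
   *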
   * @return The area under the receiver operating characteristic (ROC) curve.
   */
  public double auc() {
    Preconditions.checkArgument(hasScore, "Can't compute AUC for classifier without a score");
    scores[0].sort();
    scores[1].sort();

    double n0 = scores[0].size();
    double n1 = scores[1].size();

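    // AUC is undefined without both classes; return 0.5 (chance) in that case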
    if (n0 == 0 || n1 == 0) {
      return 0.5;
    }

    // scan the data
    int i0 = 0;
    int i1 = 0;
    int rank = 1;
    double rankSum = 0;
    while (i0 < n0 && i1 < n1) {

      double v0 = scores[0].get(i0);
      double v1 = scores[1].get(i1);

      if (v0 < v1) {
        i0++;
        rank++;
      } else if (v1 < v0) {
        i1++;
        rankSum += rank;
        rank++;
      } else {
        // ties have to be handled delicately
        double tieScore = v0;

        // how many negatives are tied?
        int k0 = 0;
        while (i0 < n0 && scores[0].get(i0) == tieScore) {
          k0++;
          i0++;
        }

        // and how many positives
        int k1 = 0;
        while (i1 < n1 && scores[1].get(i1) == tieScore) {
          k1++;
          i1++;
        }

        // we found k0 + k1 tied values which have
        // ranks in the half open interval [rank, rank + k0 + k1)
        // the average rank is assigned to all
        rankSum += (rank + (k0 + k1 - 1) / 2.0) * k1;
        rank += k0 + k1;
      }
    }

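    // any positives left after the merge outrank everything remaining; their
    // ranks are rank, rank+1, ..., so add count * average-rank in closed form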
    if (i1 < n1) {
      rankSum += (rank + (n1 - i1 - 1) / 2.0) * (n1 - i1);
      rank += (int) (n1 - i1);
    }

    return (rankSum / n1 - (n1 + 1) / 2) / n0;
  }

  /**
   * Returns the confusion matrix for the classifier supposing that we were to use a particular
   * threshold.
   * @return The confusion matrix.
   */
  public Matrix confusion() {
    return confusion;
  }

  /**
   * Returns a matrix related to the confusion matrix and to the log-likelihood.  For a
   * reasonably accurate classifier, N + entropy is nearly the same as the confusion matrix
   * because log(1 - eps) is approximately -eps when eps is small.
   *
   * For lower accuracy classifiers, this measure gives a better picture of how
   * things work out.
   *
   * Also, by definition, log-likelihood = sum(diag(entropy))
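   *
   * <p>Concretely, column 1 of row {@code y} accumulates an incremental average of
   * log(p), and column 0 of log(1 - p), over the calls whose true label is {@code y}.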
   * @return Returns a cell by cell break-down of the log-likelihood
   */
  public Matrix entropy() {
    if (!hasScore) {
      // find a constant score that would optimize log-likelihood, but use a dash of Bayesian
      // conservatism to avoid dividing by zero or taking log(0)
      double p = (0.5 + confusion.get(1, 1)) / (1 + confusion.get(0, 0) + confusion.get(1, 1));
      entropy.set(0, 0, confusion.get(0, 0) * Math.log1p(-p));
      entropy.set(0, 1, confusion.get(0, 1) * Math.log(p));
      entropy.set(1, 0, confusion.get(1, 0) * Math.log1p(-p));
      entropy.set(1, 1, confusion.get(1, 1) * Math.log(p));
    }
    return entropy;
  }

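  /**
   * Sets the maximum number of positive and of negative scores retained for the
   * rank-sum computation; beyond this, reservoir sampling keeps a uniform sample.
   */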
  public void setMaxBufferSize(int maxBufferSize) {
    this.maxBufferSize = maxBufferSize;
  }

  public boolean isProbabilityScore() {
    return probabilityScore;
  }

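  /**
   * Controls whether scores are treated as probabilities in [0,1].  When true
   * (the default), {@link #add(int, double)} also accumulates the entropy
   * matrix; set this to false when feeding raw, unbounded scores.
   */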
  public void setProbabilityScore(boolean probabilityScore) {
    this.probabilityScore = probabilityScore;
  }
}