All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.ContingencyTables Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    ContingencyTables.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

/**
 * Class implementing some statistical routines for contingency tables.
 *
 * @author Eibe Frank ([email protected])
 * @version $Revision: 10057 $
 */
public class ContingencyTables
  implements RevisionHandler {

  /** The natural logarithm of 2 */
  public static final double log2 = Math.log(2);

  /** Cache of integer logs */
  private static final double MAX_INT_FOR_CACHE_PLUS_ONE = 10000;
  private static final double[] INT_N_LOG_N_CACHE = new double[(int)MAX_INT_FOR_CACHE_PLUS_ONE];

  /** Initialize cache */
  static {
    for (int i = 1; i < MAX_INT_FOR_CACHE_PLUS_ONE; i++) {
      double d = (double)i;
      INT_N_LOG_N_CACHE[i] = d * Math.log(d);
    }
  }


  /**
   * Returns chi-squared probability for a given matrix.
   *
   * @param matrix the contigency table
   * @param yates is Yates' correction to be used?
   * @return the chi-squared probability
   */

  public static double chiSquared(double [][] matrix, boolean yates) {

    int df = (matrix.length - 1) * (matrix[0].length - 1);

    return Statistics.chiSquaredProbability(chiVal(matrix, yates), df);
  }

  /**
   * Computes chi-squared statistic for a contingency table.
   *
   * @param matrix the contigency table
   * @param useYates is Yates' correction to be used?
   * @return the value of the chi-squared statistic
   */
  public static double chiVal(double [][] matrix, boolean useYates) {
    
    int df, nrows, ncols, row, col;
    double[] rtotal, ctotal;
    double expect = 0, chival = 0, n = 0;
    boolean yates = true;
    
    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double [nrows];
    ctotal = new double [ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
	rtotal[row] += matrix[row][col];
	ctotal[col] += matrix[row][col];
	n += matrix[row][col];
      }
    }
    df = (nrows - 1)*(ncols - 1);
    if ((df > 1) || (!useYates)) {
      yates = false;
    } else if (df <= 0) {
      return 0;
    }
    chival = 0.0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
	for (col = 0; col < ncols; col++) {
	  if (Utils.gr(ctotal[col], 0)) {
	    expect = (ctotal[col] * rtotal[row]) / n;
	    chival += chiCell (matrix[row][col], expect, yates);
	  }
	}
      }
    }
    return chival;
  }

  /**
   * Tests if Cochran's criterion is fullfilled for the given
   * contingency table. Rows and columns with all zeros are not considered
   * relevant.
   *
   * @param matrix the contigency table to be tested
   * @return true if contingency table is ok, false if not
   */
  public static boolean cochransCriterion(double[][] matrix) {

    double[] rtotal, ctotal;
    double n = 0, expect, smallfreq = 5;
    int smallcount = 0, nonZeroRows = 0, nonZeroColumns = 0, nrows, ncols, 
      row, col;

    nrows = matrix.length;
    ncols = matrix[0].length;

    rtotal = new double [nrows];
    ctotal = new double [ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
	rtotal[row] += matrix[row][col];
	ctotal[col] += matrix[row][col];
	n += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
	nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
	nonZeroColumns++;
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
	for (col = 0; col < ncols; col++) {
	  if (Utils.gr(ctotal[col], 0)) {
	    expect = (ctotal[col] * rtotal[row]) / n;
	    if (Utils.sm(expect, smallfreq)) {
	      if (Utils.sm(expect, 1)) {
		return false;
	      } else {
		smallcount++;
		if (smallcount > (nonZeroRows * nonZeroColumns) / smallfreq) {
		  return false;
		}
	      }
	    }
	  }
	}
      }
    }
    return true;
  }

  /**
   * Computes Cramer's V for a contingency table.
   *
   * @param matrix the contingency table
   * @return Cramer's V
   */
  public static double CramersV(double [][] matrix) {

    int row, col, nrows,ncols, min;
    double n = 0;
    
    nrows = matrix.length;
    ncols = matrix[0].length;
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
	n += matrix[row][col];
      }
    }
    min = nrows < ncols ? nrows-1 : ncols-1;
    if ((min == 0) || Utils.eq(n, 0))
      return 0;
    return Math.sqrt(chiVal(matrix, false) / (n * (double)min)); 
  } 

  /**
   * Computes the entropy of the given array.
   *
   * @param array the array
   * @return the entropy
   */
  public static double entropy(double[] array) {

    double returnValue = 0, sum = 0;

    for (int i = 0; i < array.length; i++) {
      returnValue -= lnFunc(array[i]);
      sum += array[i];
    }
    if (Utils.eq(sum, 0)) {
      return 0;
    } else {
      return (returnValue + lnFunc(sum)) / (sum * log2);
    }
  }

  /**
   * Computes conditional entropy of the rows given
   * the columns.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the rows given the columns
   */
  public static double entropyConditionedOnColumns(double[][] matrix) {
    
    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
	returnValue = returnValue + lnFunc(matrix[i][j]);
	sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }

  /**
   * Computes conditional entropy of the columns given
   * the rows.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the columns given the rows
   */
  public static double entropyConditionedOnRows(double[][] matrix) {
    
    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
	returnValue = returnValue + lnFunc(matrix[i][j]);
	sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }

  /**
   * Computes conditional entropy of the columns given the rows
   * of the test matrix with respect to the train matrix. Uses a
   * Laplace prior. Does NOT normalize the entropy.
   *
   * @param train the train matrix 
   * @param test the test matrix
   * @param numClasses the number of symbols for Laplace
   * @return the entropy
   */
  public static double entropyConditionedOnRows(double[][] train, 
						double[][] test,
						double numClasses) {
    
    double returnValue = 0, trainSumForRow, testSumForRow, testSum = 0;

    for (int i = 0; i < test.length; i++) {
      trainSumForRow = 0;
      testSumForRow = 0;
      for (int j = 0; j < test[0].length; j++) {
	returnValue -= test[i][j] * Math.log(train[i][j] + 1);
	trainSumForRow += train[i][j];
	testSumForRow += test[i][j];
      }
      testSum = testSumForRow;
      returnValue += testSumForRow * Math.log(trainSumForRow + 
					     numClasses);
    }
    return returnValue / (testSum * log2);
  }

  /**
   * Computes the rows' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the rows' entropy
   */
  public static double entropyOverRows(double[][] matrix) {
    
    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
	sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes the columns' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the columns' entropy
   */
  public static double entropyOverColumns(double[][] matrix){
    
    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++){
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
	sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes gain ratio for contingency table (split on rows).
   * Returns Double.MAX_VALUE if the split entropy is 0.
   *
   * @param matrix the contingency table
   * @return the gain ratio
   */
  public static double gainRatio(double[][] matrix){
    
    double preSplit = 0, postSplit = 0, splitEnt = 0,
      sumForRow, sumForColumn, total = 0, infoGain;

    // Compute entropy before split
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++) 
	sumForColumn += matrix[j][i];
      preSplit += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    preSplit -= lnFunc(total);

    // Compute entropy after split and split entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
	postSplit += lnFunc(matrix[i][j]);
	sumForRow += matrix[i][j];
      }
      splitEnt += lnFunc(sumForRow);
    }
    postSplit -= splitEnt;
    splitEnt -= lnFunc(total);

    infoGain = preSplit - postSplit;
    if (Utils.eq(splitEnt, 0))
      return 0;
    return infoGain / splitEnt;
  }

  /**
   * Returns negative base 2 logarithm of multiple hypergeometric
   * probability for a contingency table.
   *
   * @param matrix the contingency table
   * @return the log of the hypergeometric probability of the contingency table 
   */
  public static double log2MultipleHypergeometric(double[][] matrix) {

    double sum = 0, sumForRow, sumForColumn, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[i].length; j++) {
	sumForRow += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForRow);
      total += sumForRow;
    }
    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
	sumForColumn += matrix [i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForColumn);
    }
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
	sum -= SpecialFunctions.lnFactorial(matrix[i][j]);
      }
    }
    sum -= SpecialFunctions.lnFactorial(total);
    return -sum / log2;
  }

  /**
   * Reduces a matrix by deleting all zero rows and columns.
   *
   * @param matrix the matrix to be reduced
   * @return the matrix with all zero rows and columns deleted
   */
  public static double[][] reduceMatrix(double[][] matrix) {

    int row, col, currCol, currRow, nrows, ncols, 
      nonZeroRows = 0, nonZeroColumns = 0;
    double[] rtotal, ctotal;
    double[][] newMatrix;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double [nrows];
    ctotal = new double [ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
	rtotal[row] += matrix[row][col];
	ctotal[col] += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row],0)) {
	nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col],0)) {
	nonZeroColumns++;
      }
    }
    newMatrix = new double[nonZeroRows][nonZeroColumns];
    currRow = 0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row],0)) {
	currCol = 0;
	for (col = 0; col < ncols; col++) {
	  if (Utils.gr(ctotal[col],0)) {
	    newMatrix[currRow][currCol] = matrix[row][col];
	    currCol++;
	  }
	}
	currRow++;
      }
    }
    return newMatrix;
  }
    
   /**
    * Calculates the symmetrical uncertainty for base 2.
    *
    * @param matrix the contingency table
    * @return the calculated symmetrical uncertainty
    *
    */
  public static double symmetricalUncertainty(double matrix[][]) {

    double sumForColumn, sumForRow, total = 0, columnEntropy = 0, 
      rowEntropy = 0, entropyConditionedOnRows = 0, infoGain = 0;

    // Compute entropy for columns
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++) {
	sumForColumn += matrix[j][i];
      }
      columnEntropy += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    columnEntropy -= lnFunc(total);

    // Compute entropy for rows and conditional entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) { 
	sumForRow += matrix[i][j];
	entropyConditionedOnRows += lnFunc(matrix[i][j]);
      }
      rowEntropy += lnFunc(sumForRow);
    }
    entropyConditionedOnRows -= rowEntropy;
    rowEntropy -= lnFunc(total);
    infoGain = columnEntropy - entropyConditionedOnRows;
    if (Utils.eq(columnEntropy, 0) || Utils.eq(rowEntropy, 0))
      return 0;
    return 2.0 * (infoGain / (columnEntropy + rowEntropy));
  }

  /**
   * Computes Goodman and Kruskal's tau-value for a contingency table.
   *
   * @param matrix the contingency table
   * @return Goodman and Kruskal's tau-value
   */
  public static double tauVal(double[][] matrix) {
    
    int nrows, ncols, row, col;
    double [] ctotal;
    double maxcol = 0, max, maxtotal = 0, n = 0;
    
    nrows = matrix.length;
    ncols = matrix[0].length;
    ctotal = new double [ncols];
    for (row = 0; row < nrows; row++) {
      max = 0;
      for (col = 0; col < ncols; col++) {
	if (Utils.gr(matrix[row][col], max)) 
	  max = matrix[row][col];
	ctotal[col] += matrix[row][col];
	n += matrix[row][col];
      }
      maxtotal += max;
    }
    if (Utils.eq(n, 0)) {
      return 0;
    }
    maxcol = ctotal[Utils.maxIndex(ctotal)];
    return (maxtotal - maxcol)/(n - maxcol);
  }

  /**
   * Help method for computing entropy.
   */
  public static double lnFunc(double num){
    
    if (num <= 0) {
      return 0;
    } else {

      // Use cache if we have a sufficiently small integer
      if (num < MAX_INT_FOR_CACHE_PLUS_ONE) {
        int n = (int)num;
        if ((double)n == num) {
          return INT_N_LOG_N_CACHE[n];
        }
      }
      return num * Math.log(num);
    }
  }

  /**
   * Computes chi-value for one cell in a contingency table.
   *
   * @param freq the observed frequency in the cell
   * @param expected the expected frequency in the cell
   * @return the chi-value for that cell; 0 if the expected value is
   * too close to zero 
   */
  private static double chiCell(double freq, double expected, 
                                boolean yates){

    // Cell in empty row and column?
    if (Utils.smOrEq(expected, 0)) {
      return 0;
    }

    // Compute difference between observed and expected value
    double diff = Math.abs(freq - expected);
    if (yates) {

      // Apply Yates' correction if wanted
      diff -= 0.5;

      // The difference should never be negative
      if (diff < 0) {
        diff = 0;
      }
    }

    // Return chi-value for the cell
    return (diff * diff / expected);
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 10057 $");
  }

  /**
   * Main method for testing this class.
   */
  public static void main(String[] ops) {

    double[] firstRow = {10, 5, 20};
    double[] secondRow = {2, 10, 6};
    double[] thirdRow = {5, 10, 10};
    double[][] matrix = new double[3][0];

    matrix[0] = firstRow; matrix[1] = secondRow; matrix[2] = thirdRow;
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
	System.out.print(matrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Chi-squared probability: " +
		       ContingencyTables.chiSquared(matrix, false));
    System.out.println("Chi-squared value: " +
		       ContingencyTables.chiVal(matrix, false));
    System.out.println("Cochran's criterion fullfilled: " +
		       ContingencyTables.cochransCriterion(matrix));
    System.out.println("Cramer's V: " +
		       ContingencyTables.CramersV(matrix));
    System.out.println("Entropy of first row: " +
		       ContingencyTables.entropy(firstRow));
    System.out.println("Entropy conditioned on columns: " +
		       ContingencyTables.entropyConditionedOnColumns(matrix));
    System.out.println("Entropy conditioned on rows: " +
		       ContingencyTables.entropyConditionedOnRows(matrix));
    System.out.println("Entropy conditioned on rows (with Laplace): " +
		       ContingencyTables.entropyConditionedOnRows(matrix, matrix, 3));
    System.out.println("Entropy of rows: " +
		       ContingencyTables.entropyOverRows(matrix));
    System.out.println("Entropy of columns: " +
		       ContingencyTables.entropyOverColumns(matrix));
    System.out.println("Gain ratio: " +
		       ContingencyTables.gainRatio(matrix));
    System.out.println("Negative log2 of multiple hypergeometric probability: " +
		       ContingencyTables.log2MultipleHypergeometric(matrix));
    System.out.println("Symmetrical uncertainty: " +
		       ContingencyTables.symmetricalUncertainty(matrix));
    System.out.println("Tau value: " +
		       ContingencyTables.tauVal(matrix));
    double[][] newMatrix = new double[3][3];
    newMatrix[0][0] = 1; newMatrix[0][1] = 0; newMatrix[0][2] = 1;
    newMatrix[1][0] = 0; newMatrix[1][1] = 0; newMatrix[1][2] = 0;
    newMatrix[2][0] = 1; newMatrix[2][1] = 0; newMatrix[2][2] = 1;
    System.out.println("Matrix with empty row and column: ");
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
	System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Reduced matrix: ");
    newMatrix = ContingencyTables.reduceMatrix(newMatrix);
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
	System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
  }
}












© 2015 - 2024 Weber Informatics LLC | Privacy Policy