gov.sandia.cognition.statistics.ChiSquaredSimilarity Maven / Gradle / Ivy
/*
* File: ChiSquaredSimilarity.java
* Authors: Tom Brounstein
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright July 1, 2014, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government.
* Export of this program may require a license from the United States
* Government. See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.statistics;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.statistics.distribution.ChiSquareDistribution;
/**
 * Computes the chi-squared similarity between two vectors. A chi-squared test
 * operates on frequency vectors (typically term counts of documents), so only
 * the magnitudes of the elements are used: every element is passed through
 * {@code Math.abs} and the vector totals are taken from {@code norm1()}.
 * <p>
 * One vector is treated as a document from a known category and the other as
 * the document being tested for (likely) membership in the same distribution.
 * The statistic is symmetric in the two vectors, so which one is called the
 * categorized vector and which the testing vector is somewhat arbitrary.
 * <p>
 * The constructor and the setters store clones of the vectors they are given;
 * the getters return the stored instances themselves.
 *
 * @author trbroun
 * @since 3.4.2
 */
@PublicationReference(
    author="Yao-Tsung Chen, Meng Chang Chen",
    title="Using chi-square statistics to measure similarities for text categorization",
    type=PublicationType.Journal,
    year=2011,
    url="http://www.sciencedirect.com/science/article/pii/S0957417410008961#"
)
public class ChiSquaredSimilarity
{

    /** Private copy of the vector from the known category. */
    private Vector categorizedVector;

    /** Private copy of the vector being tested against the category. */
    private Vector testingVector;

    /**
     * Creates a new similarity measure over the two given vectors. Both
     * vectors are cloned, so later changes to the arguments do not affect
     * this object.
     *
     * @param categorizedVector The vector from a known category.
     * @param testingVector The vector which is being tested to see if it
     *      comes from the same category.
     */
    public ChiSquaredSimilarity(Vector categorizedVector, Vector testingVector) {
        this.categorizedVector = categorizedVector.clone();
        this.testingVector = testingVector.clone();
    }

    /**
     * Replaces the categorized vector with a clone of the given vector.
     *
     * @param newCategorizedVector The new vector from a known category.
     */
    public void setCategorizedVector(Vector newCategorizedVector) {
        categorizedVector = newCategorizedVector.clone();
    }

    /**
     * Replaces the testing vector with a clone of the given vector.
     *
     * @param newTestVector The new vector to test against the category.
     */
    public void setTestVector(Vector newTestVector) {
        testingVector = newTestVector.clone();
    }

    /**
     * Gets the categorized vector. The stored instance itself is returned,
     * not a copy.
     *
     * @return The categorized vector.
     */
    public Vector getCategorizedVector() {
        return categorizedVector;
    }

    /**
     * Gets the testing vector. The stored instance itself is returned, not a
     * copy.
     *
     * @return The test vector.
     */
    public Vector getTestVector() {
        return testingVector;
    }

    /**
     * Computes the chi-squared statistic of the two vectors. This is a raw
     * number and needs to be fed into a chi-squared distribution to compute a
     * probability. Both vectors must have the same dimensionality and must be
     * non-zero.
     * <p>
     * With {@code a_i}, {@code b_i} the element magnitudes, {@code A}, {@code B}
     * their totals and {@code h = A + B}, the value computed is
     * {@code h * sum_i( a_i^2 / (A * (a_i + b_i)) + b_i^2 / (B * (a_i + b_i)) ) - h}.
     * Indices where both elements are zero contribute nothing and are skipped.
     *
     * @return The chi-squared statistic; never negative.
     * @throws IllegalStateException If the two vectors differ in
     *      dimensionality, or if either vector has a 1-norm of zero.
     */
    public double compute() {
        final int dimensionality = categorizedVector.getDimensionality();
        if (dimensionality != testingVector.getDimensionality()) {
            throw new IllegalStateException("Dimension mismatch.");
        }

        final double sumCat = categorizedVector.norm1();
        final double sumTest = testingVector.norm1();
        if (sumCat == 0 || sumTest == 0) {
            throw new IllegalStateException("One of the vectors is the 0 vector.");
        }
        final double h = sumCat + sumTest;

        double total = 0.0;
        for (int i = 0; i < dimensionality; i++) {
            final double cat = Math.abs(categorizedVector.getElement(i));
            final double test = Math.abs(testingVector.getElement(i));
            final double pairSum = cat + test;
            if (pairSum == 0) {
                // Both entries are zero: the term would be 0/0, and an empty
                // cell contributes nothing to the statistic.
                continue;
            }
            total += (cat * cat) / (sumCat * pairSum)
                + (test * test) / (sumTest * pairSum);
        }

        // The statistic is non-negative in exact arithmetic, but for (nearly)
        // identical distributions total is ~1 and rounding can push
        // h*total - h slightly below zero. Clamp so callers never see a
        // negative chi-squared value.
        return Math.max(0.0, h * total - h);
    }

    /**
     * Computes the chi-squared similarity statistic via {@link #compute()},
     * then evaluates the cumulative distribution function of a chi-squared
     * distribution with (dimensionality - 1) degrees of freedom at that
     * value. The result is the probability that a chi-squared statistic falls
     * between 0 and the computed statistic, so a large statistic yields a
     * large cumulative probability.
     * <p>
     * NOTE(review): for one-dimensional vectors this requests zero degrees of
     * freedom; confirm how ChiSquareDistribution handles that case.
     *
     * @return The probability of a chi-squared statistic being lower than the
     *      value of the chi-squared similarity of the given vectors.
     * @throws IllegalStateException Under the same conditions as
     *      {@link #compute()}.
     */
    public double computeCumulativeProbabilityValue() {
        final double chiSquaredStat = compute();
        final ChiSquareDistribution dist =
            new ChiSquareDistribution(categorizedVector.getDimensionality() - 1);
        return dist.getCDF().evaluate(chiSquaredStat);
    }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy