de.lmu.ifi.dbs.elki.math.statistics.dependence.MCEDependenceMeasure Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elki Show documentation
Show all versions of elki Show documentation
ELKI - Main Module – Open-Source Data-Mining Framework with Index Acceleration
package de.lmu.ifi.dbs.elki.math.statistics.dependence;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.Arrays;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
 * Compute a mutual information based dependence measure using a nested means
 * discretization, originally proposed for ordering axes in parallel coordinate
 * plots.
 * <p>
 * Each attribute is recursively split at its mean into the same number of
 * one-dimensional partitions. The two decompositions are intersected into a
 * contingency matrix, from which the per-partition entropies are computed,
 * weighted by partition size, and normalized. The reported dependence is one
 * minus the larger of the two normalized values.
 * <p>
 * Reference:
 * <blockquote>
 * D. Guo<br>
 * Coordinating computational and visual approaches for interactive feature
 * selection and multivariate clustering<br>
 * Information Visualization, 2(4), 2003.
 * </blockquote>
 *
 * @author Erich Schubert
 * @since 0.7.0
 */
@Reference(authors = "D. Guo", //
title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", //
booktitle = "Information Visualization, 2(4)", //
url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
public class MCEDependenceMeasure extends AbstractDependenceMeasure {
  /**
   * Static instance.
   */
  public static final MCEDependenceMeasure STATIC = new MCEDependenceMeasure();

  /**
   * Desired size: 35 observations.
   *
   * While this could trivially be made parameterizable, it is a reasonable rule
   * of thumb and not expected to have a major effect.
   */
  public static final int TARGET = 35;

  /**
   * Measure the dependence of two attributes.
   *
   * @param adapter1 Adapter to access the first attribute
   * @param data1 First attribute
   * @param adapter2 Adapter to access the second attribute
   * @param data2 Second attribute
   * @param <A> Storage type of the first attribute
   * @param <B> Storage type of the second attribute
   * @return One minus the normalized entropy score of the discretized data
   */
  @Override
  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
    final int len = size(adapter1, data1, adapter2, data2);
    // Find a number of bins as recommended by Cheng et al.
    double p = MathUtil.log2(len / (double) TARGET);
    // As we are in 2d, take the root (*.5) But let's use at least 1, too.
    // Check: for 10000 this should give 4, for 150 it gives 1.
    int power = Math.max(1, (int) Math.floor(p * .5));
    int gridsize = 1 << power;
    double loggrid = Math.log((double) gridsize);

    // Discretize each attribute independently into gridsize partitions.
    ArrayList<int[]> parts1 = buildPartitions(adapter1, data1, len, power);
    ArrayList<int[]> parts2 = buildPartitions(adapter2, data2, len, power);

    // Contingency matrix: number of objects shared by each pair of partitions.
    int[][] res = new int[gridsize][gridsize];
    intersectionMatrix(res, parts1, parts2, gridsize);
    return 1. - getMCEntropy(res, parts1, parts2, len, gridsize, loggrid);
  }

  /**
   * Partitions an attribute.
   *
   * @param adapter1 Data adapter
   * @param data1 Data set
   * @param len Length of data
   * @param depth Splitting depth
   * @param <A> Storage type of the attribute
   * @return List of {@code 2^depth} partitions, each an ascending array of
   *         object indexes
   */
  private <A> ArrayList<int[]> buildPartitions(NumberArrayAdapter<?, A> adapter1, A data1, int len, int depth) {
    final int[] idx = new int[len];
    final double[] tmp = new double[len];
    for(int i = 0; i < len; ++i) {
      idx[i] = i;
      tmp[i] = adapter1.getDouble(data1, i);
    }
    // Sort indexes by value. This must happen before tmp itself is sorted,
    // because the comparator looks values up by their original index.
    IntegerArrayQuickSort.sort(idx, new IntegerComparator() {
      @Override
      public int compare(int x, int y) {
        return Double.compare(tmp[x], tmp[y]);
      }
    });
    Arrays.sort(tmp); // Should yield the same ordering

    ArrayList<int[]> ret = new ArrayList<>(1 << depth);
    divide(idx, tmp, ret, 0, tmp.length, depth);
    return ret;
  }

  /**
   * Recursive call to further subdivide the array.
   *
   * The interval is split at the position of its mean value; every call adds
   * exactly {@code 2^depth} arrays to the output.
   *
   * @param idx Object indexes.
   * @param data 1D data, sorted
   * @param ret Output index
   * @param start Interval start (inclusive)
   * @param end Interval end (exclusive)
   * @param depth Depth
   */
  private void divide(int[] idx, double[] data, ArrayList<int[]> ret, int start, int end, int depth) {
    if(depth == 0) {
      // Leaf: emit the object indexes of this interval in ascending order, so
      // that partitions can be intersected by a linear merge.
      int[] a = Arrays.copyOfRange(idx, start, end);
      Arrays.sort(a);
      ret.add(a);
      return;
    }
    final int count = end - start;
    if(count == 0) {
      // Corner case, that should barely happen. But for ties, we currently
      // Do not yet assure that it doesn't happen!
      // Emit all leaves of this subtree as empty partitions.
      for(int j = 1 << depth; j > 0; --j) {
        ret.add(new int[0]);
      }
      return;
    }
    // Mean of the interval is the split value.
    double m = 0.;
    for(int i = start; i < end; i++) {
      m += data[i];
    }
    m /= count;
    int pos = Arrays.binarySearch(data, start, end, m);
    if(pos >= 0) {
      // Ties: try to choose the most central element.
      // Unsigned shift avoids overflow of start + end on very large arrays.
      final int opt = (start + end) >>> 1;
      while(data[pos] == m) {
        if(pos < opt) {
          pos++;
        }
        else if(pos > opt) {
          pos--;
        }
        else {
          break;
        }
      }
    }
    else {
      // Not found: binarySearch returned -(insertion point) - 1.
      pos = (-pos - 1);
    }
    divide(idx, data, ret, start, pos, depth - 1);
    divide(idx, data, ret, pos, end, depth - 1);
  }

  /**
   * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
   *
   * @param res Output matrix to fill
   * @param partsx Partitions in first component
   * @param partsy Partitions in second component.
   * @param gridsize Size of partition decomposition
   */
  private void intersectionMatrix(int[][] res, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int gridsize) {
    for(int x = 0; x < gridsize; x++) {
      final int[] px = partsx.get(x);
      final int[] rowx = res[x];
      for(int y = 0; y < gridsize; y++) {
        rowx[y] = intersectionSize(px, partsy.get(y));
      }
    }
  }

  /**
   * Compute the intersection of two sorted integer lists.
   *
   * @param px First list, ascending
   * @param py Second list, ascending
   * @return Intersection size.
   */
  private int intersectionSize(int[] px, int[] py) {
    int i = 0, j = 0, c = 0;
    while(i < px.length && j < py.length) {
      // Each list is read with its own cursor.
      final int vx = px[i], vy = py[j];
      if(vx < vy) {
        ++i;
      }
      else if(vx > vy) {
        ++j;
      }
      else {
        ++c;
        ++i;
        ++j;
      }
    }
    return c;
  }

  /**
   * Compute the MCE entropy value.
   *
   * For every partition of one attribute, the entropy of its objects'
   * distribution over the partitions of the other attribute is computed. These
   * entropies are summed weighted by partition size, and the larger of the two
   * sums is normalized by {@code size * loggrid}.
   *
   * @param mat Partition size matrix
   * @param partsx Partitions on X
   * @param partsy Partitions on Y
   * @param size Data set size
   * @param gridsize Size of grids
   * @param loggrid Logarithm of grid sizes, for normalization
   * @return MCE score.
   */
  private double getMCEntropy(int[][] mat, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int size, int gridsize, double loggrid) {
    // Margin entropies:
    double[] mx = new double[gridsize];
    double[] my = new double[gridsize];

    for(int i = 0; i < gridsize; i++) {
      // Note: indexes are a bit tricky here, because we compute both margin
      // entropies at the same time!
      final double sizex = (double) partsx.get(i).length;
      final double sizey = (double) partsy.get(i).length;
      for(int j = 0; j < gridsize; j++) {
        // Row i of the matrix for X, column i for Y. For an empty partition
        // this is 0 / 0 = NaN, which fails the > 0 test and is skipped.
        double px = mat[i][j] / sizex;
        double py = mat[j][i] / sizey;

        if(px > 0.) {
          mx[i] -= px * Math.log(px);
        }
        if(py > 0.) {
          my[i] -= py * Math.log(py);
        }
      }
    }

    // Weighted sums of margin entropies.
    double sumx = 0., sumy = 0.;
    for(int i = 0; i < gridsize; i++) {
      sumx += mx[i] * partsx.get(i).length;
      sumy += my[i] * partsy.get(i).length;
    }

    double max = ((sumx > sumy) ? sumx : sumy);
    return max / (size * loggrid);
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    @Override
    protected MCEDependenceMeasure makeInstance() {
      return STATIC;
    }
  }
}