// com.github.steveash.jg2p.util.JenksBreaks Maven / Gradle / Ivy
// The newest version!
/*
* Copyright 2015 Steve Ash
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.steveash.jg2p.util;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Jenks natural breaks classification for one-dimensional data.
 *
 * <p>{@link #computeBreaks(Iterable, int)} partitions the sorted input into contiguous classes
 * that minimise the total within-class sum of squared deviations, and
 * {@link #goodnessOfFit(Iterable, List)} scores a set of breaks against the data.
 */
public class JenksBreaks {

    /**
     * Computes the class breaks for the given values.
     *
     * @param inData     values to classify (any order, no null elements); a sorted copy is made,
     *                   the input is not modified
     * @param numClasses number of classes to produce; must be at least 1 and no larger than the
     *                   number of values
     * @return a mutable list of {@code numClasses + 1} values: element {@code 0} is the minimum,
     *         element {@code numClasses} is the maximum, and every interior element {@code i} is
     *         the smallest data value assigned to class {@code i} (0-based), i.e. that class's
     *         lower bound
     * @throws IllegalArgumentException if {@code numClasses < 1} or there are fewer values than
     *                                  classes
     */
    public static List<Double> computeBreaks(Iterable<Double> inData, int numClasses) {
        if (numClasses < 1) {
            throw new IllegalArgumentException("numClasses must be at least 1 but was " + numClasses);
        }
        List<Double> data = sortedCopy(inData);
        if (data.size() < numClasses) {
            throw new IllegalArgumentException(
                    "Data is too small to split into " + numClasses + " classes");
        }
        int[][] lowerClassLimits = jenksMatrix(data, numClasses);
        return jenksBreaks(data, lowerClassLimits, numClasses);
    }

    /**
     * Scores how well {@code breaks} partition the data: {@code (SSE_all - SSE_within) / SSE_all},
     * where {@code SSE_within} is the sum over classes of each class's squared deviations from
     * its own mean. Only the interior breaks (indexes {@code 1 .. size - 2}) are used; each one
     * is treated as the inclusive lower bound of the next class.
     *
     * <p>The result is {@code NaN} when the overall sum of squared deviations is zero (for
     * example when all values are equal), because the expression is then {@code 0 / 0}.
     *
     * @param inData values that were classified (any order, no null elements)
     * @param breaks breaks in ascending order, in the layout returned by
     *               {@link #computeBreaks(Iterable, int)}
     * @return the goodness-of-fit ratio described above
     * @throws IllegalArgumentException if an interior break is greater than every data value
     *                                  that has not yet been assigned to a class
     */
    public static double goodnessOfFit(Iterable<Double> inData, List<Double> breaks) {
        List<Double> data = sortedCopy(inData);
        double sumOfSse = 0;
        int start = 0;
        for (int i = 1; i < breaks.size() - 1; i++) {
            int nextStart = findFirstGte(data, breaks.get(i), start);
            if (nextStart < 0) {
                // Checked explicitly: an assert would be skipped when assertions are disabled.
                throw new IllegalArgumentException("Break " + breaks.get(i) + " at index " + i
                        + " is greater than every remaining data value");
            }
            sumOfSse += sse(data.subList(start, nextStart));
            start = nextStart;
        }
        // last segment
        sumOfSse += sse(data.subList(start, data.size()));
        double overallSse = sse(data);
        return (overallSse - sumOfSse) / overallSse;
    }

    /** Returns a new, naturally ordered list holding the elements of {@code values}. */
    private static List<Double> sortedCopy(Iterable<Double> values) {
        List<Double> copy = new ArrayList<Double>();
        for (Double value : values) {
            copy.add(value);
        }
        Collections.sort(copy);
        return copy;
    }

    /**
     * Returns the index of the first element at or after {@code startFrom} whose value is
     * greater than or equal to {@code gteValue}, or {@code -1} if there is none.
     */
    private static int findFirstGte(List<Double> data, double gteValue, int startFrom) {
        for (int i = startFrom; i < data.size(); i++) {
            if (data.get(i) >= gteValue) {
                return i;
            }
        }
        return -1;
    }

    /** Arithmetic mean of {@code data}; {@code NaN} for an empty list. */
    private static double avg(List<Double> data) {
        int count = 0;
        double sum = 0;
        for (double val : data) {
            count += 1;
            sum += val;
        }
        return sum / count;
    }

    /** Sum of squared deviations from the mean; {@code 0} for an empty list. */
    private static double sse(List<Double> data) {
        double mean = avg(data);
        double sum = 0;
        for (double val : data) {
            sum += ((val - mean) * (val - mean));
        }
        return sum;
    }

    /**
     * Walks the lower-class-limit matrix backwards from the row describing the complete data set
     * and collects the lower bound of every class.
     *
     * @param data             sorted, non-empty data
     * @param lowerClassLimits matrix produced by {@link #jenksMatrix(List, int)}
     * @param numClasses       number of classes
     * @return list of {@code numClasses + 1} breaks (minimum, interior lower bounds, maximum)
     */
    private static List<Double> jenksBreaks(List<Double> data, int[][] lowerClassLimits, int numClasses) {
        // Every slot starts out as the minimum so that slots never reached below stay valid.
        List<Double> kClass = new ArrayList<Double>(Collections.nCopies(numClasses + 1, data.get(0)));
        kClass.set(numClasses, data.get(data.size() - 1));

        // Row l of the matrix describes the first l values, so the walk has to begin at row
        // data.size(); beginning one row earlier would leave the largest value out of the
        // optimisation.
        int k = data.size();
        for (int countNum = numClasses; countNum > 1; countNum--) {
            int lowerLimit = lowerClassLimits[k][countNum];
            if (lowerLimit < 1) {
                // No values are left for the remaining classes (reachable when ties make a
                // partition with fewer non-empty classes equally good); they keep the minimum.
                break;
            }
            // lowerLimit is the 1-based position of the first value in class countNum.
            kClass.set(countNum - 1, data.get(lowerLimit - 1));
            k = lowerLimit - 1;
        }
        return kClass;
    }

    /**
     * Builds the dynamic-programming table of optimal lower class limits.
     *
     * <p>{@code result[l][j]} is the 1-based position, within the sorted data, of the first
     * value of the last class when the first {@code l} values are split into {@code j} classes
     * with minimal total within-class sum of squared deviations. Row and column {@code 0} are
     * unused and stay zero.
     *
     * @param data       sorted, non-empty data
     * @param numClasses number of classes
     * @return matrix of size {@code (data.size() + 1) x (numClasses + 1)}
     */
    private static int[][] jenksMatrix(List<Double> data, int numClasses) {
        int n = data.size();
        int[][] lowerClassLimits = new int[n + 1][numClasses + 1];
        double[][] varianceCombinations = new double[n + 1][numClasses + 1];

        for (int j = 1; j <= numClasses; j++) {
            lowerClassLimits[1][j] = 1;
            for (int l = 2; l <= n; l++) {
                varianceCombinations[l][j] = Double.MAX_VALUE;
            }
        }

        for (int l = 2; l <= n; l++) {
            double sum = 0;
            double sumSq = 0;
            double variance = 0;
            int w = 0;
            // Grow the candidate last class backwards from position l down to position 1.
            for (int m = 1; m <= l; m++) {
                int lowerClassLimit = l - m + 1;
                double val = data.get(lowerClassLimit - 1);
                w++;
                sum += val;
                sumSq += val * val;
                // Sum of squared deviations of the values at positions lowerClassLimit..l.
                variance = sumSq - (sum * sum) / w;
                int precedingCount = lowerClassLimit - 1;
                if (precedingCount != 0) {
                    for (int j = 2; j <= numClasses; j++) {
                        double candidate = variance + varianceCombinations[precedingCount][j - 1];
                        // ">=" lets a later candidate (smaller lower limit) win ties.
                        if (varianceCombinations[l][j] >= candidate) {
                            lowerClassLimits[l][j] = lowerClassLimit;
                            varianceCombinations[l][j] = candidate;
                        }
                    }
                }
            }
            // After the inner loop, variance covers positions 1..l: the single-class cost.
            lowerClassLimits[l][1] = 1;
            varianceCombinations[l][1] = variance;
        }
        return lowerClassLimits;
    }
}