au.csiro.variantspark.algo.split.JNaiveContinousIndexedSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of variant-spark_2.11 Show documentation
Genomic variants interpretation toolkit
The newest version!
package au.csiro.variantspark.algo.split;

import au.csiro.variantspark.algo.IndexedSplitAggregator;
import au.csiro.variantspark.algo.SplitInfo;
import it.unimi.dsi.fastutil.doubles.DoubleArrays;


/**
 * A naive implementation of a precise splitter for a continuous variable.
 *
 * <p>The values selected by the subset indices are sorted in ascending order and
 * every boundary between two different consecutive values is evaluated as a
 * candidate threshold; the candidate with the lowest impurity is reported.
 *
 * @author szu004
 */
public class JNaiveContinousIndexedSplitter extends AbstractIndexedSplitterBase {

	/** Values of the variable; addressed by the indices passed to {@link #doFindSplit(int[])}. */
	private final double[] data;

	/**
	 * @param impurityCalc aggregator handed to the base class; it is queried and updated
	 *                     while candidate thresholds are scanned
	 * @param data         values of the continuous variable (stored by reference, not copied)
	 */
	public JNaiveContinousIndexedSplitter(IndexedSplitAggregator impurityCalc, double[] data) {
		super(impurityCalc);
		this.data = data;
	}

	/**
	 * Finds the threshold with the lowest impurity for the given subset.
	 *
	 * <p>A split at threshold {@code t} puts values {@code v <= t} on the left and
	 * values {@code t < v} on the right, so a threshold is only considered where the
	 * sorted values change. A subset needs at least two different values to be
	 * splittable.
	 *
	 * @param splitIndices indices into {@code data} that form the subset to split
	 * @return the best split found, or {@code null} if the subset is empty or no
	 *         candidate threshold improved on the initial impurity bound
	 */
	@Override
	protected SplitInfo doFindSplit(int[] splitIndices) {
		final int subsetSize = splitIndices.length;
		if (subsetSize == 0) {
			// Nothing to split; previously this case failed with an
			// ArrayIndexOutOfBoundsException when reading the first sorted element.
			return null;
		}

		// TODO: [Performance] this is where the sorting trick might be useful.
		// Gather the subset values and sort positions (not the values themselves)
		// in ascending order of the values they refer to.
		final double[] splitValues = new double[subsetSize];
		final int[] order = new int[subsetSize];
		for (int i = 0; i < subsetSize; i++) {
			splitValues[i] = data[splitIndices[i]];
			order[i] = i;
		}
		DoubleArrays.quickSortIndirect(order, splitValues);

		double minImpurity = Double.MAX_VALUE;
		double splitValue = Double.NaN;
		double splitLeftImpurity = Double.NaN;
		double splitRightImpurity = Double.NaN;
		double lastValue = splitValues[order[0]];

		// NOTE(review): a NaN in data compares unequal to everything (itself included),
		// so it could be picked as the threshold and then be reported as "no split"
		// by the NaN check below. Confirm that inputs are NaN-free.

		// Walk the subset from the smallest value upwards.
		for (int position : order) {
			final double currentValue = splitValues[position];
			if (currentValue != lastValue) {
				// The value changed, so lastValue is a possible split threshold.
				// Repeated values are skipped because a split cannot separate them.
				final double lastValueImpurity = impurityCalc.getValue(leftRightImpurity);
				if (lastValueImpurity < minImpurity) {
					// Better than anything seen so far: remember this candidate.
					splitValue = lastValue;
					minImpurity = lastValueImpurity;
					splitLeftImpurity = leftRightImpurity.left();
					splitRightImpurity = leftRightImpurity.right();
				}
			}
			// Feed the original data index (not the sort position) to the aggregator.
			impurityCalc.update(splitIndices[position]);
			lastValue = currentValue;
		}

		// splitValue stays NaN when no candidate threshold was accepted.
		return Double.isNaN(splitValue)
				? null
				: new SplitInfo(splitValue, minImpurity, splitLeftImpurity, splitRightImpurity);
	}
}