All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.Histogram Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils;


import org.broadinstitute.hellbender.exceptions.GATKException;

import java.util.Arrays;
import java.util.TreeSet;

/**
 * Class used for storing a list of doubles as a run length encoded histogram that compresses the data into bins spaced
 * at defined intervals.
 */
public class Histogram {
    private Double binSize;
    private String precisionFormat;
    private String printDelim;
    final private Double BIN_EPSILON = 0.01;

    private CompressedDataList dataList = new CompressedDataList<>();

    /**
     * Create an empty histogram object with a default bin size of 0.1
     */
    public Histogram() {
        this.binSize = 0.1;
        precisionFormat = "%.1f";
    }

    /**
     * Create an empty histogram object with a with a specified bin size
     * @param binSize size of the bins to compress the data into
     */
    public Histogram(final Double binSize) {
        this.binSize = binSize;
        precisionFormat = "%." + Math.round(-Math.log10(binSize)) + "f";
    }

    /**
     * Add a value to be stored in the histogram
     * @param d Data to be added to the histogram
     */
    public void add(final Double d) {
        if (d.isNaN()) {
            return;
        }
        long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            dataList.add((int) binKey);
        } else {
            throw new GATKException("Histogram values are suspiciously extreme.  Failed to add " + d + " to the Histogram.");
        }
    }

    /**
     * Add multiple copies of the same value into the histogram to be stored in the histogram
     * @param d Data to be added to the histogram
     * @param count number of repetitions of the value to add to the histogram
     */
    public void add(final Double d, final int count) {
        if (count < 1) {
            throw new GATKException("Cannot add non-positive counts to Histogram.");
        }
        long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            dataList.add((int) binKey, count);
        } else {
            throw new GATKException("Histogram values are suspiciously extreme.  Failed to add " + d + " to the Histogram.");
        }
    }

    /**
     * Combine two histogram objects that have the same bin by adding the values to each bin
     * @param h histogram to add
     */
    public void add(final Histogram h) {
        if (!this.binSize.equals(h.binSize)) {
            throw new GATKException("Histogram bin sizes are mismatched -- cannot add bin size " + this.binSize + " to " + h.binSize);
        }
        this.dataList.add(h.dataList);
    }

    /**
     * Return the count of items in the bin corresponding to the provided value
     * @param d Value to test
     * @return Number of items in the bin mapped by the provided key
     */
    public Integer get(final Double d) {
        long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            return dataList.getValueCounts().get((int) binKey);
        } else {
            throw new GATKException("Requested value is suspiciously extreme.  Failed to retrieve " + d + " from the Histogram.");
        }
    }

    /**
     *
     * @return may be null if Histogram is empty
     */

    public Double median() {
        int numItems = 0;
        for(final int count : dataList.valueCounts.values()) {
            numItems += count;
        }
        boolean oddNumberValues = true;
        if(numItems % 2 == 0) {
            oddNumberValues = false;
        }
        int medianIndex = (numItems+1)/2;

        int counter = 0;
        Double firstMedian = null;
        for(final Integer key : new TreeSet<>(dataList.valueCounts.keySet())) {
                counter += dataList.valueCounts.get(key);
            if( counter > medianIndex) {
                if (firstMedian == null) {
                    return key * binSize;
                }
                else {
                    return (firstMedian+key)/2.0*binSize;
                }
            }
            if( counter == medianIndex) {
                if (oddNumberValues) {
                    return key * binSize;
                }
                else {
                    firstMedian = (double) key;
                }
            }
        }
        return null;
    }

    private long getBinnedValue(double d) {
        return Math.round(Math.floor((d+BIN_EPSILON*binSize)/binSize)); //add a little epsilon before division so values exactly on bin boundaries will stay in the same bin
    }

    private boolean isValidBinKey(long binnedValue) {
        return binnedValue <= Integer.MAX_VALUE && binnedValue >= Integer.MIN_VALUE;
    }

    @Override
    public String toString(){
        printDelim = ",";
        String str = "";
        Object[] keys = dataList.valueCounts.keySet().toArray();
        if (keys.length == 0) {
            return Double.toString(Double.NaN);
        }
        Arrays.sort(keys);
        for (Object i: keys){
            if(!str.isEmpty()) {
                str += printDelim;
            }
            str+=(String.format(precisionFormat,(double)(int)i*binSize)+printDelim+dataList.valueCounts.get(i));  //key i needs to be output with specified precision
        }
        return str;
    }

    /**
     *
     * @return true if there is no data in this histogram
     */
    public boolean isEmpty() {
        return dataList.isEmpty();
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy