All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.math.Histogram Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.math;

import java.util.Arrays;

/**
 * Histogram utilities. A histogram is a graphical display of tabulated
 * frequencies, shown as bars. It shows what proportion of cases fall into
 * each of several categories: it is a form of data binning. The categories
 * are usually specified as non-overlapping intervals of some variable.
 * 

* There is no "best" number of bins, and different bin sizes can reveal * different features of the data. Depending on the actual data distribution * and the goals of the analysis, different bin widths may be appropriate, * so experimentation is usually needed to determine an appropriate width. *

* Note that this class provides only tools to choose the bin width or the * number of bins and frequency counting. It does NOT providing plotting * services. * * @author Haifeng Li */ public class Histogram { /** * Generate the histogram of given data. The number of bins k is decided by * square-root choice. * @param data the data points. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(int[] data) { return histogram(data, bins(data.length)); } /** * Generate the histogram of given data. The number of bins k is decided by * square-root choice. * @param data the data points. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(float[] data) { return histogram(data, bins(data.length)); } /** * Generate the histogram of given data. The number of bins k is decided by * square-root choice. * @param data the data points. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(double[] data) { return histogram(data, bins(data.length)); } /** * Generate the histogram of k bins. * @param data the data points. * @param k the number of bins. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(int[] data, int k) { if (k <= 1) { throw new IllegalArgumentException("Invlaid number of bins: " + k); } int min = Math.min(data); int max = Math.max(data); int span = max - min + 1; int width = 1; int residual = 1; while (residual > 0) { width = span / k; if (width == 0) { width = 1; } residual = span - k * width; if (residual > 0) { k += 1; } } double center = width / 2.0; double[] breaks = new double[k + 1]; breaks[0] = min - center; for (int i = 1; i <= k; i++) { breaks[i] = breaks[i - 1] + width; } return histogram(data, breaks); } /** * Generate the histogram of n bins. * @param data the data points. * @param breaks an array of size k+1 giving the breakpoints between * histogram cells. Must be in ascending order. * @return a 3-by-n bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(int[] data, double[] breaks) { int k = breaks.length - 1; if (k <= 1) { throw new IllegalArgumentException("Invlaid number of bins: " + k); } double[][] freq = new double[3][k]; for (int i = 0; i < k; i++) { freq[0][i] = breaks[i]; freq[1][i] = breaks[i + 1]; freq[2][i] = 0; } for (int d : data) { int j = Arrays.binarySearch(breaks, d); if (j >= k) { j = k - 1; } if (j < -1 && j >= -breaks.length) { j = -j - 2; } if (j >= 0) { freq[2][j]++; } } return freq; } /** * Generate the histogram of n bins. * @param data the data points. * @param k the number of bins. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(float[] data, int k) { if (k <= 1) { throw new IllegalArgumentException("Invlaid number of bins: " + k); } float min = Math.min(data); float max = Math.max(data); float span = max - min; if (span == 0) { span = k; } float width = span / k; float[] breaks = new float[k + 1]; breaks[0] = min; for (int i = 1; i < k; i++) { breaks[i] = breaks[i - 1] + width; } breaks[k] = max; return histogram(data, breaks); } /** * Generate the histogram of n bins. * @param data the data points. * @param breaks an array of size k+1 giving the breakpoints between * histogram cells. Must be in ascending order. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(float[] data, float[] breaks) { int k = breaks.length - 1; if (k <= 1) { throw new IllegalArgumentException("Invlaid number of bins: " + k); } double[][] freq = new double[3][k]; for (int i = 0; i < k; i++) { freq[0][i] = breaks[i]; freq[1][i] = breaks[i + 1]; freq[2][i] = 0.0f; } for (float d : data) { int j = Arrays.binarySearch(breaks, d); if (j >= k) { j = k - 1; } if (j < -1 && j >= -breaks.length) { j = -j - 2; } if (j >= 0) { freq[2][j]++; } } return freq; } /** * Generate the histogram of n bins. * @param data the data points. * @param k the number of bins. * @return a 3-by-k array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(double[] data, int k) { double min = Math.min(data); double max = Math.max(data); double span = max - min; if (span == 0) { span = k; } double width = span / k; double[] breaks = new double[k + 1]; breaks[0] = min; for (int i = 1; i < k; i++) { breaks[i] = breaks[i - 1] + width; } breaks[k] = max; return histogram(data, breaks); } /** * Generate the histogram of n bins. * @param data the data points. * @param breaks an array of size k+1 giving the breakpoints between * histogram cells. Must be in ascending order. * @return a 3-by-k bins array of which first row is the lower bound of bins, * second row is the upper bound of bins, and the third row is the frequence * count. */ public static double[][] histogram(double[] data, double[] breaks) { int k = breaks.length - 1; if (k <= 1) { throw new IllegalArgumentException("Invlaid number of bins: " + k); } double[][] freq = new double[3][k]; for (int i = 0; i < k; i++) { freq[0][i] = breaks[i]; freq[1][i] = breaks[i + 1]; freq[2][i] = 0.0f; } for (double d : data) { int j = Arrays.binarySearch(breaks, d); if (j >= k) { j = k - 1; } if (j < -1 && j >= -breaks.length) { j = -j - 2; } if (j >= 0) { freq[2][j]++; } } return freq; } /** * Returns the breakpoints between histogram cells for a dataset based on a * suggested bin width h. * @param x the data set. * @param h the bin width. * @return the breakpoints between histogram cells */ public static double[] breaks(double[] x, double h) { return breaks(Math.min(x), Math.max(x), h); } /** * Returns the breakpoints between histogram cells for a given range based * on a suggested bin width h. * @param min the lower bound of bins. * @param max the upper bound of bins. * @param h the bin width. * @return the breakpoints between histogram cells */ public static double[] breaks(double min, double max, double h) { if (h <= 0.0) { throw new IllegalArgumentException("Invalid bin width: " + h); } if (min > max) { throw new IllegalArgumentException("Invalid lower and upper bounds: " + min + " > " + max); } int k = (int) Math.ceil((max-min) / h); double[] breaks = new double[k + 1]; breaks[0] = min - (h * k - (max - min)) / 2; breaks[k] = max + (h * k - (max - min)) / 2; for (int i = 1; i < k; i++) { breaks[i] = breaks[i - 1] + h; } return breaks; } /** * Returns the breakpoints between histogram cells for a dataset. * @param x the data set. * @param k the number of bins. * @return the breakpoints between histogram cells */ public static double[] breaks(double[] x, int k) { return breaks(Math.min(x), Math.max(x), k); } /** * Returns the breakpoints between histogram cells for a given range. * @param min the lower bound of bins. * @param max the upper bound of bins. * @param k the number of bins. * @return the breakpoints between histogram cells */ public static double[] breaks(double min, double max, int k) { if (k <= 1) { throw new IllegalArgumentException("Invalid number of bins: " + k); } if (min > max) { throw new IllegalArgumentException("Invalid lower and upper bounds: " + min + " > " + max); } double h = (max - min) / k; return breaks(min, max, h); } /** * Returns the number of bins for a data based on a suggested bin width h. * @param x the data set. * @param h the bin width. * @return the number of bins k = ceil((max - min) / h) */ public static int bins(double[] x, double h) { if (h <= 0.0) { throw new IllegalArgumentException("Invalid bin width: " + h); } double max = Math.max(x); double min = Math.min(x); return (int) Math.ceil((max-min) / h); } /** * Returns the number of bins by square-root rule, which takes the square * root of the number of data points in the sample (used by Excel histograms * and many others). * @param n the number of data points. * @return the number of bins */ public static int bins(int n) { int k = (int) Math.sqrt(n); if (k < 5) k = 5; return k; } /** * Returns the number of bins by Sturges' rule k = ceil(log2(n) + 1). * @param n the number of data points. * @return the number of bins */ public static int sturges(int n) { int k = (int) Math.ceil(Math.log2(n) + 1); if (k < 5) k = 5; return k; } /** * Returns the number of bins by Scott's rule h = 3.5 * σ / (n1/3). * @param x the data set. * @return the number of bins */ public static int scott(double[] x) { double h = Math.ceil(3.5 * Math.sd(x) / Math.pow(x.length, 1.0/3)); return bins(x, h); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy