All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.api.java.summarize.NumericColumnSummary Maven / Gradle / Ivy

There is a newer version: 1.20.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java.summarize;

import org.apache.flink.annotation.PublicEvolving;

/**
 * Generic Column Summary for Numeric Types.
 *
 * Some values are considered "missing" where "missing" is defined as null, NaN, or Infinity.
 * These values are ignored in some calculations like mean, variance, and standardDeviation.
 *
 * Uses the Kahan summation algorithm to avoid numeric instability when computing variance.
 * The algorithm is described in: "Scalable and Numerically Stable Descriptive Statistics in SystemML",
 * Tian et al, International Conference on Data Engineering 2012.
 *
 * @param  the numeric type e.g. Integer, Double
 */
@PublicEvolving
public class NumericColumnSummary extends ColumnSummary implements java.io.Serializable {

	private static final long serialVersionUID = 1L;

	private final long nonMissingCount; // count of elements that are NOT null, NaN, or Infinite
	private final long nullCount;
	private final long nanCount; // always zero for types like Short, Integer, Long
	private final long infinityCount; // always zero for types like Short, Integer, Long

	private final T min;
	private final T max;
	private final T sum;

	private final Double mean;
	private final Double variance;
	private final Double standardDeviation;

	public NumericColumnSummary(long nonMissingCount, long nullCount, long nanCount, long infinityCount, T min, T max, T sum, Double mean, Double variance, Double standardDeviation) {
		this.nonMissingCount = nonMissingCount;
		this.nullCount = nullCount;
		this.nanCount = nanCount;
		this.infinityCount = infinityCount;
		this.min = min;
		this.max = max;
		this.sum = sum;
		this.mean = mean;
		this.variance = variance;
		this.standardDeviation = standardDeviation;
	}

	/**
	 * The number of "missing" values where "missing" is defined as null, NaN, or Infinity.
	 *
	 * These values are ignored in some calculations like mean, variance, and standardDeviation.
	 */
	public long getMissingCount() {
		return nullCount + nanCount + infinityCount;
	}

	/**
	 * The number of values that are not null, NaN, or Infinity.
	 */
	public long getNonMissingCount() {
		return nonMissingCount;
	}

	/**
	 * The number of non-null values in this column
	 */
	@Override
	public long getNonNullCount() {
		return nonMissingCount + nanCount + infinityCount;
	}

	@Override
	public long getNullCount() {
		return nullCount;
	}

	/**
	 * Number of values that are NaN.
	 *
	 * (always zero for types like Short, Integer, Long)
	 */
	public long getNanCount() {
		return nanCount;
	}

	/**
	 * Number of values that are positive or negative infinity.
	 *
	 * (always zero for types like Short, Integer, Long)
	 */
	public long getInfinityCount() {
		return infinityCount;
	}

	public T getMin() {
		return min;
	}

	public T getMax() {
		return max;
	}

	public T getSum() {
		return sum;
	}

	/**
	 * Null, NaN, and Infinite values are ignored in this calculation.
	 *
	 * @see Arithmetic Mean
	 */
	public Double getMean() {
		return mean;
	}

	/**
	 * Variance is a measure of how far a set of numbers are spread out.
	 *
	 * Null, NaN, and Infinite values are ignored in this calculation.
	 *
	 * @see Variance
	 */
	public Double getVariance() {
		return variance;
	}

	/**
	 * Standard Deviation is a measure of variation in a set of numbers.  It is the square root of the variance.
	 *
	 * Null, NaN, and Infinite values are ignored in this calculation.
	 *
	 * @see Standard Deviation
	 */
	public Double getStandardDeviation() {
		return standardDeviation;
	}

	@Override
	public String toString() {
		return "NumericColumnSummary{" +
			"totalCount=" + getTotalCount() +
			", nullCount=" + nullCount +
			", nonNullCount=" + getNonNullCount() +
			", missingCount=" + getMissingCount() +
			", nonMissingCount=" + nonMissingCount +
			", nanCount=" + nanCount +
			", infinityCount=" + infinityCount +
			", min=" + min +
			", max=" + max +
			", sum=" + sum +
			", mean=" + mean +
			", variance=" + variance +
			", standardDeviation=" + standardDeviation +
			'}';
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy