// org.apache.flink.api.java.summarize.NumericColumnSummary Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java.summarize;
import org.apache.flink.annotation.PublicEvolving;
/**
 * Generic Column Summary for Numeric Types.
 *
 * <p>Some values are considered "missing" where "missing" is defined as null, NaN, or Infinity.
 * These values are ignored in some calculations like mean, variance, and standardDeviation.
 *
 * <p>Uses the Kahan summation algorithm to avoid numeric instability when computing variance. The
 * algorithm is described in: "Scalable and Numerically Stable Descriptive Statistics in SystemML",
 * Tian et al, International Conference on Data Engineering 2012.
 *
 * <p>This class is an immutable value holder: every statistic is supplied through the constructor
 * and returned unchanged by the getters; only the derived counts ({@link #getMissingCount()} and
 * {@link #getNonNullCount()}) are computed here.
 *
 * @param <T> the numeric type e.g. Integer, Double
 */
@PublicEvolving
public class NumericColumnSummary<T> extends ColumnSummary implements java.io.Serializable {

    private static final long serialVersionUID = 1L;

    private final long nonMissingCount; // count of elements that are NOT null, NaN, or Infinite
    private final long nullCount;
    private final long nanCount; // always zero for types like Short, Integer, Long
    private final long infinityCount; // always zero for types like Short, Integer, Long

    private final T min;
    private final T max;
    private final T sum;

    private final Double mean;
    private final Double variance;
    private final Double standardDeviation;

    /**
     * Creates a summary from pre-computed statistics. The arguments are stored as given; no
     * validation or recomputation happens here.
     *
     * @param nonMissingCount number of values that are not null, NaN, or Infinity
     * @param nullCount number of null values
     * @param nanCount number of NaN values
     * @param infinityCount number of positive or negative infinity values
     * @param min the minimum value to report
     * @param max the maximum value to report
     * @param sum the sum to report
     * @param mean the arithmetic mean to report
     * @param variance the variance to report
     * @param standardDeviation the standard deviation to report
     */
    public NumericColumnSummary(
            long nonMissingCount,
            long nullCount,
            long nanCount,
            long infinityCount,
            T min,
            T max,
            T sum,
            Double mean,
            Double variance,
            Double standardDeviation) {
        this.nonMissingCount = nonMissingCount;
        this.nullCount = nullCount;
        this.nanCount = nanCount;
        this.infinityCount = infinityCount;
        this.min = min;
        this.max = max;
        this.sum = sum;
        this.mean = mean;
        this.variance = variance;
        this.standardDeviation = standardDeviation;
    }

    /**
     * The number of "missing" values where "missing" is defined as null, NaN, or Infinity.
     *
     * <p>These values are ignored in some calculations like mean, variance, and
     * standardDeviation.
     */
    public long getMissingCount() {
        return nullCount + nanCount + infinityCount;
    }

    /** The number of values that are not null, NaN, or Infinity. */
    public long getNonMissingCount() {
        return nonMissingCount;
    }

    /**
     * The number of non-null values in this column, i.e. the non-missing values plus the NaN and
     * infinite ones.
     */
    @Override
    public long getNonNullCount() {
        return nonMissingCount + nanCount + infinityCount;
    }

    /** The number of null values in this column. */
    @Override
    public long getNullCount() {
        return nullCount;
    }

    /**
     * Number of values that are NaN.
     *
     * <p>(always zero for types like Short, Integer, Long)
     */
    public long getNanCount() {
        return nanCount;
    }

    /**
     * Number of values that are positive or negative infinity.
     *
     * <p>(always zero for types like Short, Integer, Long)
     */
    public long getInfinityCount() {
        return infinityCount;
    }

    /** Returns the minimum value that was supplied at construction. */
    public T getMin() {
        return min;
    }

    /** Returns the maximum value that was supplied at construction. */
    public T getMax() {
        return max;
    }

    /** Returns the sum that was supplied at construction. */
    public T getSum() {
        return sum;
    }

    /**
     * Null, NaN, and Infinite values are ignored in this calculation.
     *
     * @see <a href="https://en.wikipedia.org/wiki/Arithmetic_mean">Arithmetic Mean</a>
     */
    public Double getMean() {
        return mean;
    }

    /**
     * Variance is a measure of how far a set of numbers are spread out.
     *
     * <p>Null, NaN, and Infinite values are ignored in this calculation.
     *
     * @see <a href="https://en.wikipedia.org/wiki/Variance">Variance</a>
     */
    public Double getVariance() {
        return variance;
    }

    /**
     * Standard Deviation is a measure of variation in a set of numbers. It is the square root of
     * the variance.
     *
     * <p>Null, NaN, and Infinite values are ignored in this calculation.
     *
     * @see <a href="https://en.wikipedia.org/wiki/Standard_deviation">Standard Deviation</a>
     */
    public Double getStandardDeviation() {
        return standardDeviation;
    }

    /**
     * Renders every stored and derived statistic as {@code NumericColumnSummary{name=value, ...}}.
     * The total count comes from the inherited {@code getTotalCount()}.
     */
    @Override
    public String toString() {
        return "NumericColumnSummary{"
                + "totalCount="
                + getTotalCount()
                + ", nullCount="
                + nullCount
                + ", nonNullCount="
                + getNonNullCount()
                + ", missingCount="
                + getMissingCount()
                + ", nonMissingCount="
                + nonMissingCount
                + ", nanCount="
                + nanCount
                + ", infinityCount="
                + infinityCount
                + ", min="
                + min
                + ", max="
                + max
                + ", sum="
                + sum
                + ", mean="
                + mean
                + ", variance="
                + variance
                + ", standardDeviation="
                + standardDeviation
                + '}';
    }
}