All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.api.java.summarize.NumericColumnSummary Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java.summarize;

import org.apache.flink.annotation.PublicEvolving;

/**
 * Generic Column Summary for Numeric Types.
 *
 * 

Some values are considered "missing" where "missing" is defined as null, NaN, or Infinity. * These values are ignored in some calculations like mean, variance, and standardDeviation. * *

Uses the Kahan summation algorithm to avoid numeric instability when computing variance. The * algorithm is described in: "Scalable and Numerically Stable Descriptive Statistics in SystemML", * Tian et al, International Conference on Data Engineering 2012. * * @param the numeric type e.g. Integer, Double */ @PublicEvolving public class NumericColumnSummary extends ColumnSummary implements java.io.Serializable { private static final long serialVersionUID = 1L; private final long nonMissingCount; // count of elements that are NOT null, NaN, or Infinite private final long nullCount; private final long nanCount; // always zero for types like Short, Integer, Long private final long infinityCount; // always zero for types like Short, Integer, Long private final T min; private final T max; private final T sum; private final Double mean; private final Double variance; private final Double standardDeviation; public NumericColumnSummary( long nonMissingCount, long nullCount, long nanCount, long infinityCount, T min, T max, T sum, Double mean, Double variance, Double standardDeviation) { this.nonMissingCount = nonMissingCount; this.nullCount = nullCount; this.nanCount = nanCount; this.infinityCount = infinityCount; this.min = min; this.max = max; this.sum = sum; this.mean = mean; this.variance = variance; this.standardDeviation = standardDeviation; } /** * The number of "missing" values where "missing" is defined as null, NaN, or Infinity. * *

These values are ignored in some calculations like mean, variance, and standardDeviation. */ public long getMissingCount() { return nullCount + nanCount + infinityCount; } /** The number of values that are not null, NaN, or Infinity. */ public long getNonMissingCount() { return nonMissingCount; } /** The number of non-null values in this column. */ @Override public long getNonNullCount() { return nonMissingCount + nanCount + infinityCount; } @Override public long getNullCount() { return nullCount; } /** * Number of values that are NaN. * *

(always zero for types like Short, Integer, Long) */ public long getNanCount() { return nanCount; } /** * Number of values that are positive or negative infinity. * *

(always zero for types like Short, Integer, Long) */ public long getInfinityCount() { return infinityCount; } public T getMin() { return min; } public T getMax() { return max; } public T getSum() { return sum; } /** * Null, NaN, and Infinite values are ignored in this calculation. * * @see Arithmetic Mean */ public Double getMean() { return mean; } /** * Variance is a measure of how far a set of numbers are spread out. * *

Null, NaN, and Infinite values are ignored in this calculation. * * @see Variance */ public Double getVariance() { return variance; } /** * Standard Deviation is a measure of variation in a set of numbers. It is the square root of * the variance. * *

Null, NaN, and Infinite values are ignored in this calculation. * * @see Standard Deviation */ public Double getStandardDeviation() { return standardDeviation; } @Override public String toString() { return "NumericColumnSummary{" + "totalCount=" + getTotalCount() + ", nullCount=" + nullCount + ", nonNullCount=" + getNonNullCount() + ", missingCount=" + getMissingCount() + ", nonMissingCount=" + nonMissingCount + ", nanCount=" + nanCount + ", infinityCount=" + infinityCount + ", min=" + min + ", max=" + max + ", sum=" + sum + ", mean=" + mean + ", variance=" + variance + ", standardDeviation=" + standardDeviation + '}'; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy