org.pentaho.di.trans.steps.univariatestats.FieldIndex Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of kettle-engine Show documentation
Show all versions of kettle-engine Show documentation
Container pom for Pentaho Data Integration modules
The newest version!
/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.univariatestats;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Class used to hold operating field index, intermediate data and final results for a stats calculation.
*
* Has functions to compute the mean, standard deviation and arbitrary percentiles. Percentiles can be computed using
* interpolation or a simple method. See
* The Engineering Statistics Handbook for details.
*/
public class FieldIndex {
public int m_columnIndex;
public double m_count;
public double m_mean;
public double m_stdDev;
public double m_max;
public double m_min;
public double m_median;
public double m_arbitraryPercentile;
public double m_sum;
public double m_sumSq;
public void calculateDerived() {
m_mean = Double.NaN;
m_stdDev = Double.NaN;
if ( m_count > 0 ) {
m_mean = m_sum / m_count;
m_stdDev = Double.POSITIVE_INFINITY;
if ( m_count > 1 ) {
m_stdDev = m_sumSq - ( m_sum * m_sum ) / m_count;
m_stdDev /= ( m_count - 1 );
if ( m_stdDev < 0 ) {
// round to zero
m_stdDev = 0;
}
m_stdDev = Math.sqrt( m_stdDev );
}
}
}
/**
* Compute a percentile. Can compute percentiles using interpolation or a simple method (see
* for details).
*
*
* @param p
* the percentile to compute (0 <= p <= 1)
* @param vals
* a sorted array of values to compute the percentile from
* @param interpolate
* true if interpolation is to be used
* @return the percentile value
*/
private double percentile( double p, double[] vals, boolean interpolate ) {
double n = m_count;
// interpolation
if ( interpolate ) {
double i = p * ( n + 1 );
// special cases
if ( i <= 1 ) {
return m_min;
}
if ( i >= n ) {
return m_max;
}
double low_obs = Math.floor( i );
double high_obs = low_obs + 1;
double r1 = high_obs - i;
double r2 = 1.0 - r1;
double x1 = vals[(int) low_obs - 1];
double x2 = vals[(int) high_obs - 1];
return ( r1 * x1 ) + ( r2 * x2 );
}
// simple method
double i = p * n;
double res = 0;
if ( i == 0 ) {
return m_min;
}
if ( i == n ) {
return m_max;
}
if ( i - Math.floor( i ) > 0 ) {
i = Math.floor( i );
res = vals[(int) i];
} else {
res = ( vals[(int) ( i - 1 )] + vals[(int) i] ) / 2.0;
}
return res;
}
/**
* Constructs an array of Objects containing the requested statistics for one univariate stats meta function using
* this FieldIndex
.
*
* @param usmf
* theUnivariateStatsMetaFunction
to compute stats for. This contains the input field selected
* by the user along with which stats to compute for it.
* @return an array of computed statistics
*/
public Object[] generateOutputValues( UnivariateStatsMetaFunction usmf, ArrayList cache ) {
calculateDerived();
// process cache?
if ( cache != null ) {
double[] result = new double[(int) m_count];
for ( int i = 0; i < cache.size(); i++ ) {
result[i] = cache.get( i ).doubleValue();
}
Arrays.sort( result );
if ( usmf.getCalcMedian() ) {
m_median = percentile( 0.5, result, usmf.getInterpolatePercentile() );
}
if ( usmf.getCalcPercentile() >= 0 ) {
m_arbitraryPercentile = percentile( usmf.getCalcPercentile(), result, usmf.getInterpolatePercentile() );
}
}
Object[] result = new Object[usmf.numberOfMetricsRequested()];
int index = 0;
if ( usmf.getCalcN() ) {
result[index++] = new Double( m_count );
}
if ( usmf.getCalcMean() ) {
result[index++] = new Double( m_mean );
}
if ( usmf.getCalcStdDev() ) {
result[index++] = new Double( m_stdDev );
}
if ( usmf.getCalcMin() ) {
result[index++] = new Double( m_min );
}
if ( usmf.getCalcMax() ) {
result[index++] = new Double( m_max );
}
if ( usmf.getCalcMedian() ) {
result[index++] = new Double( m_median );
}
if ( usmf.getCalcPercentile() >= 0 ) {
result[index++] = new Double( m_arbitraryPercentile );
}
return result;
}
}