All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pentaho.di.trans.steps.univariatestats.UnivariateStatsMetaFunction Maven / Gradle / Ivy

The newest version!
/*! ******************************************************************************
 *
 
* Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.univariatestats;

import java.util.Objects;

import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.xml.XMLHandler;
import org.pentaho.di.repository.ObjectId;
import org.pentaho.di.repository.Repository;
import org.pentaho.metastore.api.IMetaStore;
import org.w3c.dom.Node;

/**
 * Holds meta information about one univariate stats calculation: source field name and what derived values are to be
 * computed
 *
 * @author Mark Hall (mhall{[at]}pentaho.org
 * @version 1.0
 */
public class UnivariateStatsMetaFunction implements Cloneable {

  public static final String XML_TAG = "univariate_stats";

  private String m_sourceFieldName;
  private boolean m_n = true;
  private boolean m_mean = true;
  private boolean m_stdDev = true;
  private boolean m_min = true;
  private boolean m_max = true;
  private boolean m_median = true;
  private double m_arbitraryPercentile = -1;
  private boolean m_interpolatePercentile = true;

  /**
   * Creates a new UnivariateStatsMetaFunction
   *
   * @param sourceFieldName
   *          the name of the input field to compute stats for
   * @param n
   *          output N
   * @param mean
   *          compute and output the mean
   * @param stdDev
   *          compute and output the standard deviation
   * @param min
   *          output the minumum value
   * @param max
   *          output the maximum value
   * @param median
   *          compute and output the median (requires data caching and sorting)
   * @param arbPercentile
   *          compute and output a percentile (0 <= arbPercentile <= 1)
   * @param interpolate
   *          true if interpolation is to be used for percentiles (rather than a simple method). See  The Engineering Statistics
   *          Handbook for details.
   */
  public UnivariateStatsMetaFunction( String sourceFieldName, boolean n, boolean mean, boolean stdDev,
    boolean min, boolean max, boolean median, double arbPercentile, boolean interpolate ) {
    m_sourceFieldName = sourceFieldName;
    m_n = n;
    m_mean = mean;
    m_stdDev = stdDev;
    m_min = min;
    m_max = max;
    m_median = median;
    m_arbitraryPercentile = arbPercentile;
    m_interpolatePercentile = interpolate;
  }

  /**
   * Construct from an XML node
   *
   * @param uniNode
   *          a XML node
   */
  public UnivariateStatsMetaFunction( Node uniNode ) {
    String temp;
    m_sourceFieldName = XMLHandler.getTagValue( uniNode, "source_field_name" );

    temp = XMLHandler.getTagValue( uniNode, "N" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_n = false;
    }
    temp = XMLHandler.getTagValue( uniNode, "mean" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_mean = false;
    }

    temp = XMLHandler.getTagValue( uniNode, "stdDev" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_stdDev = false;
    }

    temp = XMLHandler.getTagValue( uniNode, "min" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_min = false;
    }

    temp = XMLHandler.getTagValue( uniNode, "max" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_max = false;
    }

    temp = XMLHandler.getTagValue( uniNode, "median" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_median = false;
    }

    temp = XMLHandler.getTagValue( uniNode, "percentile" );
    try {
      m_arbitraryPercentile = Double.parseDouble( temp );
    } catch ( Exception ex ) {
      m_arbitraryPercentile = -1;
    }

    temp = XMLHandler.getTagValue( uniNode, "interpolate" );
    if ( temp.equalsIgnoreCase( "N" ) ) {
      m_interpolatePercentile = false;
    }
  }

  /**
   * Construct using data stored in repository
   *
   * @param rep
   *          the repository
   * @param id_step
   *          the id of the step
   * @param nr
   *          the step number
   * @exception KettleException
   *              if an error occurs
   */
  public UnivariateStatsMetaFunction( Repository rep, ObjectId id_step, int nr ) throws KettleException {
    m_sourceFieldName = rep.getStepAttributeString( id_step, nr, "source_field_name" );
    m_n = rep.getStepAttributeBoolean( id_step, nr, "N" );
    m_mean = rep.getStepAttributeBoolean( id_step, nr, "mean" );
    m_stdDev = rep.getStepAttributeBoolean( id_step, nr, "stdDev" );
    m_min = rep.getStepAttributeBoolean( id_step, nr, "min" );
    m_max = rep.getStepAttributeBoolean( id_step, nr, "max" );
    m_median = rep.getStepAttributeBoolean( id_step, nr, "median" );
    String temp = rep.getStepAttributeString( id_step, nr, "percentile" );
    try {
      m_arbitraryPercentile = Double.parseDouble( temp );
    } catch ( Exception ex ) {
      m_arbitraryPercentile = -1;
    }
    m_interpolatePercentile = rep.getStepAttributeBoolean( id_step, nr, "interpolate" );
  }

  /**
   * Check for equality
   *
   * @param obj
   *          an UnivarateStatsMetaFunction to compare against
   * @return true if this Object and the supplied one are the same
   */
  public boolean equals( Object obj ) {
    if ( ( obj != null ) && ( obj.getClass().equals( this.getClass() ) ) ) {
      UnivariateStatsMetaFunction mf = (UnivariateStatsMetaFunction) obj;

      return ( getXML().equals( mf.getXML() ) );
    }

    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash( m_sourceFieldName, m_n, m_mean, m_stdDev, m_min, m_max, m_median, m_arbitraryPercentile,
        m_interpolatePercentile );
  }

  /**
   * Return a String containing XML describing this UnivariateStatsMetaFunction
   *
   * @return an XML description of this UnivarateStatsMetaFunction
   */
  public String getXML() {
    String xml = ( "<" + XML_TAG + ">" );

    xml += XMLHandler.addTagValue( "source_field_name", m_sourceFieldName );
    xml += XMLHandler.addTagValue( "N", m_n );
    xml += XMLHandler.addTagValue( "mean", m_mean );
    xml += XMLHandler.addTagValue( "stdDev", m_stdDev );
    xml += XMLHandler.addTagValue( "min", m_min );
    xml += XMLHandler.addTagValue( "max", m_max );
    xml += XMLHandler.addTagValue( "median", m_median );
    xml += XMLHandler.addTagValue( "percentile", "" + m_arbitraryPercentile );
    xml += XMLHandler.addTagValue( "interpolate", m_interpolatePercentile );

    xml += ( "" );

    return xml;
  }

  /**
   * Save this UnivariateStatsMetaFunction to a repository
   *
   * @param rep
   *          the repository to save to
   * @param id_transformation
   *          the transformation id
   * @param id_step
   *          the step id
   * @param nr
   *          the step number
   * @exception KettleException
   *              if an error occurs
   */
  public void saveRep( Repository rep, IMetaStore metaStore, ObjectId id_transformation, ObjectId id_step, int nr ) throws KettleException {

    rep.saveStepAttribute( id_transformation, id_step, nr, "source_field_name", m_sourceFieldName );
    rep.saveStepAttribute( id_transformation, id_step, nr, "N", m_n );
    rep.saveStepAttribute( id_transformation, id_step, nr, "mean", m_mean );
    rep.saveStepAttribute( id_transformation, id_step, nr, "stdDev", m_stdDev );
    rep.saveStepAttribute( id_transformation, id_step, nr, "min", m_min );
    rep.saveStepAttribute( id_transformation, id_step, nr, "max", m_max );
    rep.saveStepAttribute( id_transformation, id_step, nr, "median", m_median );
    rep.saveStepAttribute( id_transformation, id_step, nr, "percentile", " " + m_arbitraryPercentile );
    rep.saveStepAttribute( id_transformation, id_step, nr, "interpolate", m_interpolatePercentile );
  }

  /**
   * Make a copy
   *
   * @return a copy of this UnivariateStatsMetaFunction.
   */
  public Object clone() {
    try {
      UnivariateStatsMetaFunction retval = (UnivariateStatsMetaFunction) super.clone();

      return retval;
    } catch ( CloneNotSupportedException e ) {
      return null;
    }
  }

  /**
   * Set the name of the input field used by this UnivariateStatsMetaFunction.
   *
   * @param sn
   *          the name of the source field to use
   */
  public void setSourceFieldName( String sn ) {
    m_sourceFieldName = sn;
  }

  /**
   * Return the name of the input field used by this UnivariateStatsMetaFunction
   *
   * @return the name of the input field used
   */
  public String getSourceFieldName() {
    return m_sourceFieldName;
  }

  /**
   * Set whether to calculate N for this input field
   *
   * @param n
   *          true if N is to be calculated
   */
  public void setCalcN( boolean n ) {
    m_n = n;
  }

  /**
   * Get whether N is to be calculated for this input field
   *
   * @return true if N is to be calculated
   */
  public boolean getCalcN() {
    return m_n;
  }

  /**
   * Set whether to calculate the mean for this input field
   *
   * @param b
   *          true if the mean is to be calculated
   */
  public void setCalcMean( boolean b ) {
    m_mean = b;
  }

  /**
   * Get whether the mean is to be calculated for this input field
   *
   * @return true if the mean is to be calculated
   */
  public boolean getCalcMean() {
    return m_mean;
  }

  /**
   * Set whether the standard deviation is to be calculated for this input value
   *
   * @param b
   *          true if the standard deviation is to be calculated
   */
  public void setCalcStdDev( boolean b ) {
    m_stdDev = b;
  }

  /**
   * Get whether the standard deviation is to be calculated for this input value
   *
   * @return true if the standard deviation is to be calculated
   */
  public boolean getCalcStdDev() {
    return m_stdDev;
  }

  /**
   * Set whether the minimum is to be calculated for this input value
   *
   * @param b
   *          true if the minimum is to be calculated
   */
  public void setCalcMin( boolean b ) {
    m_min = b;
  }

  /**
   * Get whether the minimum is to be calculated for this input value
   *
   * @return true if the minimum is to be calculated
   */
  public boolean getCalcMin() {
    return m_min;
  }

  /**
   * Set whether the maximum is to be calculated for this input value
   *
   * @param b
   *          true if the maximum is to be calculated
   */
  public void setCalcMax( boolean b ) {
    m_max = b;
  }

  /**
   * Get whether the maximum is to be calculated for this input value
   *
   * @return true if the maximum is to be calculated
   */
  public boolean getCalcMax() {
    return m_max;
  }

  /**
   * Set whether the median is to be calculated for this input value
   *
   * @param b
   *          true if the median is to be calculated
   */
  public void setCalcMedian( boolean b ) {
    m_median = b;
  }

  /**
   * Get whether the median is to be calculated for this input value
   *
   * @return true if the median is to be calculated
   */
  public boolean getCalcMedian() {
    return m_median;
  }

  /**
   * Get whether interpolation is to be used in the computation of percentiles
   *
   * @return true if interpolation is to be used
   */
  public boolean getInterpolatePercentile() {
    return m_interpolatePercentile;
  }

  /**
   * Set whether interpolation is to be used in the computation of percentiles
   *
   * @param i
   *          true is interpolation is to be used
   */
  public void setInterpolatePercentile( boolean i ) {
    m_interpolatePercentile = i;
  }

  /**
   * Gets whether an arbitrary percentile is to be calculated for this input field
   *
   * @return true if a percentile is to be computed
   */
  public double getCalcPercentile() {
    return m_arbitraryPercentile;
  }

  /**
   * Sets whether an arbitrary percentile is to be calculated for this input field
   *
   * @param percentile
   *          the percentile to compute (0 <= percentile <= 100)
   */
  public void setCalcPercentile( double percentile ) {
    if ( percentile < 0 ) {
      m_arbitraryPercentile = -1; // not used
      return;
    }

    if ( percentile >= 0 && percentile <= 100 ) {
      m_arbitraryPercentile = percentile / 100.0;
      return;
    }

    m_arbitraryPercentile = -1; // not used
  }

  /**
   * Returns the number of metrics to compute
   *
   * @return the number of metrics to compute
   */
  public int numberOfMetricsRequested() {
    int num = 0;

    if ( getCalcN() ) {
      num++;
    }
    if ( getCalcMean() ) {
      num++;
    }
    if ( getCalcStdDev() ) {
      num++;
    }
    if ( getCalcMin() ) {
      num++;
    }
    if ( getCalcMax() ) {
      num++;
    }
    if ( getCalcMedian() ) {
      num++;
    }
    if ( getCalcPercentile() >= 0 ) {
      num++;
    }
    return num;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy