All downloads are free. The search and download functionality uses the official Maven repository.

weka.filters.unsupervised.attribute.InterquartileRange Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * InterquartileRange.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.filters.SimpleBatchFilter;

/**
 *  A filter for detecting outliers and extreme values
 * based on interquartile ranges. The filter skips the class attribute.
*
* Outliers:
* Q3 + OF*IQR < x <= Q3 + EVF*IQR
* or
* Q1 - EVF*IQR <= x < Q1 - OF*IQR
*
* Extreme values:
* x > Q3 + EVF*IQR
* or
* x < Q1 - EVF*IQR
*
* Key:
* Q1 = 25% quartile
* Q3 = 75% quartile
* IQR = Interquartile Range, difference between Q1 and Q3
* OF = Outlier Factor
* EVF = Extreme Value Factor *

* * * Valid options are: *

* *

 * -D
 *  Turns on output of debugging information.
 * 
* *
 * -R <col1,col2-col4,...>
 *  Specifies list of columns to base outlier/extreme value detection
 *  on. If an instance is considered in at least one of those
 *  attributes an outlier/extreme value, it is tagged accordingly.
 *  'first' and 'last' are valid indexes.
 *  (default none)
 * 
* *
 * -O <num>
 *  The factor for outlier detection.
 *  (default: 3)
 * 
* *
 * -E <num>
 *  The factor for extreme values detection.
 *  (default: 2*Outlier Factor)
 * 
* *
 * -E-as-O
 *  Tags extreme values also as outliers.
 *  (default: off)
 * 
* *
 * -P
 *  Generates Outlier/ExtremeValue pair for each numeric attribute in
 *  the range, not just a single indicator pair for all the attributes.
 *  (default: off)
 * 
* *
 * -M
 *  Generates an additional attribute 'Offset' per Outlier/ExtremeValue
 *  pair that contains the multiplier that the value is off the median.
 *     value = median + 'multiplier' * IQR
 * Note: implicitly sets '-P'. (default: off)
 * 
* * * * Thanks to Dale for a few brainstorming sessions. * * @author Dale Fletcher (dale at cs dot waikato dot ac dot nz) * @author fracpete (fracpete at waikato dot ac dot nz) * @version $Revision: 12476 $ */ public class InterquartileRange extends SimpleBatchFilter { /** for serialization */ private static final long serialVersionUID = -227879653639723030L; /** indicator for non-numeric attributes */ public final static int NON_NUMERIC = -1; /** enum for obtaining the various determined IQR values. */ public enum ValueType { UPPER_EXTREME_VALUES, UPPER_OUTLIER_VALUES, LOWER_OUTLIER_VALUES, LOWER_EXTREME_VALUES, MEDIAN, IQR }; /** the attribute range to work on */ protected Range m_Attributes = new Range("first-last"); /** the generated indices (only for performance reasons) */ protected int[] m_AttributeIndices = null; /** the factor for detecting outliers */ protected double m_OutlierFactor = 3; /** the factor for detecting extreme values, by default 2*m_OutlierFactor */ protected double m_ExtremeValuesFactor = 2 * m_OutlierFactor; /** whether extreme values are also tagged as outliers */ protected boolean m_ExtremeValuesAsOutliers = false; /** the upper extreme value threshold (= Q3 + EVF*IQR) */ protected double[] m_UpperExtremeValue = null; /** the upper outlier threshold (= Q3 + OF*IQR) */ protected double[] m_UpperOutlier = null; /** the lower outlier threshold (= Q1 - OF*IQR) */ protected double[] m_LowerOutlier = null; /** the interquartile range */ protected double[] m_IQR = null; /** the median */ protected double[] m_Median = null; /** the lower extreme value threshold (= Q1 - EVF*IQR) */ protected double[] m_LowerExtremeValue = null; /** * whether to generate Outlier/ExtremeValue attributes for each attribute * instead of a general one */ protected boolean m_DetectionPerAttribute = false; /** the position of the outlier attribute */ protected int[] m_OutlierAttributePosition = null; /** * whether to add another attribute called "Offset", that lists 
the * 'multiplier' by which the outlier/extreme value is away from the median, * i.e., value = median + 'multiplier' * IQR
* automatically enables m_DetectionPerAttribute! */ protected boolean m_OutputOffsetMultiplier = false; /** * Returns a string describing this filter * * @return a description of the filter suitable for displaying in the * explorer/experimenter gui */ @Override public String globalInfo() { return "A filter for detecting outliers and extreme values based on " + "interquartile ranges. The filter skips the class attribute.\n\n" + "Outliers:\n" + " Q3 + OF*IQR < x <= Q3 + EVF*IQR\n" + " or\n" + " Q1 - EVF*IQR <= x < Q1 - OF*IQR\n" + "\n" + "Extreme values:\n" + " x > Q3 + EVF*IQR\n" + " or\n" + " x < Q1 - EVF*IQR\n" + "\n" + "Key:\n" + " Q1 = 25% quartile\n" + " Q3 = 75% quartile\n" + " IQR = Interquartile Range, difference between Q1 and Q3\n" + " OF = Outlier Factor\n" + " EVF = Extreme Value Factor"; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy