All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.filters.supervised.attribute.MergeNominalValues Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MergeNominalValues.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.supervised.attribute;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.SimpleBatchFilter;
import weka.filters.SupervisedFilter;

/**
 *  Merges values of all nominal attributes among the
 * specified attributes, excluding the class attribute, using the CHAID method,
 * but without considering re-splitting of merged subsets. It implements Steps 1 and
 * 2 described by Kass (1980), see
*
 * Gordon V. Kass (1980). An Exploratory Technique for Investigating Large
 * Quantities of Categorical Data. Applied Statistics. 29(2):119-127.
*
 * Once attribute values have been merged, a chi-squared test using the
 * Bonferroni correction is applied to check if the resulting attribute is a
 * valid predictor, based on the Bonferroni multiplier in Equation 3.2 in Kass
 * (1980). If an attribute does not pass this test, all remaining values (if
 * any) are merged. Nevertheless, useless predictors can slip through without
 * being fully merged, e.g. identifier attributes.
*
 * The code applies the Yates correction when the chi-squared statistic is
 * computed.
*
 * Note that the algorithm is quadratic in the number of attribute values for an
 * attribute.

* * * Valid options are: *

* *

 * -D
 *  Turns on output of debugging information.
 * 
* *
 * -L <double>
 *  The significance level (default: 0.05).
 * 
* *
 * -R <range>
 *  Sets list of attributes to act on (or its inverse). 'first' and 'last' are accepted as well.
 *  E.g.: first-5,7,9,20-last
 *  (default: first-last)
 * 
* *
 * -V
 *  Invert matching sense (i.e. act on all attributes not specified in list)
 * 
* *
 * -O
 *  Use short identifiers for merged subsets.
 * 
* * * * @author Eibe Frank * @version $Revision: 14508 $ */ public class MergeNominalValues extends SimpleBatchFilter implements SupervisedFilter, WeightedInstancesHandler, WeightedAttributesHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 7447337831221353842L; /** Set the significance level */ protected double m_SigLevel = 0.05; /** Stores which atributes to operate on (or nto) */ protected Range m_SelectCols = new Range("first-last"); /** Stores the indexes of the selected attributes in order. */ protected int[] m_SelectedAttributes; /** Indicators for which attributes need to be changed. */ protected boolean[] m_AttToBeModified; /** The indicators used to map the old values. */ protected int[][] m_Indicators; /** Use short values */ protected boolean m_UseShortIdentifiers = false; /** * Returns a string describing this filter. * * @return a description of the filter suitable for displaying in the * explorer/experimenter gui */ @Override public String globalInfo() { return "Merges values of all nominal attributes among the specified attributes, excluding " + "the class attribute, using the CHAID method, but without considering re-splitting of " + "merged subsets. It implements Steps 1 and 2 described by Kass (1980), see\n\n" + getTechnicalInformation().toString() + "\n\n" + "Once attribute values have been merged, a chi-squared test using the Bonferroni " + "correction is applied to check if the resulting attribute is a valid predictor, " + "based on the Bonferroni multiplier in Equation 3.2 in Kass (1980). If an attribute does " + "not pass this test, all remaining values (if any) are merged. Nevertheless, useless " + "predictors can slip through without being fully merged, e.g. 
identifier attributes.\n\n" + "The code applies the Yates correction when the chi-squared statistic is computed.\n\n" + "Note that the algorithm is quadratic in the number of attribute values for an attribute."; } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.ARTICLE); result.setValue(Field.AUTHOR, "Gordon V. Kass"); result .setValue( Field.TITLE, "An Exploratory Technique for Investigating Large Quantities of Categorical Data"); result.setValue(Field.JOURNAL, "Applied Statistics"); result.setValue(Field.YEAR, "1980"); result.setValue(Field.VOLUME, "29"); result.setValue(Field.NUMBER, "2"); result.setValue(Field.PAGES, "119-127"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy