All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.filters.unsupervised.attribute.RandomSubset Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * RandomSubset.java
 * Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.*;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.filters.SimpleBatchFilter;


/**
 * Chooses a random subset of non-class attributes, either an absolute number or a percentage. Attributes are included
 * in the order in which they occur in the input data. The class attribute (if present) is always included in the output.
 * <p>
 * Valid options are:
 * </p>
 *
 * <pre>
 * -N &lt;double&gt;
 *  The number of attributes to randomly select.
 *  If &lt; 1 then percentage, &gt;= 1 absolute number.
 *  (default: 0.5)
 * </pre>
 *
 * <pre>
 * -V
 *  Invert selection - i.e. randomly remove rather than select.
 * </pre>
 *
 * <pre>
 * -S &lt;int&gt;
 *  The seed value.
 *  (default: 1)
 * </pre>
 *
 * <pre>
 * -output-debug-info
 *  If set, filter is run in debug mode and
 *  may output additional info to the console
 * </pre>
 *
 * <pre>
 * -do-not-check-capabilities
 *  If set, filter capabilities are not checked before filter is built
 *  (use with caution).
 * </pre>
 *
 * @author fracpete (fracpete at waikato dot ac dot nz)
 * @author [email protected]
 * @version $Revision: 15073 $
 */
public class RandomSubset extends SimpleBatchFilter
  implements Randomizable, WeightedInstancesHandler, WeightedAttributesHandler {

  /** for serialization. */
  private static final long serialVersionUID = 2911221724251628050L;

  /**
   * The number of attributes to randomly choose (&gt;= 1 absolute number of
   * attributes, &lt; 1 percentage).
   */
  protected double m_NumAttributes = 0.5;

  /** The seed value. */
  protected int m_Seed = 1;

  /**
   * The indices of the attributes that got selected. Stays {@code null}
   * until the output format has been determined.
   */
  protected int[] m_Indices = null;

  /** Whether to randomly remove rather than select */
  protected boolean m_invertSelection;

  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for displaying in the
   *         explorer/experimenter gui
   */
  @Override
  public String globalInfo() {
    return "Chooses a random subset of non-class attributes, either an absolute number "
      + "or a percentage. Attributes are included in the order in which they occur in the input data. The class "
      + "attribute (if present) is always included in the output.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration

// NOTE(review): whatever originally sat between the truncated listOptions() declaration and this point is
// missing from this copy of the file (for example getSeed(), which is called below, is not visible here).
  /**
   * Reports whether this filter wants access to the full input format.
   *
   * @return true for this filter so that input data can affect subset of attributes that is selected
   */
  public boolean allowAccessToFullInputFormat() {
    return true;
  }

  /**
   * Determines the output format based on the input format and returns this. In
   * case the output format cannot be returned immediately, i.e.,
   * hasImmediateOutputFormat() returns false, then this method will called from
   * batchFinished() after the call of preprocess(Instances), in which, e.g.,
   * statistics for the actual processing step can be gathered.
   * <p>
   * Besides building the header, this method stores the chosen input attribute
   * indices (ascending, class attribute included) in {@code m_Indices} and
   * passes them to {@code initInputLocators}.
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   */
  @Override
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {

    // determine the number of attributes
    final int classIndex = inputFormat.classIndex();
    int numAttsWithoutClass = inputFormat.numAttributes();
    if (classIndex > -1) {
      numAttsWithoutClass--;
    }

    int sizeOfSample = sampleSize(m_NumAttributes, numAttsWithoutClass);
    if (getDebug()) {
      System.out.println("# of atts: " + sizeOfSample);
    }

    // Get a random number generator that depends on the particular dataset passed in
    Random rand = inputFormat.getRandomNumberGenerator(getSeed());

    // The random indices, expressed in a numbering that skips the class
    // attribute (we take care of the class attribute further down)
    int[] indices = RandomSample.drawSortedSample(sizeOfSample, numAttsWithoutClass, rand);

    // Do we need to take the inverse?
    if (m_invertSelection) {
      indices = complementOf(indices, numAttsWithoutClass);
    }

    // Make a new list of indices, taking care of the class
    List<Integer> selected = new ArrayList<>();
    int newClassIndex = -1;
    if (classIndex > -1) {
      // map the class-free numbering back to real attribute positions:
      // everything at or beyond the class position moves up by one
      for (int index : indices) {
        selected.add(index < classIndex ? index : index + 1);
      }
      // the class index is never in the list at this point, so binarySearch
      // returns -(insertionPoint) - 1; undo that encoding to get the slot
      // that keeps the list sorted
      newClassIndex = -Collections.binarySearch(selected, classIndex) - 1;
      selected.add(newClassIndex, classIndex);
    } else {
      for (int index : indices) {
        selected.add(index);
      }
    }

    if (getDebug()) {
      System.out.println("Selected indices: " + selected);
    }

    // generate output format
    ArrayList<Attribute> atts = new ArrayList<>();
    m_Indices = new int[selected.size()];
    for (int i = 0; i < selected.size(); i++) {
      atts.add((Attribute) inputFormat.attribute(selected.get(i)).copy());
      m_Indices[i] = selected.get(i);
    }

    Instances result = new Instances(inputFormat.relationName(), atts, 0).stringFreeStructure();
    if (classIndex > -1) {
      result.setClassIndex(newClassIndex);
    }

    initInputLocators(inputFormatPeek(), m_Indices);

    return result;
  }

  /**
   * Translates the user-supplied attribute count into an absolute sample size.
   * Values below 1 are treated as a fraction of the available attributes;
   * values of 1 or more are absolute counts, capped at what is available
   * (previously a request for "all or more" silently produced 0).
   *
   * @param requested the configured number/fraction of attributes
   * @param available the number of non-class attributes in the input
   * @return the number of attributes to draw
   */
  private static int sampleSize(double requested, int available) {
    if (requested < 1) {
      return (int) Math.round(available * requested);
    }
    return (int) Math.min(requested, available);
  }

  /**
   * Returns, in ascending order, every index in {@code [0, universeSize)} that
   * does not occur in the given array.
   *
   * @param sortedIndices the indices to exclude; assumed to be ascending,
   *          distinct and within range (as the name of
   *          RandomSample.drawSortedSample suggests - TODO confirm)
   * @param universeSize the number of candidate indices
   * @return the complement of {@code sortedIndices}
   */
  private static int[] complementOf(int[] sortedIndices, int universeSize) {
    int[] rest = new int[universeSize - sortedIndices.length];
    int next = 0; // position of the next excluded index still ahead of us
    int filled = 0;
    for (int i = 0; i < universeSize; i++) {
      if (next < sortedIndices.length && sortedIndices[next] == i) {
        next++;
      } else {
        rest[filled++] = i;
      }
    }
    return rest;
  }

  /**
   * Projects every instance of the given batch onto the attributes recorded in
   * {@code m_Indices} and returns the projected instances in a new dataset
   * that uses the output format. Instance weights are carried over.
   *
   * @param instances the instances to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  @Override
  protected Instances process(Instances instances) throws Exception {

    Instances result = new Instances(outputFormatPeek(), 0);

    for (Instance instance : instances) {
      Instance newInstance;
      if (instance instanceof SparseInstance) {
        int n1 = instance.numValues();
        int n2 = m_Indices.length;
        int[] indices = new int[n1];
        double[] values = new double[n1];
        int vals = 0;
        // Merge-style walk over two index lists: the attribute indices stored
        // in the sparse instance (presumably ascending - confirm against
        // SparseInstance) and the ascending m_Indices. On a match the value
        // is kept under its position in the output (p2).
        for (int p1 = 0, p2 = 0; p1 < n1 && p2 < n2;) {
          int ind1 = instance.index(p1);
          int ind2 = m_Indices[p2];
          if (ind1 == ind2) {
            indices[vals] = p2;
            values[vals] = instance.valueSparse(p1);
            vals++;
            p1++;
            p2++;
          } else if (ind1 > ind2) {
            p2++;
          } else {
            p1++;
          }
        }
        // Only the first `vals` slots were filled; the remaining ones are 0.
        // NOTE(review): presumably this SparseInstance constructor discards
        // zero-valued entries - confirm.
        newInstance = new SparseInstance(instance.weight(), values, indices, m_Indices.length);
      } else {
        double[] values = new double[m_Indices.length];
        for (int i = 0; i < m_Indices.length; i++) {
          values[i] = instance.value(m_Indices[i]);
        }
        newInstance = new DenseInstance(instance.weight(), values);
      }
      // NOTE(review): copyValues is not visible here; presumably it fixes up
      // string/relational values between the two datasets - confirm.
      copyValues(newInstance, false, instance.dataset(), result);
      result.add(newInstance);
    }

    return result;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 15073 $");
  }

  /**
   * Runs the filter with the given parameters. Use -h to list options.
   *
   * @param args the commandline options
   */
  public static void main(String[] args) {
    runFilter(new RandomSubset(), args);
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy