weka.filters.unsupervised.attribute.RandomSubset Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * RandomSubset.java
 * Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.*;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.filters.SimpleBatchFilter;


/**
 
 * Chooses a random subset of non-class attributes, either an absolute number or a percentage. Attributes are included
 * in the order in which they occur in the input data. The class attribute (if present) is always included in the output.
 * <p>
 * Valid options are:
 * </p>
 *
 * <pre>
 *  -N &lt;double&gt;
 *  The number of attributes to randomly select.
 *  If &lt; 1 then percentage, &gt;= 1 absolute number.
 *  (default: 0.5)
 * </pre>
 *
 * <pre>
 *  -V
 *  Invert selection - i.e. randomly remove rather than select.
 * </pre>
 *
 * <pre>
 *  -S &lt;int&gt;
 *  The seed value.
 *  (default: 1)
 * </pre>
 *
 * <pre>
 *  -output-debug-info
 *  If set, filter is run in debug mode and
 *  may output additional info to the console
 * </pre>
 *
 * <pre>
 *  -do-not-check-capabilities
 *  If set, filter capabilities are not checked before filter is built
 *  (use with caution).
 * </pre>
 
 * 
 * @author fracpete (fracpete at waikato dot ac dot nz)
 * @author eibe@cs.waikato.ac.nz
 * @version $Revision: 15073 $
 */
public class RandomSubset extends SimpleBatchFilter
        implements Randomizable, WeightedInstancesHandler, WeightedAttributesHandler {

  /** Version identifier used for serialization. */
  private static final long serialVersionUID = 2911221724251628050L;

  /**
   * The number of attributes to randomly choose. Values below 1 are treated as
   * a fraction of the non-class attributes; values of 1 or more are treated as
   * an absolute number of attributes (default: 0.5).
   */
  protected double m_NumAttributes = 0.5;

  /** The seed value (default: 1). */
  protected int m_Seed = 1;

  /**
   * The indices (relative to the input format) of the attributes that got
   * selected, including the class attribute if one is set. Populated by
   * determineOutputFormat(Instances); null until then.
   */
  protected int[] m_Indices = null;

  /**
   * Whether to randomly remove rather than select: when true, the complement
   * of the randomly drawn attribute sample is kept.
   */
  protected boolean m_invertSelection;

  /**
   * Provides the human-readable description of this filter.
   * 
   * @return the description text, suitable for displaying in the
   *         explorer/experimenter gui
   */
  @Override
  public String globalInfo() {
    final StringBuilder description = new StringBuilder();
    description.append("Chooses a random subset of non-class attributes, either an absolute number ");
    description.append("or a percentage. Attributes are included in the order in which they occur in the input data. ");
    description.append("The class attribute (if present) is always included in the output.");
    return description.toString();
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration
   *
   * @return true for this filter so that input data can affect subset of attributes that is selected
   */
  public boolean allowAccessToFullInputFormat() {
    // Always granted for this filter: the input data is allowed to affect
    // which subset of attributes gets selected.
    final boolean fullInputAccess = true;
    return fullInputAccess;
  }

  /**
   * Determines the output format based on the input format and returns this. In
   * case the output format cannot be returned immediately, i.e.,
   * hasImmediateOutputFormat() returns false, then this method will be called
   * from batchFinished() after the call of preprocess(Instances), in which,
   * e.g., statistics for the actual processing step can be gathered.
   * <p>
   * A random sample of the non-class attributes is drawn (or, when the
   * selection is inverted, the complement of that sample is used). The class
   * attribute, if set, is always kept at its relative position. As a side
   * effect, the chosen input indices are stored in m_Indices and the input
   * locators are initialised for them.
   * </p>
   * 
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   */
  @Override
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {

    // determine the number of candidate (non-class) attributes
    final int classIndex = inputFormat.classIndex();
    int numAttsWithoutClass = inputFormat.numAttributes();
    if (classIndex > -1) {
      numAttsWithoutClass--;
    }

    // values < 1 are a fraction of the candidates, values >= 1 an absolute count
    final int sizeOfSample;
    if (m_NumAttributes < 1) {
      sizeOfSample = (int) Math.round(numAttsWithoutClass * m_NumAttributes);
    } else {
      // Clamp to what is available: asking for as many (or more) attributes
      // than exist must select all of them, not none.
      sizeOfSample = (int) Math.min(m_NumAttributes, numAttsWithoutClass);
    }
    if (getDebug()) {
      System.out.println("# of atts: " + sizeOfSample);
    }

    // Get a random number generator that depends on the particular dataset passed in
    Random rand = inputFormat.getRandomNumberGenerator(getSeed());

    // The random indices (we will need to take care of the class attribute)
    int[] indices = RandomSample.drawSortedSample(sizeOfSample, numAttsWithoutClass, rand);

    // Do we need to take the inverse?
    if (m_invertSelection) {
      // Single pass over 0..numAttsWithoutClass-1, skipping every sampled index.
      // NOTE(review): relies on the sample being sorted ascending and free of
      // duplicates, as the name drawSortedSample suggests - confirm.
      int[] complement = new int[numAttsWithoutClass - indices.length];
      int sampled = 0;
      int kept = 0;
      for (int i = 0; i < numAttsWithoutClass; i++) {
        if ((sampled < indices.length) && (indices[sampled] == i)) {
          sampled++;
        } else {
          complement[kept++] = i;
        }
      }
      indices = complement;
    }

    // Make a new list of indices, taking care of the class
    List<Integer> selected = new ArrayList<>();
    int newClassIndex = -1;
    if (classIndex > -1) {
      // sampled positions ignore the class attribute, so everything at or
      // beyond the class position is shifted up by one
      for (int index : indices) {
        selected.add((index < classIndex) ? index : index + 1);
      }
      // the class index is never in the list, so binarySearch returns
      // -(insertion point) - 1; recover the insertion point from that
      newClassIndex = -Collections.binarySearch(selected, classIndex) - 1;
      selected.add(newClassIndex, classIndex);
    } else {
      for (int index : indices) {
        selected.add(index);
      }
    }

    if (getDebug()) {
      System.out.println("Selected indices: " + selected);
    }

    // generate output format
    ArrayList<Attribute> atts = new ArrayList<>();
    m_Indices = new int[selected.size()];
    for (int i = 0; i < selected.size(); i++) {
      final int inputIndex = selected.get(i);
      atts.add((Attribute) inputFormat.attribute(inputIndex).copy());
      m_Indices[i] = inputIndex;
    }
    Instances result = new Instances(inputFormat.relationName(), atts, 0).stringFreeStructure();
    if (classIndex > -1) {
      result.setClassIndex(newClassIndex);
    }

    initInputLocators(inputFormatPeek(), m_Indices);

    return result;
  }

  /**
   * Projects every instance of the given dataset onto the selected attributes
   * (m_Indices) and returns the results as a new dataset that uses the output
   * format. Instance weights are carried over. The input instances themselves
   * are not modified.
   * 
   * @param instances the data to process
   * @return the data reduced to the selected attributes
   * @throws Exception in case the processing goes wrong
   */
  @Override
  protected Instances process(Instances instances) throws Exception {

    Instances result = new Instances(outputFormatPeek(), 0);
    final int numSelected = m_Indices.length;
    for (Instance instance : instances) {
      Instance newInstance;
      if (instance instanceof SparseInstance) {
        final int numStored = instance.numValues();
        // no more than min(numStored, numSelected) entries can survive
        final int capacity = Math.min(numStored, numSelected);
        int[] indices = new int[capacity];
        double[] values = new double[capacity];
        int vals = 0;
        // Merge-style walk over the stored sparse indices and m_Indices.
        // m_Indices is built in ascending order by determineOutputFormat;
        // NOTE(review): assumes instance.index(p) is ascending in p - confirm.
        int p1 = 0;
        int p2 = 0;
        while ((p1 < numStored) && (p2 < numSelected)) {
          final int ind1 = instance.index(p1);
          final int ind2 = m_Indices[p2];
          if (ind1 == ind2) {
            // the attribute's index in the output is its position in m_Indices
            indices[vals] = p2;
            values[vals] = instance.valueSparse(p1);
            vals++;
            p1++;
            p2++;
          } else if (ind1 > ind2) {
            p2++;
          } else {
            p1++;
          }
        }
        // Hand over exactly the filled entries instead of relying on the
        // SparseInstance constructor to discard (index 0, value 0.0) padding.
        if (vals < capacity) {
          indices = Arrays.copyOf(indices, vals);
          values = Arrays.copyOf(values, vals);
        }
        newInstance = new SparseInstance(instance.weight(), values, indices, numSelected);
      } else {
        double[] values = new double[numSelected];
        for (int i = 0; i < numSelected; i++) {
          values[i] = instance.value(m_Indices[i]);
        }
        newInstance = new DenseInstance(instance.weight(), values);
      }
      copyValues(newInstance, false, instance.dataset(), result);
      result.add(newInstance);
    }
    return result;
  }

  /**
   * Provides the revision identifier of this class.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    final String revisionKeyword = "$Revision: 15073 $";
    return RevisionUtils.extract(revisionKeyword);
  }

  /**
   * Command-line entry point: runs a fresh RandomSubset filter with the given
   * parameters. Use -h to list options.
   * 
   * @param args the commandline options
   */
  public static void main(String[] args) {
    final RandomSubset filter = new RandomSubset();
    runFilter(filter, args);
  }
}