All downloads are free. Search and download functionality uses the official Maven repository.

weka.distributed.WekaClassifierMapTask Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    WekaClassifierMapTask.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed;

import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.UpdateableBatchProcessor;
import weka.classifiers.UpdateableClassifier;
import weka.classifiers.meta.AggregateableFilteredClassifier;
import weka.classifiers.meta.AggregateableFilteredClassifierUpdateable;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.meta.FilteredClassifierUpdateable;
import weka.core.Aggregateable;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.MakePreconstructedFilter;
import weka.filters.MultiFilter;
import weka.filters.PreconstructedFilter;
import weka.filters.StreamableFilter;
import weka.filters.unsupervised.instance.ReservoirSample;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.Vector;

/**
 * A map task for building classifiers. Can handle batch and incremental
 * classifiers, which are either Aggregateable or not. Non-aggregateable
 * classifiers are wrapped up in a Vote meta classifeir by the reduce task.
 * Incremental classifiers are trained as instances are presented to the
 * processInstance() method. Batch classifiers are trained when finalizeTask()
 * is called. Instances are collected and held in memory for batch classifiers,
 * although reservior sampling may be used to ensure that a fixed number of
 * instances is used for batch learning. There are options to force batch
 * learning for updateable classifiers and to force the generation of a Vote
 * ensemble for Aggregateable classifiers.
 * 

* * Classifiers may be trained on all the incoming data or on a particular * cross-validation fold (this functionality is used directly by the evaluation * map and reduce tasks). In the case of batch classifiers, the data for the map * will be stratified (if the class is nominal) and randomized before extracting * the fold to train on. In the case of incremental classifiers, a modulus * operation is used to pull out the instance corresponding to the selected fold * from the incoming instance stream. *

* * Classifiers can optionally have their training data passed through one or * more filters as a pre-processing step. The class will determine how to wrap * the base classifier and filters based on the nature of the filters specified * and whether the classifier is batch/incremental and Aggregateable. * Aggregateable classifiers (batch or incremental) can only be aggregated to * one final model if the filters used with them (if using filters) are all * StreamableFilters (i.e. they can determine their output structure immediately * without having to see any instances). *

* * It is also possible to specify a special "preconstructed" filter to use in * conjunction with, or instead of, regular filters. At present, there is just * one Preconstructed filter implemented by the distributed system. * PreConstructedPCA can produce a "trained" PCA filter using a correlation * matrix produced by the CorrelationMatrixMap/Reduce tasks. * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 12589 $ */ public class WekaClassifierMapTask implements OptionHandler, EnvironmentHandler, Serializable { /** * If this property is set then we can adjust the total number of requested * iterations for IteratedSingleClassifierEnhancers according to the number of * maps that are going to run. This is useful for schemes that build * independent base models (e.g. Bagging) in order to get approximately the * requested number of models in the end. For boosting methods it will be * necessary to set their number of iterations to a higher value than required * as it will be adjusted downwards for each map. */ public static final String TOTAL_NUMBER_OF_MAPS = "total.num.maps"; /** For serialization */ private static final long serialVersionUID = -5953696466790594368L; /** The classifier to use */ protected Classifier m_classifier = new weka.classifiers.trees.REPTree(); /** If true then incremental classifiers will be batch trained */ protected boolean m_forceBatchForUpdateable; /** * Option value that is determined by whether the classifier is updateable and * whether this iteration through the data is > 1 */ protected boolean m_continueTrainingUpdateable; /** * Total folds - only used if m_foldNumber != -1. Use this to train the * classifier on a particular fold of the incoming data set for this map */ protected int m_totalFolds = 1; // default = use all data /** * The fold number to train on. Use in conjunction with m_totalFolds. Default * is to train on all the data entering this map */ protected int m_foldNumber = -1; // 1-based. 
default - use all data /** Number of training instances processed by the classifier in this map */ protected int m_numTrainingInstances; /** Total number of instances seen by this map */ protected int m_numInstances; /** Training header */ protected Instances m_trainingHeader; /** Environment variables */ protected transient Environment m_env = Environment.getSystemWide(); /** Whether to use reservoir sampling for batch learning */ protected boolean m_useReservoirSampling; /** Reservoir sampling (if requested) for batch learning in this map */ protected ReservoirSample m_reservoir; /** Sample size if reservoir sampling is being used for batch learning */ protected int m_sampleSize = -1; /** * True if a Vote ensemble is to be produced in the case when the base * classifier is Aggregateable */ protected boolean m_forceVotedEnsemble; /** * Filters to use. How these are handled depends on whether the base * classifier is Aggregateable, incremental etc. These only have an effect if * not continueing the training of an updateable classifier - in this case it * is assumed that the updatebble classifier would have been configured with * these filters when first constructed. 
*/ protected List m_filtersToUse = new ArrayList(); /** Random seed for fold generation */ protected String m_seed = "1"; public static void main(String[] args) { try { WekaClassifierMapTask task = new WekaClassifierMapTask(); if (Utils.getFlag('h', args)) { String help = DistributedJob.makeOptionsStr(task); System.err.println(help); System.exit(1); } String trainingPath = Utils.getOption("t", args); Instances train = new Instances(new java.io.BufferedReader(new java.io.FileReader( trainingPath))); train.setClassIndex(train.numAttributes() - 1); task.setOptions(args); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } task.finalizeTask(); System.err.println("Batch trained classifier:\n" + task.getClassifier().toString()); // now configure for an incremental classifier and // train it for two passes over the data task = new WekaClassifierMapTask(); task.setClassifier(new weka.classifiers.bayes.NaiveBayesUpdateable()); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } // task.finalizeTask(); // not needed as training is done in // processInstance() System.err.println("Incremental training (iteration 1):\n" + task.getClassifier().toString()); task.setContinueTrainingUpdateableClassifier(true); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } System.err.println("Incremental training (iteration 2):\n" + task.getClassifier().toString()); } catch (Exception ex) { ex.printStackTrace(); } } @Override public Enumeration





© 2015 - 2025 Weber Informatics LLC | Privacy Policy