All downloads are free. Search and download functionality uses the official Maven repository.

weka.distributed.WekaClassifierMapTask Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    WekaClassifierMapTask.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed;

import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.UpdateableBatchProcessor;
import weka.classifiers.UpdateableClassifier;
import weka.classifiers.meta.AggregateableFilteredClassifier;
import weka.classifiers.meta.AggregateableFilteredClassifierUpdateable;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.meta.FilteredClassifierUpdateable;
import weka.core.Aggregateable;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.MakePreconstructedFilter;
import weka.filters.MultiFilter;
import weka.filters.PreconstructedFilter;
import weka.filters.StreamableFilter;
import weka.filters.unsupervised.instance.ReservoirSample;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.Vector;

/**
 * A map task for building classifiers. Can handle batch and incremental
 * classifiers, which are either Aggregateable or not. Non-aggregateable
 * classifiers are wrapped up in a Vote meta classifeir by the reduce task.
 * Incremental classifiers are trained as instances are presented to the
 * processInstance() method. Batch classifiers are trained when finalizeTask()
 * is called. Instances are collected and held in memory for batch classifiers,
 * although reservior sampling may be used to ensure that a fixed number of
 * instances is used for batch learning. There are options to force batch
 * learning for updateable classifiers and to force the generation of a Vote
 * ensemble for Aggregateable classifiers.
 * 

* * Classifiers may be trained on all the incoming data or on a particular * cross-validation fold (this functionality is used directly by the evaluation * map and reduce tasks). In the case of batch classifiers, the data for the map * will be stratified (if the class is nominal) and randomized before extracting * the fold to train on. In the case of incremental classifiers, a modulus * operation is used to pull out the instance corresponding to the selected fold * from the incoming instance stream. *

* * Classifiers can optionally have their training data passed through one or * more filters as a pre-processing step. The class will determine how to wrap * the base classifier and filters based on the nature of the filters specified * and whether the classifier is batch/incremental and Aggregateable. * Aggregateable classifiers (batch or incremental) can only be aggregated to * one final model if the filters used with them (if using filters) are all * StreamableFilters (i.e. they can determine their output structure immediately * without having to see any instances). *

* * It is also possible to specify a special "preconstructed" filter to use in * conjunction with, or instead of, regular filters. At present, there is just * one Preconstructed filter implemented by the distributed system. * PreConstructedPCA can produce a "trained" PCA filter using a correlation * matrix produced by the CorrelationMatrixMap/Reduce tasks. * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 12589 $ */ public class WekaClassifierMapTask implements OptionHandler, EnvironmentHandler, Serializable { /** * If this property is set then we can adjust the total number of requested * iterations for IteratedSingleClassifierEnhancers according to the number of * maps that are going to run. This is useful for schemes that build * independent base models (e.g. Bagging) in order to get approximately the * requested number of models in the end. For boosting methods it will be * necessary to set their number of iterations to a higher value than required * as it will be adjusted downwards for each map. */ public static final String TOTAL_NUMBER_OF_MAPS = "total.num.maps"; /** For serialization */ private static final long serialVersionUID = -5953696466790594368L; /** The classifier to use */ protected Classifier m_classifier = new weka.classifiers.trees.REPTree(); /** If true then incremental classifiers will be batch trained */ protected boolean m_forceBatchForUpdateable; /** * Option value that is determined by whether the classifier is updateable and * whether this iteration through the data is > 1 */ protected boolean m_continueTrainingUpdateable; /** * Total folds - only used if m_foldNumber != -1. Use this to train the * classifier on a particular fold of the incoming data set for this map */ protected int m_totalFolds = 1; // default = use all data /** * The fold number to train on. Use in conjunction with m_totalFolds. Default * is to train on all the data entering this map */ protected int m_foldNumber = -1; // 1-based. 
default - use all data /** Number of training instances processed by the classifier in this map */ protected int m_numTrainingInstances; /** Total number of instances seen by this map */ protected int m_numInstances; /** Training header */ protected Instances m_trainingHeader; /** Environment variables */ protected transient Environment m_env = Environment.getSystemWide(); /** Whether to use reservoir sampling for batch learning */ protected boolean m_useReservoirSampling; /** Reservoir sampling (if requested) for batch learning in this map */ protected ReservoirSample m_reservoir; /** Sample size if reservoir sampling is being used for batch learning */ protected int m_sampleSize = -1; /** * True if a Vote ensemble is to be produced in the case when the base * classifier is Aggregateable */ protected boolean m_forceVotedEnsemble; /** * Filters to use. How these are handled depends on whether the base * classifier is Aggregateable, incremental etc. These only have an effect if * not continueing the training of an updateable classifier - in this case it * is assumed that the updatebble classifier would have been configured with * these filters when first constructed. 
*/ protected List m_filtersToUse = new ArrayList(); /** Random seed for fold generation */ protected String m_seed = "1"; public static void main(String[] args) { try { WekaClassifierMapTask task = new WekaClassifierMapTask(); if (Utils.getFlag('h', args)) { String help = DistributedJob.makeOptionsStr(task); System.err.println(help); System.exit(1); } String trainingPath = Utils.getOption("t", args); Instances train = new Instances(new java.io.BufferedReader(new java.io.FileReader( trainingPath))); train.setClassIndex(train.numAttributes() - 1); task.setOptions(args); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } task.finalizeTask(); System.err.println("Batch trained classifier:\n" + task.getClassifier().toString()); // now configure for an incremental classifier and // train it for two passes over the data task = new WekaClassifierMapTask(); task.setClassifier(new weka.classifiers.bayes.NaiveBayesUpdateable()); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } // task.finalizeTask(); // not needed as training is done in // processInstance() System.err.println("Incremental training (iteration 1):\n" + task.getClassifier().toString()); task.setContinueTrainingUpdateableClassifier(true); task.setup(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { task.processInstance(train.instance(i)); } System.err.println("Incremental training (iteration 2):\n" + task.getClassifier().toString()); } catch (Exception ex) { ex.printStackTrace(); } } @Override public Enumeration





© 2015 - 2025 Weber Informatics LLC | Privacy Policy