
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    FastRandomTree.java
 *    Copyright (C) 2001 University of Waikato, Hamilton, NZ (original code,
 *      RandomTree.java)
 *    Copyright (C) 2013 Fran Supek (adapted code)
 */

package hr.irb.fastRandomForest;

import java.util.Arrays;
import weka.classifiers.AbstractClassifier;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import java.util.Random;
import weka.core.RevisionUtils;


/**
 * Based on the "weka.classifiers.trees.RandomTree" class, revision 1.19,
 * by Eibe Frank and Richard Kirkby, with major modifications made to improve
 * the speed of classifier training.
 * 
 * Please refer to the Javadoc of the buildTree, splitData and distribution
 * functions, as well as to changelog.txt, for details of the changes made
 * in FastRandomTree.
 * 
 * This class should be used only from within the FastRandomForest classifier.
 * 
 * @author Eibe Frank ([email protected]) - original code
 * @author Richard Kirkby ([email protected]) - original code
 * @author Fran Supek (fran.supek[AT]irb.hr) - adapted code
 * @version $Revision: 0.99$
 */
class FastRandomTree
        extends AbstractClassifier
        implements OptionHandler, WeightedInstancesHandler, Runnable {

  /** for serialization */
  static final long serialVersionUID = 8934314652175299375L;
  
  /** The subtrees appended to this tree (node). */
  protected FastRandomTree[] m_Successors;
  
  /**
   * For access to parameters of the RF (k, or maxDepth).
   */
  protected FastRandomForest m_MotherForest;

  /** The attribute to split on. */
  protected int m_Attribute = -1;

  /** The split point. */
  protected double m_SplitPoint = Double.NaN;
  
  /** The proportions of training instances going down each branch. */
  protected double[] m_Prop = null;

  /** Class probabilities from the training vals. */
  protected double[] m_ClassProbs = null;

  /** The dataset used for training. */
  protected transient DataCache data = null;
  
  /**
   * Since 0.99: holds references to temporary arrays re-used by all nodes
   * in the tree, used while calculating the "props" for various attributes in
   * distributionSequentialAtt(). This is meant to avoid frequent 
   * creating/destroying of these arrays.
   */
  protected transient double[] tempProps;  
  
  /**
   * Since 0.99: holds references to temporary arrays re-used by all nodes
   * in the tree, used while calculating the "dists" for various attributes
   * in distributionSequentialAtt(). This is meant to avoid frequent 
   * creating/destroying of these arrays.
   */
  protected transient double[][] tempDists;  
  protected transient double[][] tempDistsOther;  
  
  

  /** Minimum number of instances for leaf. */
  protected static final int m_MinNum = 1;

  /**
   * Get the value of MinNum.
   *
   * @return Value of MinNum.
   */
  public final int getMinNum() {

    return m_MinNum;
  }


  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String KValueTipText() {
    return "Sets the number of randomly chosen attributes.";
  }


  /**
   * Get the value of K.
   *
   * @return Value of K.
   */
  public final int getKValue() {
    return m_MotherForest.m_KValue;
  }


  /**
   * Get the maximum depth of the tree, 0 for unlimited.
   *
   * @return 		the maximum depth.
   */
  public final int getMaxDepth() {
    return m_MotherForest.m_MaxDepth;
  }


  /**
   * Returns default capabilities of the classifier.
   *
   * @return      the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }


  /**
   * This function is not supported by FastRandomTree, as it requires a
   * DataCache for training.
   *
   * @throws Exception every time this function is called
   */
  @Override
  public void buildClassifier(Instances data) throws Exception {
    throw new Exception("FastRandomTree can be used only by FastRandomForest " +
            "and FastRfBagger classes, not directly.");
  }



  /**
   * Builds classifier. Makes the initial call to the recursive buildTree 
   * function. The name "run()" is used to support multithreading via an
   * ExecutorService.
   *
   * The "data" field of the FastRandomTree should contain a
   * reference to a DataCache prior to calling this function, and that
   * DataCache should have the "reusableRandomGenerator" field initialized.
   * The FastRfBagging class normally takes care of this before invoking this
   * function.
   */
  public void run() {

    // compute initial class counts
    double[] classProbs = new double[data.numClasses];
    for (int i = 0; i < data.numInstances; i++) {
      classProbs[data.instClassValues[i]] += data.instWeights[i];
    }

    // create the attribute indices window - skip class
    int[] attIndicesWindow = new int[data.numAttributes - 1];
    int j = 0;
    for (int i = 0; i < attIndicesWindow.length; i++) {
      if (j == data.classIndex)
        j++; // do not include the class
      attIndicesWindow[i] = j++;
    }

    // prepare the DataCache by:
    // ... creating an array for the whatGoesWhere field of the data
    // ... creating the sortedIndices
    data.whatGoesWhere = new int[data.inBag.length];
    data.createInBagSortedIndices();

    buildTree(data.sortedIndices, 0, data.sortedIndices[0].length - 1,
            classProbs, m_Debug, attIndicesWindow, 0);

    this.data = null;
  }
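
  // Example (illustrative): with data.numAttributes == 5 and
  // data.classIndex == 2, the window loop in run() yields
  // attIndicesWindow == {0, 1, 3, 4} - every attribute except the class,
  // ready to be sampled from in buildTree().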

  /**
   * Computes class distribution of an instance using the FastRandomTree.
   *
   * In Weka's RandomTree, the distributions were normalized so that all
   * probabilities sum to 1; this would abolish the effect of instance weights
   * on voting. In FastRandomForest 0.97 onwards, the distributions are
   * normalized by dividing with the number of instances going into a leaf.
   *
   * @param instance the instance to compute the distribution for
   * @return the computed class distribution
   * @throws Exception if computation fails
   */
  @Override
  public double[] distributionForInstance(Instance instance) throws Exception {

    double[] returnedDist = null;

    if (m_Attribute > -1) {  // ============================ node is not a leaf

      if (instance.isMissing(m_Attribute)) {  // ---------------- missing value

        returnedDist = new double[m_MotherForest.m_Info.numClasses()];
        // split instance up
        for (int i = 0; i < m_Successors.length; i++) {
          double[] help = m_Successors[i].distributionForInstance(instance);
          if (help != null) {
            for (int j = 0; j < help.length; j++) {
              returnedDist[j] += m_Prop[i] * help[j];
            }
          }
        }

      } else if (m_MotherForest.m_Info
              .attribute(m_Attribute).isNominal()) { // ------------- nominal

        //returnedDist = m_Successors[(int) instance.value(m_Attribute)]
        //        .distributionForInstance(instance);

        // 0.99: new - binary splits (also) for nominal attributes
        if ( instance.value(m_Attribute) == m_SplitPoint ) {
          returnedDist = m_Successors[0].distributionForInstance(instance);
        } else {
          returnedDist = m_Successors[1].distributionForInstance(instance);
        }

      } else { // ---------------------------------------- numeric attributes

        if (instance.value(m_Attribute) < m_SplitPoint) {
          returnedDist = m_Successors[0].distributionForInstance(instance);
        } else {
          returnedDist = m_Successors[1].distributionForInstance(instance);
        }
      }

      return returnedDist;

    } else { // =============================================== node is a leaf

      return m_ClassProbs;
    }
  }
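
  // Example (illustrative): a leaf built from 4 instances whose per-class
  // weight sums are {3.0, 1.0} stores m_ClassProbs == {0.75, 0.25}; if the
  // user doubles the weight of every class-0 instance, it stores
  // {1.5, 0.25} instead. The vector no longer sums to 1, so more heavily
  // weighted instances genuinely carry more voting power in the forest.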

  /**
   * Computes class distribution of an instance using the FastRandomTree.
   *
   * Works correctly only if the DataCache has the same attributes as the one
   * used to train the FastRandomTree - but this function does not check for
   * that!
   *
   * Main use of this is to compute out-of-bag error (also when finding
   * feature importances).
   *
   * @param data the DataCache holding the instance
   * @param instIdx the index (in the DataCache) of the instance to compute
   *        the distribution for
   * @return the computed class distribution
   */
  public double[] distributionForInstanceInDataCache(DataCache data, int instIdx) {

    double[] returnedDist = null;

    if (m_Attribute > -1) {  // ============================ node is not a leaf

      if ( data.isValueMissing(m_Attribute, instIdx) ) {  // ----- missing value

        returnedDist = new double[m_MotherForest.m_Info.numClasses()];
        // split instance up
        for (int i = 0; i < m_Successors.length; i++) {
          double[] help =
                  m_Successors[i].distributionForInstanceInDataCache(data, instIdx);
          if (help != null) {
            for (int j = 0; j < help.length; j++) {
              returnedDist[j] += m_Prop[i] * help[j];
            }
          }
        }

      } else if ( data.isAttrNominal(m_Attribute) ) { // ------------- nominal

        //returnedDist = m_Successors[(int) instance.value(m_Attribute)]
        //        .distributionForInstance(instance);

        // 0.99: new - binary splits (also) for nominal attributes
        if ( data.vals[m_Attribute][instIdx] == m_SplitPoint ) {
          returnedDist = m_Successors[0].distributionForInstanceInDataCache(data, instIdx);
        } else {
          returnedDist = m_Successors[1].distributionForInstanceInDataCache(data, instIdx);
        }

      } else { // ---------------------------------------- numeric attributes

        if ( data.vals[m_Attribute][instIdx] < m_SplitPoint ) {
          returnedDist = m_Successors[0].distributionForInstanceInDataCache(data, instIdx);
        } else {
          returnedDist = m_Successors[1].distributionForInstanceInDataCache(data, instIdx);
        }
      }

      return returnedDist;

    } else { // =============================================== node is a leaf

      return m_ClassProbs;
    }
  }
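
  // Hypothetical usage sketch (illustrative only - FastRfBagging's actual
  // out-of-bag loop may differ):
  //
  //   for (int i = 0; i < data.numInstances; i++) {
  //     if (!data.inBag[i]) { // instance i was not used to grow this tree
  //       double[] dist = tree.distributionForInstanceInDataCache(data, i);
  //       // ... accumulate the vote to estimate out-of-bag error
  //     }
  //   }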

  /**
   * Recursively generates a tree. A derivative of the buildTree function from
   * the "weka.classifiers.trees.RandomTree" class, with the following changes
   * made:
   * <ul>
   *
   * <li>m_ClassProbs are now remembered only in leaves, not in every node of
   * the tree</li>
   *
   * <li>m_Distribution has been removed</li>
   *
   * <li>members of the dists, splits, props and vals arrays which are not
   * used are dereferenced prior to recursion to reduce memory requirements</li>
   *
   * <li>a check for "branch with no training instances" is now (FastRF 0.98)
   * made before recursion; with the current implementation of splitData(),
   * empty branches can appear only with nominal attributes with more than
   * two categories</li>
   *
   * <li>each new 'tree' (i.e. node or leaf) is passed a reference to its
   * 'mother forest', necessary to look up parameters such as maxDepth and K</li>
   *
   * <li>pre-split entropy is not recalculated unnecessarily</li>
   *
   * <li>uses DataCache instead of weka.core.Instances; the reference to the
   * DataCache is stored as a field of the FastRandomTree class and is not
   * passed recursively down new buildTree() calls</li>
   *
   * <li>similarly, a reference to the random number generator is stored
   * in a field of the DataCache</li>
   *
   * <li>m_ClassProbs are now normalized by dividing with the number of
   * instances in the leaf, instead of forcing the sum of class probabilities
   * to 1.0; this has a large effect when class/instance weights are set by
   * the user</li>
   *
   * <li>a little imprecision is allowed in checking whether there was a
   * decrease in entropy after splitting</li>
   *
   * <li>0.99: the temporary arrays splits, props and vals are no longer as
   * wide as the full number of attributes in the dataset (of which only "k"
   * columns of randomly chosen attributes used to get filled). Now there is
   * just a single array, which gets replaced as the k features are evaluated
   * sequentially, and only if the next feature is better than a previous
   * one</li>
   *
   * <li>0.99: the sortedIndices are now not cut up into smaller arrays on
   * every split, but rather re-sorted within the same array by
   * splitDataNew(), and passed down to buildTree() as the original large
   * matrix, with start and end points explicitly specified</li>
   *
   * </ul>
   *
   * @param sortedIndices the indices of the instances of the whole bootstrap replicate
   * @param startAt First index of the instance to consider in this split; inclusive.
   * @param endAt Last index of the instance to consider; inclusive.
   * @param classProbs the class distribution
   * @param debug whether debugging is on
   * @param attIndicesWindow the attribute window to choose attributes from
   * @param depth the current depth
   */
  protected void buildTree(int[][] sortedIndices, int startAt, int endAt,
          double[] classProbs, boolean debug, int[] attIndicesWindow,
          int depth) {

    m_Debug = debug;
    int sortedIndicesLength = endAt - startAt + 1;

    // Check if node doesn't contain enough instances, is pure,
    // or maximum depth is reached - if so, make a leaf.
    if ( ( sortedIndicesLength < Math.max(2, getMinNum()) )  // small
            || Utils.eq( classProbs[Utils.maxIndex(classProbs)], Utils.sum(classProbs) )  // pure
            || ( (getMaxDepth() > 0) && (depth >= getMaxDepth()) )  // deep
            ) {

      m_Attribute = -1;  // indicates leaf (no useful attribute to split on)

      // normalize by dividing with the number of instances (as of ver. 0.97)
      // unless leaf is empty - this can happen with splits on nominal
      // attributes with more than two categories
      if ( sortedIndicesLength != 0 )
        for (int c = 0; c < classProbs.length; c++) {
          classProbs[c] /= sortedIndicesLength;
        }
      m_ClassProbs = classProbs;
      this.data = null;
      return;
    } // (leaf making)

    // new 0.99: all the following are for the best attribute only! they're
    // updated while stepping sequentially through the attributes
    double val = Double.NaN; // value of splitting criterion
    double[][] dist = new double[2][data.numClasses];  // class distributions (contingency table), indexed first by branch, then by class
    double[] prop = new double[2]; // the branch sizes (as fractions)
    double split = Double.NaN; // split point

    // Investigate K random attributes
    int attIndex = 0;
    int windowSize = attIndicesWindow.length;
    int k = getKValue();
    boolean sensibleSplitFound = false;
    double prior = Double.NaN;
    double bestNegPosterior = -Double.MAX_VALUE;
    int bestAttIdx = -1;

    while ((windowSize > 0) && (k-- > 0 || !sensibleSplitFound ) ) {

      int chosenIndex = data.reusableRandomGenerator.nextInt(windowSize);
      attIndex = attIndicesWindow[chosenIndex];

      // shift chosen attIndex out of window
      attIndicesWindow[chosenIndex] = attIndicesWindow[windowSize - 1];
      attIndicesWindow[windowSize - 1] = attIndex;
      windowSize--;

      // new: 0.99
      double candidateSplit = distributionSequentialAtt( prop, dist,
              bestNegPosterior, attIndex,
              sortedIndices[attIndex], startAt, endAt );

      if ( Double.isNaN(candidateSplit) ) {
        continue;  // we did not improve over a previous attribute! "dist" is unchanged from before
      }
      // by this point we know we have an improvement, so we keep the new split point
      split = candidateSplit;
      bestAttIdx = attIndex;

      if ( Double.isNaN(prior) ) {
        // needs to be computed only once per branch - it is the same for all
        // attributes (even regardless of missing values)
        prior = SplitCriteria.entropyOverColumns(dist);
      }

      double negPosterior = -SplitCriteria.entropyConditionedOnRows(dist);  // this is an updated dist
      if ( negPosterior > bestNegPosterior ) {
        bestNegPosterior = negPosterior;
      } else {
        throw new IllegalArgumentException("Very strange!");
      }

      val = prior - (-negPosterior); // we want the greatest reduction in entropy
      if ( val > 1e-2 ) { // we allow some leeway here to compensate
        sensibleSplitFound = true; // for imprecision in entropy computation
      }

    }  // feature by feature in window

    if ( sensibleSplitFound ) {

      m_Attribute = bestAttIdx;  // the best attribute found
      m_SplitPoint = split;
      m_Prop = prop;
      prop = null; // can be GC'd

      //int[][][] subsetIndices =
      //        new int[dist.length][data.numAttributes][];
      //splitData( subsetIndices, m_Attribute,
      //        m_SplitPoint, sortedIndices );
      //int numInstancesBeforeSplit = sortedIndices[0].length;

      int belowTheSplitStartsAt =
              splitDataNew( m_Attribute, m_SplitPoint, sortedIndices, startAt, endAt );

      m_Successors = new FastRandomTree[dist.length];  // dist.length now always == 2
      for (int i = 0; i < dist.length; i++) {
        m_Successors[i] = new FastRandomTree();
        m_Successors[i].m_MotherForest = this.m_MotherForest;
        m_Successors[i].data = this.data;
        // new in 0.99 - used in distributionSequentialAtt()
        m_Successors[i].tempDists = this.tempDists;
        m_Successors[i].tempDistsOther = this.tempDistsOther;
        m_Successors[i].tempProps = this.tempProps;

        // check if we're about to make an empty branch - this can happen with
        // nominal attributes with more than two categories (as of ver. 0.98)
        if ( belowTheSplitStartsAt - startAt == 0 ) {
          // in this case, modify the chosenAttDists[i] so that it contains
          // the current, before-split class probabilities, properly normalized
          // by the number of instances (as we won't be able to normalize
          // after the split)
          for ( int j = 0; j < dist[i].length; j++ )
            dist[i][j] = classProbs[j] / sortedIndicesLength;
        }

        if ( i == 0 ) {   // before split
          m_Successors[i].buildTree(sortedIndices,
                  startAt, belowTheSplitStartsAt - 1, dist[i],
                  m_Debug, attIndicesWindow, depth + 1);
        } else {  // after split
          m_Successors[i].buildTree(sortedIndices,
                  belowTheSplitStartsAt, endAt, dist[i],
                  m_Debug, attIndicesWindow, depth + 1);
        }

        dist[i] = null;
      }
      sortedIndices = null;

    } else { // ------ make leaf --------

      m_Attribute = -1;

      // normalize by dividing with the number of instances (as of ver. 0.97)
      // unless leaf is empty - this can happen with splits on nominal attributes
      if ( sortedIndicesLength != 0 )
        for (int c = 0; c < classProbs.length; c++) {
          classProbs[c] /= sortedIndicesLength;
        }

      m_ClassProbs = classProbs;
    }

    this.data = null; // dereference all pointers so data can be GC'd after tree is built
  }


  /**
   * Computes size of the tree.
   *
   * @return the number of nodes
   */
  public int numNodes() {

    if (m_Attribute == -1) {
      return 1;
    } else {
      int size = 1;
      for (int i = 0; i < m_Successors.length; i++) {
        size += m_Successors[i].numNodes();
      }
      return size;
    }
  }
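
  // Worked example for the split-acceptance test in buildTree() above
  // (illustrative; assumes SplitCriteria uses base-2 logarithms, as Weka's
  // entropy utilities do): with class counts {5, 5} at a node, prior = 1 bit.
  // A candidate split with branch distributions {5, 1} and {0, 4} has a
  // conditional entropy of about 0.39 bits, so val ~= 0.61 > 1e-2 and the
  // split counts as "sensible"; a split that barely reduces entropy is
  // rejected even if it is the best of the K candidates.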

  /**
   * Splits instances into subsets. Not used anymore in 0.99. This is a
   * derivative of the splitData function from "weka.classifiers.trees.RandomTree",
   * with the following changes:
   *
   * - When handling instances with missing values in the attribute chosen for
   * the split, the FastRandomTree assigns the instance to one of the branches
   * at random, with bigger branches having a higher probability of getting
   * the instance.
   *
   * - When splitting sortedIndices into two or more subsetIndices,
   * FastRandomTree checks whether an instance's split attribute value was
   * above the splitpoint only once per instance, and stores the result in the
   * DataCache's whatGoesWhere field, which is then read when splitting the
   * subsetIndices.
   *
   * As a consequence of the above points, the exact branch sizes (even with
   * instances having unknowns in the split attribute) are known in advance, so
   * the subsetIndices arrays don't have to be 'resized' (i.e. a new shorter
   * copy of each one created and the old one GC'd).
   *
   * @param subsetIndices the sorted indices of the subset
   * @param att the attribute index
   * @param splitPoint the splitpoint for numeric attributes
   * @param sortedIndices the sorted indices of the whole set
   */
  protected void splitData( int[][][] subsetIndices, int att,
          double splitPoint, int[][] sortedIndices ) {

    Random random = data.reusableRandomGenerator;
    int j;
    // 0.99: we have binary splits also for nominal data
    int[] num = new int[2]; // how many instances go to each branch

    if ( data.isAttrNominal(att) ) { // ============================ if nominal

      for (j = 0; j < sortedIndices[att].length; j++) {

        int inst = sortedIndices[att][j];

        if ( data.isValueMissing(att, inst) ) { // ---------- has missing value

          // decide where to put this instance randomly, with bigger branches
          // getting a higher chance
          double rn = random.nextDouble();
          int myBranch = -1;
          for (int k = 0; k < m_Prop.length; k++) {
            rn -= m_Prop[k];
            if ( (rn <= 0) || k == (m_Prop.length - 1) ) {
              myBranch = k;
              break;
            }
          }

          data.whatGoesWhere[ inst ] = myBranch;
          num[myBranch]++;

        } else { // ----------------------------- does not have missing value

          // if it matches the category to "split out", put above the split;
          // all other categories go below the split
          int subset = ( data.vals[att][inst] == splitPoint ) ? 0 : 1;
          data.whatGoesWhere[ inst ] = subset;
          num[subset]++;

        } // --------------------------------------- end if has missing value

      }

    } else { // =================================================== if numeric

      num = new int[2];

      for (j = 0; j < sortedIndices[att].length; j++) {

        int inst = sortedIndices[att][j];
        //Instance inst = data.instance(sortedIndices[att][j]);

        if ( data.isValueMissing(att, inst) ) { // ---------- has missing value

          // decide if instance goes into subset 0 or 1 randomly,
          // with bigger subsets having a greater probability of getting
          // the instance assigned to them;
          // instances with missing values get processed LAST (sort order)
          // so branch sizes are known by now (and stored in m_Prop)
          double rn = random.nextDouble();
          int branch = ( rn > m_Prop[0] ) ? 1 : 0;
          data.whatGoesWhere[ inst ] = branch;
          num[ branch ]++;

        } else { // ----------------------------- does not have missing value

          int branch = ( data.vals[att][inst] < splitPoint ) ? 0 : 1;

          data.whatGoesWhere[ inst ] = branch;
          num[ branch ]++;

        } // --------------------------------------- end if has missing value

      } // end for instance by instance

    }  // ============================================ end if nominal / numeric

    // create the new subset (branch) arrays of correct size
    for (int a = 0; a < data.numAttributes; a++) {
      if ( a == data.classIndex )
        continue; // no need to sort this one
      for (int branch = 0; branch < num.length; branch++) {
        subsetIndices[branch][a] = new int[num[branch]];
      }
    }

    for (int a = 0; a < data.numAttributes; a++) { // xxxxxxxxxx attr by attr

      if (a == data.classIndex)
        continue;
      for (int branch = 0; branch < num.length; branch++) {
        num[branch] = 0;
      }

      // fill them by looking at the whatGoesWhere array
      for (j = 0; j < sortedIndices[ a ].length; j++) {
        int inst = sortedIndices[ a ][j];
        int branch = data.whatGoesWhere[ inst ];  // can be 0 or 1
        subsetIndices[ branch ][ a ][ num[branch] ] = sortedIndices[a][j];
        num[branch]++;
      }

    }  // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx end for attr by attr
  }
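
  // Example (illustrative) for the missing-value handling in splitData()
  // above: if m_Prop == {0.7, 0.3}, an instance with a missing value in the
  // split attribute draws rn uniformly from [0, 1); it is routed to branch 0
  // whenever rn <= 0.7 and to branch 1 otherwise, so larger branches absorb
  // proportionally more of the missing-value instances.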

  /**
   * Splits instances into subsets; new for FastRF 0.99. Does not create new
   * arrays with split indices, but rather reorganizes the indices within the
   * supplied sortedIndices to conform with the split. Works only within the
   * given boundaries.
   *
   * Note: as of 0.99, all splits (incl. categorical) are always binary.
   *
   * @param att the attribute index
   * @param splitPoint the splitpoint for numeric attributes
   * @param sortedIndices the sorted indices of the whole set - gets overwritten!
   * @param startAt Inclusive, 0-based index. Does not touch anything before this value.
   * @param endAt Inclusive, 0-based index. Does not touch anything after this value.
   *
   * @return the first index of the "below the split" instances
   */
  protected int splitDataNew( int att, double splitPoint,
          int[][] sortedIndices, int startAt, int endAt ) {

    Random random = data.reusableRandomGenerator;
    int j;
    // 0.99: we have binary splits also for nominal data
    int[] num = new int[2]; // how many instances go to each branch

    // we might possibly want to recycle this array for the whole tree
    int[] tempArr = new int[ endAt - startAt + 1 ];

    if ( data.isAttrNominal(att) ) { // ============================ if nominal

      for (j = startAt; j <= endAt; j++) {

        int inst = sortedIndices[att][j];

        if ( data.isValueMissing(att, inst) ) { // ---------- has missing value

          // decide where to put this instance randomly, with bigger branches
          // getting a higher chance
          double rn = random.nextDouble();
          int myBranch = -1;
          for (int k = 0; k < m_Prop.length; k++) {
            rn -= m_Prop[k];
            if ( (rn <= 0) || k == (m_Prop.length - 1) ) {
              myBranch = k;
              break;
            }
          }

          data.whatGoesWhere[ inst ] = myBranch;
          num[myBranch]++;

        } else { // ----------------------------- does not have missing value

          // if it matches the category to "split out", put above the split;
          // all other categories go below the split
          int subset = ( data.vals[att][inst] == splitPoint ) ? 0 : 1;
          data.whatGoesWhere[ inst ] = subset;
          num[subset]++;

        } // --------------------------------------- end if has missing value

      }

    } else { // =================================================== if numeric

      num = new int[2];

      for (j = startAt; j <= endAt; j++) {

        int inst = sortedIndices[att][j];
        //Instance inst = data.instance(sortedIndices[att][j]);

        if ( data.isValueMissing(att, inst) ) { // ---------- has missing value

          // decide if instance goes into subset 0 or 1 randomly,
          // with bigger subsets having a greater probability of getting
          // the instance assigned to them;
          // instances with missing values get processed LAST (sort order)
          // so branch sizes are known by now (and stored in m_Prop)
          double rn = random.nextDouble();
          int branch = ( rn > m_Prop[0] ) ? 1 : 0;
          data.whatGoesWhere[ inst ] = branch;
          num[ branch ]++;

        } else { // ----------------------------- does not have missing value

          int branch = ( data.vals[att][inst] < splitPoint ) ? 0 : 1;

          data.whatGoesWhere[ inst ] = branch;
          num[ branch ]++;

        } // --------------------------------------- end if has missing value

      } // end for instance by instance

    }  // ============================================ end if nominal / numeric

    for (int a = 0; a < data.numAttributes; a++) { // xxxxxxxxxx attr by attr

      if (a == data.classIndex)
        continue;

      // the first free index in the "above" branch, and the first in the "below"
      int startAbove = 0, startBelow = num[0]; // always only 2 sub-branches, remember where the second starts

      Arrays.fill(tempArr, 0);

      // fill the array by looking at the whatGoesWhere array
      for (j = startAt; j <= endAt; j++) {
        int inst = sortedIndices[ a ][j];
        int branch = data.whatGoesWhere[ inst ];  // can be only 0 or 1
        if ( branch == 0 ) {
          tempArr[ startAbove ] = sortedIndices[a][j];
          startAbove++;
        } else {
          tempArr[ startBelow ] = sortedIndices[a][j];
          startBelow++;
        }
      }

      // now copy the tempArr back into sortedIndices, overwriting the old order
      System.arraycopy( tempArr, 0, sortedIndices[a], startAt, endAt - startAt + 1 );

    } // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx end for attr by attr

    return startAt + num[0]; // the first index of the "below the split" instances
  }
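
  // Example (illustrative): suppose, for some attribute a, the stretch
  // sortedIndices[a][startAt..endAt] == {4, 7, 2, 9}, and whatGoesWhere sends
  // instances 7 and 9 to branch 0 and instances 4 and 2 to branch 1. After
  // splitDataNew() the same stretch reads {7, 9, 4, 2}: branch 0 occupies the
  // first num[0] slots, branch 1 the rest, the relative sort order is
  // preserved within each branch, and the method returns startAt + 2.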

  /**
   * Computes class distribution for an attribute. Not used anymore in 0.99.
   * Based on the splitData function from "weka.classifiers.trees.RandomTree",
   * with the following changes:
   * <ul>
   *
   * <li>entropy pre-split is not computed at this point, as the only thing
   * relevant for the (comparative) goodness of a split is the entropy after
   * splitting</li>
   *
   * <li>dist[][] is now computed only after the split point has been found,
   * and not updated continually by copying from currDist</li>
   *
   * <li>also, in Weka's RandomTree it was possible to create a split 'in the
   * middle' of instance 0, which would result in empty nodes after the
   * split; this is now fixed</li>
   *
   * <li>instance 0 is now generally skipped when looking for split points,
   * as the split point 'before instance 0' is not sensible; in versions
   * prior to 0.96 this change introduced a bug where attributes with
   * all missing values had their dists computed wrongly, which might
   * result in useless (but harmless) branches being added to the tree</li>
   *
   * </ul>
   *
   * @param props gets filled with the relative sizes of branches (total = 1),
   *        indexed first per attribute
   * @param dists these are the contingency matrices, indexed first per attribute
   * @param att the attribute index (which one to change)
   * @param sortedIndices the sorted indices of the vals
   */
  protected double distribution( double[][] props, double[][][] dists,
          int att, int[] sortedIndices ) {

    double splitPoint = -Double.MAX_VALUE;
    double[][] dist = null;  // a contingency table of the split point vs class
    int i;

    if ( data.isAttrNominal(att) ) { // ===================== nominal attributes

      dist = new double[data.attNumVals[att]][data.numClasses];
      for (i = 0; i < sortedIndices.length; i++) {
        int inst = sortedIndices[i];
        if ( data.isValueMissing(att, inst) )
          break;
        dist[ (int)data.vals[att][inst] ][ data.instClassValues[inst] ] += data.instWeights[inst];
      }

      splitPoint = 0; // signals we've found a sensible split point; by
                      // definition, a split on a nominal attribute is sensible

    } else { // ============================================ numeric attributes

      double[][] currDist = new double[2][data.numClasses];
      dist = new double[2][data.numClasses];

      // begin with moving all instances into the second subset
      for (int j = 0; j < sortedIndices.length; j++) {
        int inst = sortedIndices[j];
        if ( data.isValueMissing(att, inst) )
          break;
        currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
      }

      copyDists(currDist, dist);
      //for (int j = 0; j < currDist.length; j++)
      //  System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = 1; i < sortedIndices.length; i++) {  // --- try all split points

        int inst = sortedIndices[i];
        if ( data.isValueMissing(att, inst) )
          break;

        int prevInst = sortedIndices[i - 1];

        currDist[0][ data.instClassValues[ prevInst ] ]
                += data.instWeights[ prevInst ];
        currDist[1][ data.instClassValues[ prevInst ] ]
                -= data.instWeights[ prevInst ];

        // do not allow splitting between two instances with the same value
        if ( data.vals[att][inst] > data.vals[att][prevInst] ) {

          // we want the lowest impurity after the split; at this point, we
          // don't really care what we had before splitting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }

        }

      }  // ------- end split points

      /*
       * Determine the best split point:
       * bestI == 0 only if all instances had missing values, or there were
       * less than 2 instances; splitPoint will remain set as -Double.MAX_VALUE.
       * This is not really a useful split, as all of the instances are 'below'
       * the split line, but at least it's formally correct. And the dists[]
       * also has a default value set previously.
       */
      if ( bestI > 0 ) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndices[bestI - 1];
        int instJustAfterSplit = sortedIndices[bestI];
        splitPoint = ( data.vals[ att ][ instJustAfterSplit ]
                + data.vals[ att ][ instJustBeforeSplit ] ) / 2.0;

        // Now make the correct dist[] from the default dist[] (all instances
        // in the second branch) by iterating through instances until we reach
        // bestI, and then stopping.
        for ( int ii = 0; ii < bestI; ii++ ) {
          int inst = sortedIndices[ii];
          dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ];
          dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ];
        }

      }

    }  // ================================================= nominal or numeric?

    // compute total weights for each branch (= props)
    props[att] = countsToFreqs(dist);

    // distribute counts of instances with missing values

    // ver 0.96 - check for special case when *all* instances have missing vals
    if ( data.isValueMissing(att, sortedIndices[0]) )
      i = 0;

    while (i < sortedIndices.length) {
      int inst = sortedIndices[i];
      for (int branch = 0; branch < dist.length; branch++) {
        dist[ branch ][ data.instClassValues[inst] ]
                += props[ att ][ branch ] * data.instWeights[ inst ];
      }
      i++;
    }

    // return distribution after split and best split point
    dists[att] = dist;
    return splitPoint;
  }


  /**
   * Computes class distribution for an attribute. New in FastRF 0.99, main
   * changes:
   * <ul>
   *
   * <li>now reuses the temporary counting arrays (this.tempDists,
   * this.tempDistsOther) instead of creating/destroying arrays</li>
   *
   * <li>does not create a new "dists" for each attribute it examines; instead
   * it replaces the existing "dists" (supplied as a parameter), but only if
   * the split is better than the previous best split</li>
   *
   * <li>always creates binary splits, even for categorical variables; thus
   * it might give slightly different classification results than the old
   * RandomForest</li>
   *
   * </ul>
   *
   * @param propsBestAtt gets filled with the relative sizes of branches
   *        (total = 1) for the best examined attribute so far; updated ONLY
   *        if the current attribute is better than the previous best
   * @param distsBestAtt these are the contingency matrices for the best
   *        examined attribute so far; updated ONLY if the current attribute
   *        is better than the previous best
   * @param scoreBestAtt Checked against the score of the attToExamine to
   *        determine if the propsBestAtt and distsBestAtt need to be updated.
   * @param attToExamine the attribute index (which one to examine, and change
   *        the above matrices if the attribute is better than the previous one)
   * @param sortedIndicesOfAtt the sorted indices of the vals for the attToExamine.
   * @param startAt Index in sortedIndicesOfAtt; do not touch anything below this index.
   * @param endAt Index in sortedIndicesOfAtt; do not touch anything after this index.
   */
  protected double distributionSequentialAtt( double[] propsBestAtt, double[][] distsBestAtt,
          double scoreBestAtt, int attToExamine, int[] sortedIndicesOfAtt,
          int startAt, int endAt ) {

    double splitPoint = -Double.MAX_VALUE;

    // a contingency table of the split point vs class
    double[][] dist = this.tempDists;
    Arrays.fill( dist[0], 0.0 );
    Arrays.fill( dist[1], 0.0 );
    double[][] currDist = this.tempDistsOther;
    Arrays.fill( currDist[0], 0.0 );
    Arrays.fill( currDist[1], 0.0 );
    //double[][] dist = new double[2][data.numClasses];
    //double[][] currDist = new double[2][data.numClasses];

    int i;
    int sortedIndicesOfAttLength = endAt - startAt + 1;

    // find how many missing values we have for this attribute (they're always at the end)
    int lastNonmissingValIdx = endAt;
    for (int j = endAt; j >= startAt; j--) {
      if ( data.isValueMissing(attToExamine, sortedIndicesOfAtt[j]) ) {
        lastNonmissingValIdx = j - 1;
      } else {
        break;
      }
    }
    if ( lastNonmissingValIdx < startAt ) { // only missing values in this feature??
      return Double.NaN; // we cannot split on it
    }

    if ( data.isAttrNominal(attToExamine) ) { // ============ nominal attributes

      // 0.99: new routine - makes a one-vs-all split on categorical attributes

      int numLvls = data.attNumVals[attToExamine];
      int bestLvl = 0; // the index of the category which is best to "split out"

      // note: if we have only two levels, it doesn't matter which one we
      // "split out"; we can thus safely check only the first one
      if ( numLvls <= 2 ) {

        bestLvl = 0; // this means that the category with index 0 always goes
        // 'above' the split and the category with index 1 goes 'below' the split

        for (i = startAt; i <= lastNonmissingValIdx; i++) {
          int inst = sortedIndicesOfAtt[i];
          dist[ (int)data.vals[attToExamine][inst] ][ data.instClassValues[inst] ] += data.instWeights[inst];
        }

      } else { // for >2 levels, we have to search different splits

        // begin with moving all instances into the second subset ("below split")
        for (int j = startAt; j <= lastNonmissingValIdx; j++) {
          int inst = sortedIndicesOfAtt[j];
          currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
        }
        // create a default dist[] which we'll modify after we find the best class to split out
        copyDists(currDist, dist);

        double currVal = -Double.MAX_VALUE; // current value of splitting criterion
        double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
        int lastSeen = startAt;  // used to avoid looping through all instances for every lvl

        for ( int lvl = 0; lvl < numLvls; lvl++ ) {

          // reset the currDist to the default (everything "below split") -
          // conveniently stored in dist[][]
          copyDists(dist, currDist);

          for (i = lastSeen; i <= lastNonmissingValIdx; i++) {

            lastSeen = i;
            int inst = sortedIndicesOfAtt[i];

            if ( (int)data.vals[attToExamine][inst] < lvl ) {
              continue;
            } else if ( (int)data.vals[attToExamine][inst] == lvl ) {
              // move to "above split" from "below split"
              currDist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ];
              currDist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ];
            } else {
              break; // no need to loop forward, no more instances of this category
            }

          }

          // we filled the "currDist" for the current level, find the score and see if we like it
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
          if ( currVal > bestVal ) {
            bestVal = currVal;
            bestLvl = lvl;
          }

        }  // examine how well "splitting out" individual levels works for us

        // remember the contingency table from the best "lvl" and store it in "dist"
        for (i = startAt; i <= lastNonmissingValIdx; i++) {
          int inst = sortedIndicesOfAtt[i];
          if ( (int)data.vals[attToExamine][inst] < bestLvl ) {
            continue; // lower categories sort before bestLvl - skip them
          } else if ( (int)data.vals[attToExamine][inst] == bestLvl ) {
            // move to "above split" from "below split"
            dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ];
            dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ];
          } else {
            break; // no need to loop forward, no more instances of this category
          }
        }

      }

      splitPoint = bestLvl; // signals we've found a sensible split point; by
                            // definition, a split on a nominal attribute
                            // will always be sensible

    } else { // ============================================ numeric attributes

      // re-use the 2 x nClass temporary arrays created when the tree was initialized
      //Arrays.fill( dist[0], 0.0 );
      //Arrays.fill( dist[1], 0.0 );

      // begin with moving all instances into the second subset ("below split")
      for (int j = startAt; j <= lastNonmissingValIdx; j++) {
        int inst = sortedIndicesOfAtt[j];
        currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
      }
      copyDists(currDist, dist);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = startAt + 1; i <= lastNonmissingValIdx; i++) {  // --- try all split points

        int inst = sortedIndicesOfAtt[i];
        int prevInst = sortedIndicesOfAtt[i - 1];

        currDist[0][ data.instClassValues[ prevInst ] ]
                += data.instWeights[ prevInst ];
        currDist[1][ data.instClassValues[ prevInst ] ]
                -= data.instWeights[ prevInst ];

        // do not allow splitting between two instances with the same value
        if ( data.vals[attToExamine][inst] > data.vals[attToExamine][prevInst] ) {

          // we want the lowest impurity after the split; at this point, we
          // don't really care what we had before splitting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }

        }

      }  // ------- end trying split points

      /*
       * Determine the best split point:
       * bestI == 0 only if all instances had missing values, or there were
       * less than 2 instances; splitPoint will remain set as -Double.MAX_VALUE.
       * This is not really a useful split, as all of the instances are 'below'
       * the split line, but at least it's formally correct. And the dists[]
       * also has a default value set previously.
       */
      if ( bestI > startAt ) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndicesOfAtt[bestI - 1];
        int instJustAfterSplit = sortedIndicesOfAtt[bestI];
        splitPoint = ( data.vals[ attToExamine ][ instJustAfterSplit ]
                + data.vals[ attToExamine ][ instJustBeforeSplit ] ) / 2.0;

        // now make the correct dist[] (for the best split point) from the
        // default dist[] (all instances in the second branch) by iterating
        // through instances until we reach bestI, and then stopping
        for ( int ii = startAt; ii < bestI; ii++ ) {
          int inst = sortedIndicesOfAtt[ii];
          dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ];
          dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ];
        }

      }

    }  // ================================================= nominal or numeric?

    // compute total weights for each branch (= props)
    // again, we reuse the tempProps of the tree so as not to create/destroy new arrays
    double[] props = this.tempProps;
    countsToFreqs(dist, props);  // props gets overwritten; previous contents don't matter

    // distribute *counts* of instances with missing values using the "props"
    i = lastNonmissingValIdx + 1;  // start 1 after the last non-missing val (if there is anything)
    while ( i <= endAt ) {
      int inst = sortedIndicesOfAtt[i];
      dist[ 0 ][ data.instClassValues[inst] ] += props[ 0 ] * data.instWeights[ inst ];
      dist[ 1 ][ data.instClassValues[inst] ] += props[ 1 ] * data.instWeights[ inst ];
      i++;
    }

    // update the distribution after the split and the best split point, but
    // ONLY if better than the previous one -- we need to recalculate the
    // entropy (because it changes after redistributing the instances with
    // missing values in the current attribute). Also, for categorical
    // variables it was not calculated before.
    double curScore = -SplitCriteria.entropyConditionedOnRows(dist);
    if ( curScore > scoreBestAtt && splitPoint > -Double.MAX_VALUE ) {

      // overwrite the "distsBestAtt" and "propsBestAtt" with the current values
      copyDists(dist, distsBestAtt);
      System.arraycopy( props, 0, propsBestAtt, 0, props.length );

      return splitPoint;

    } else {
      // return a NaN instead of the splitpoint if the attribute was not
      // better than a previous one
      return Double.NaN;
    }
  }


  /**
   * Normalizes branch sizes so they contain frequencies (stored in "props")
   * instead of counts (stored in "dist"). Creates a new double[] which it
   * returns.
   */
  protected static double[] countsToFreqs( double[][] dist ) {

    double[] props = new double[dist.length];

    for (int k = 0; k < props.length; k++) {
      props[k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props), 0)) {
      for (int k = 0; k < props.length; k++) {
        props[k] = 1.0 / (double) props.length;
      }
    } else {
      FastRfUtils.normalize(props);
    }
    return props;
  }
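
  // Worked example (illustrative): for dist == {{2.0, 1.0}, {0.0, 3.0}} the
  // branch weights are {3.0, 3.0}, which normalize to props == {0.5, 0.5};
  // if both branches were empty, countsToFreqs() would fall back to the
  // uniform {0.5, 0.5} rather than divide by zero.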

  /**
   * Normalizes branch sizes so they contain frequencies (stored in "props")
   * instead of counts (stored in "dist").
   *
   * Overwrites the supplied "props"!
   *
   * props.length must be == dist.length.
   */
  protected static void countsToFreqs( double[][] dist, double[] props ) {

    for (int k = 0; k < props.length; k++) {
      props[k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props), 0)) {
      for (int k = 0; k < props.length; k++) {
        props[k] = 1.0 / (double) props.length;
      }
    } else {
      FastRfUtils.normalize(props);
    }
  }


  /**
   * Makes a copy of a "dists" array, which is a 2 x numClasses array.
   *
   * @param distFrom
   * @param distTo Gets overwritten.
   */
  protected static void copyDists( double[][] distFrom, double[][] distTo ) {

    for ( int i = 0; i < distFrom[0].length; i++ ) {
      distTo[0][i] = distFrom[0][i];
    }
    for ( int i = 0; i < distFrom[1].length; i++ ) {
      distTo[1][i] = distFrom[1][i];
    }
  }


  /**
   * Main method for this class.
   *
   * @param argv the commandline parameters
   */
  public static void main(String[] argv) {
    runClassifier(new FastRandomTree(), argv);
  }


  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 0.99$");
  }

}