// hr.irb.fastRandomForest.FastRandomTree Maven / Gradle / Ivy
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* FastRandomTree.java
* Copyright (C) 2001 University of Waikato, Hamilton, NZ (original code,
* RandomTree.java)
* Copyright (C) 2013 Fran Supek (adapted code)
*/
package hr.irb.fastRandomForest;
import java.util.Arrays;
import weka.classifiers.AbstractClassifier;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import java.util.Random;
import weka.core.RevisionUtils;
/**
* Based on the "weka.classifiers.trees.RandomTree" class, revision 1.19,
* by Eibe Frank and Richard Kirkby, with major modifications made to improve
* the speed of classifier training.
*
* Please refer to the Javadoc of buildTree, splitData and distribution
* function, as well as the changelog.txt, for the details of changes to
* FastRandomTree.
*
* This class should be used only from within the FastRandomForest classifier.
*
* @author Eibe Frank ([email protected]) - original code
* @author Richard Kirkby ([email protected]) - original code
* @author Fran Supek (fran.supek[AT]irb.hr) - adapted code
* @version $Revision: 0.99$
*/
class FastRandomTree
extends AbstractClassifier
implements OptionHandler, WeightedInstancesHandler, Runnable {
/** Serialization version identifier. */
static final long serialVersionUID = 8934314652175299375L;
/**
* The subtrees appended to this tree (node). buildTree() allocates one
* successor per branch, which as of 0.99 is always two.
*/
protected FastRandomTree[] m_Successors;
/**
* The forest this tree belongs to; gives access to parameters of the RF
* (k, or maxDepth) and to the dataset header (m_Info).
*/
protected FastRandomForest m_MotherForest;
/** The attribute to split on; -1 marks a leaf. */
protected int m_Attribute = -1;
/**
* The split point. For a numeric attribute, values strictly below it follow
* the first successor; for a nominal attribute, the value equal to it follows
* the first successor and every other value follows the second.
*/
protected double m_SplitPoint = Double.NaN;
/** The proportions of training instances going down each branch. */
protected double[] m_Prop = null;
/**
* Class probabilities from the training vals; assigned by buildTree() only
* when the node becomes a leaf.
*/
protected double[] m_ClassProbs = null;
/**
* The dataset used for training. Transient, and reset to null by run() and
* buildTree() once they are done with it.
*/
protected transient DataCache data = null;
/**
* Since 0.99: holds references to temporary arrays re-used by all nodes
* in the tree, used while calculating the "props" for various attributes in
* distributionSequentialAtt(). This is meant to avoid frequent
* creating/destroying of these arrays.
*/
protected transient double[] tempProps;
/**
* Since 0.99: holds references to temporary arrays re-used by all nodes
* in the tree, used while calculating the "dists" for various attributes
* in distributionSequentialAtt(). This is meant to avoid frequent
* creating/destroying of these arrays.
*/
protected transient double[][] tempDists;
/**
* Second shared scratch table, handed down to successors together with
* tempDists; distributionSequentialAtt() uses it as its "currDist" table.
*/
protected transient double[][] tempDistsOther;
/** Minimum number of instances for leaf. */
protected static final int m_MinNum = 1;
/**
 * Reports the minimum number of instances a leaf must hold. The value is a
 * compile-time constant of this class and cannot be configured.
 *
 * @return the minimum leaf size
 */
public final int getMinNum() {
  return FastRandomTree.m_MinNum;
}
/**
 * Supplies the tool-tip text describing the K property.
 *
 * @return a short description suitable for display in the
 *         explorer/experimenter gui
 */
public String KValueTipText() {
  final String tip = "Sets the number of randomly chosen attributes.";
  return tip;
}
/**
 * Looks up K, the number of randomly chosen attributes, on the forest that
 * owns this tree.
 *
 * @return the forest's K value
 */
public final int getKValue() {
  final FastRandomForest forest = this.m_MotherForest;
  return forest.m_KValue;
}
/**
 * Looks up the maximum tree depth on the forest that owns this tree;
 * 0 means the depth is unlimited.
 *
 * @return the forest's maximum depth
 */
public final int getMaxDepth() {
  final FastRandomForest forest = this.m_MotherForest;
  return forest.m_MaxDepth;
}
/**
 * Describes what kind of data this classifier accepts: nominal, numeric and
 * date attributes (with missing values), and a nominal class (with missing
 * class values).
 *
 * @return the capabilities of this classifier
 */
@Override
public Capabilities getCapabilities() {
  final Capabilities supported = super.getCapabilities();
  final Capability[] toEnable = {
    // attributes
    Capability.NOMINAL_ATTRIBUTES,
    Capability.NUMERIC_ATTRIBUTES,
    Capability.DATE_ATTRIBUTES,
    Capability.MISSING_VALUES,
    // class
    Capability.NOMINAL_CLASS,
    Capability.MISSING_CLASS_VALUES
  };
  for (Capability capability : toEnable) {
    supported.enable(capability);
  }
  return supported;
}
/**
 * Always fails: a FastRandomTree is trained from a DataCache through
 * {@link #run()}, never directly from an Instances object.
 *
 * @param data ignored
 * @throws Exception every time this function is called
 */
@Override
public void buildClassifier(Instances data) throws Exception {
  final String message =
      "FastRandomTree can be used only by FastRandomForest and FastRfBagger classes, not directly.";
  throw new Exception(message);
}
/**
 * Builds classifier. Makes the initial call to the recursive buildTree
 * function. The name "run()" is used to support multithreading via an
 * ExecutorService.
 *
 * The "data" field of the FastRandomTree should contain a
 * reference to a DataCache prior to calling this function, and that
 * DataCache should have the "reusableRandomGenerator" field initialized.
 * The FastRfBagging class normally takes care of this before invoking this
 * function.
 *
 * On return the "data" field has been reset to null.
 */
public void run() {
  // compute initial class counts (sum of instance weights per class)
  double[] classProbs = new double[data.numClasses];
  for (int i = 0; i < data.numInstances; i++) {
    classProbs[data.instClassValues[i]] += data.instWeights[i];
  }
  // create the attribute indices window - skip class
  int[] attIndicesWindow = new int[data.numAttributes - 1];
  int j = 0;
  for (int i = 0; i < attIndicesWindow.length; i++) {
    if (j == data.classIndex) {
      j++; // do not include the class
    }
    attIndicesWindow[i] = j++;
  }
  // prepare the DataCache by:
  // ... creating an array for the whatGoesWhere field of the data
  // ... creating the sortedIndices
  data.whatGoesWhere = new int[ data.inBag.length ];
  data.createInBagSortedIndices();
  // Take the in-bag size from the first NON-class attribute. The split
  // routines in this class never touch the class column of sortedIndices, so
  // that column is not guaranteed to be populated; reading column 0
  // unconditionally would fail when the class attribute has index 0.
  // NOTE(review): confirm against DataCache.createInBagSortedIndices().
  // Falls back to column 0 (the previous behaviour) when the dataset has no
  // attribute besides the class.
  int referenceAtt = (attIndicesWindow.length > 0) ? attIndicesWindow[0] : 0;
  int lastIndex = data.sortedIndices[referenceAtt].length - 1;
  buildTree(data.sortedIndices, 0, lastIndex,
          classProbs, m_Debug, attIndicesWindow, 0);
  this.data = null;
}
/**
 * Computes class distribution of an instance using the FastRandomTree.
 *
 * In Weka's RandomTree, the distributions were normalized so that all
 * probabilities sum to 1; this would abolish the effect of instance weights
 * on voting. In FastRandomForest 0.97 onwards, the distributions are
 * normalized by dividing with the number of instances going into a leaf.
 *
 * A leaf returns its stored class probabilities array itself (not a copy).
 * When the split attribute is missing in the instance, the distributions of
 * all successors are blended, weighted by the branch proportions.
 *
 * @param instance the instance to compute the distribution for
 * @return the computed class distribution
 * @throws Exception if computation fails
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
  // leaf: nothing to route
  if (m_Attribute <= -1) {
    return m_ClassProbs;
  }

  // missing value: send the instance down every branch and mix the answers
  if (instance.isMissing(m_Attribute)) {
    final double[] blended = new double[m_MotherForest.m_Info.numClasses()];
    for (int branch = 0; branch < m_Successors.length; branch++) {
      final double[] branchDist = m_Successors[branch].distributionForInstance(instance);
      if (branchDist == null) {
        continue;
      }
      for (int cls = 0; cls < branchDist.length; cls++) {
        blended[cls] += m_Prop[branch] * branchDist[cls];
      }
    }
    return blended;
  }

  // 0.99: binary splits (also) for nominal attributes - "equal to the split
  // value" versus the rest; numeric attributes split on "below the threshold"
  final boolean takeFirstBranch;
  if (m_MotherForest.m_Info.attribute(m_Attribute).isNominal()) {
    takeFirstBranch = instance.value(m_Attribute) == m_SplitPoint;
  } else {
    takeFirstBranch = instance.value(m_Attribute) < m_SplitPoint;
  }
  final FastRandomTree next = takeFirstBranch ? m_Successors[0] : m_Successors[1];
  return next.distributionForInstance(instance);
}
/**
 * Computes class distribution of an instance stored in a DataCache, using
 * the FastRandomTree.
 *
 * Works correctly only if the DataCache has the same attributes as the one
 * used to train the FastRandomTree - but this function does not check for
 * that!
 *
 * Main use of this is to compute out-of-bag error (also when finding feature
 * importances).
 *
 * @param data the DataCache holding the instance
 * @param instIdx index of the instance within the DataCache
 * @return the computed class distribution; for a leaf this is the stored
 *         class probabilities array itself (not a copy)
 */
public double[] distributionForInstanceInDataCache(DataCache data, int instIdx) {
  // leaf: nothing to route
  if (m_Attribute <= -1) {
    return m_ClassProbs;
  }

  // missing value: send the instance down every branch and mix the answers
  if (data.isValueMissing(m_Attribute, instIdx)) {
    final double[] blended = new double[m_MotherForest.m_Info.numClasses()];
    for (int branch = 0; branch < m_Successors.length; branch++) {
      final double[] branchDist =
          m_Successors[branch].distributionForInstanceInDataCache(data, instIdx);
      if (branchDist == null) {
        continue;
      }
      for (int cls = 0; cls < branchDist.length; cls++) {
        blended[cls] += m_Prop[branch] * branchDist[cls];
      }
    }
    return blended;
  }

  // 0.99: binary splits (also) for nominal attributes - "equal to the split
  // value" versus the rest; numeric attributes split on "below the threshold"
  final boolean takeFirstBranch;
  if (data.isAttrNominal(m_Attribute)) {
    takeFirstBranch = data.vals[m_Attribute][instIdx] == m_SplitPoint;
  } else {
    takeFirstBranch = data.vals[m_Attribute][instIdx] < m_SplitPoint;
  }
  final FastRandomTree next = takeFirstBranch ? m_Successors[0] : m_Successors[1];
  return next.distributionForInstanceInDataCache(data, instIdx);
}
/**
 * Recursively generates a tree. A derivative of the buildTree function from
 * the "weka.classifiers.trees.RandomTree" class, with the following changes
 * made:
 * <ul>
 * <li>m_ClassProbs are now remembered only in leaves, not in every node of
 * the tree</li>
 * <li>m_Distribution has been removed</li>
 * <li>members of dists, splits, props and vals arrays which are not used are
 * dereferenced prior to recursion to reduce memory requirements</li>
 * <li>a check for "branch with no training instances" is now (FastRF 0.98)
 * made before recursion; each branch is checked against its own index
 * range, and an empty branch receives the parent's class probabilities,
 * already normalized</li>
 * <li>each new 'tree' (i.e. node or leaf) is passed a reference to its
 * 'mother forest', necessary to look up parameters such as maxDepth and K</li>
 * <li>pre-split entropy is not recalculated unnecessarily</li>
 * <li>uses DataCache instead of weka.core.Instances, the reference to the
 * DataCache is stored as a field in FastRandomTree class and not passed
 * recursively down new buildTree() calls</li>
 * <li>similarly, a reference to the random number generator is stored
 * in a field of the DataCache</li>
 * <li>m_ClassProbs are now normalized by dividing with number of instances
 * in leaf, instead of forcing the sum of class probabilities to 1.0;
 * this has a large effect when class/instance weights are set by user</li>
 * <li>a little imprecision is allowed in checking whether there was a
 * decrease in entropy after splitting</li>
 * <li>0.99: the temporary arrays splits, props, vals now are not wide
 * as the full number of attributes in the dataset (of which only "k" columns
 * of randomly chosen attributes get filled). Now, it's just a single array
 * which gets replaced as the k features are evaluated sequentially, but it
 * gets replaced only if a next feature is better than a previous one.</li>
 * <li>0.99: the SortedIndices are now not cut up into smaller arrays on every
 * split, but rather re-sorted within the same array in the splitDataNew(),
 * and passed down to buildTree() as the original large matrix, but with
 * start and end points explicitly specified</li>
 * </ul>
 *
 * The "data" field of this node is reset to null before returning.
 *
 * @param sortedIndices the indices of the instances of the whole bootstrap replicate
 * @param startAt First index of the instance to consider in this split; inclusive.
 * @param endAt Last index of the instance to consider; inclusive.
 * @param classProbs the class distribution (un-normalized); modified in place
 * and kept as m_ClassProbs when this node becomes a leaf
 * @param debug whether debugging is on
 * @param attIndicesWindow the attribute window to choose attributes from;
 * its entries get reordered while attributes are drawn
 * @param depth the current depth
 * @throws IllegalArgumentException if distributionSequentialAtt() reports an
 * improved split whose score is not better than the best one so far
 */
protected void buildTree(int[][] sortedIndices, int startAt, int endAt,
        double[] classProbs,
        boolean debug,
        int[] attIndicesWindow,
        int depth) {
  m_Debug = debug;
  int sortedIndicesLength = endAt - startAt + 1;

  // Check if node doesn't contain enough instances or is pure
  // or maximum depth reached, make leaf.
  if ( ( sortedIndicesLength < Math.max(2, getMinNum()) )                            // small
    || Utils.eq( classProbs[Utils.maxIndex(classProbs)], Utils.sum(classProbs) )      // pure
    || ( (getMaxDepth() > 0) && (depth >= getMaxDepth()) )                            // deep
    ) {
    m_Attribute = -1; // indicates leaf (no useful attribute to split on)
    // normalize by dividing with the number of instances (as of ver. 0.97)
    // unless leaf is empty - an empty leaf was handed probabilities that its
    // parent already normalized
    if ( sortedIndicesLength != 0 ) {
      for (int c = 0; c < classProbs.length; c++) {
        classProbs[c] /= sortedIndicesLength;
      }
    }
    m_ClassProbs = classProbs;
    this.data = null;
    return;
  } // (leaf making)

  // new 0.99: all the following are for the best attribute only! they're
  // updated while going sequentially through the attributes
  double val = Double.NaN;   // value of splitting criterion
  double[][] dist = new double[2][data.numClasses]; // class distributions (contingency table), indexed first by branch, then by class
  double[] prop = new double[2];                    // the branch sizes (as fraction)
  double split = Double.NaN; // split point

  // Investigate K random attributes
  int attIndex = 0;
  int windowSize = attIndicesWindow.length;
  int k = getKValue();
  boolean sensibleSplitFound = false;
  double prior = Double.NaN;
  double bestNegPosterior = -Double.MAX_VALUE;
  int bestAttIdx = -1;

  // keeps drawing past K attributes for as long as no sensible split was found
  while ((windowSize > 0) && (k-- > 0 || !sensibleSplitFound ) ) {
    int chosenIndex = data.reusableRandomGenerator.nextInt(windowSize);
    attIndex = attIndicesWindow[chosenIndex];
    // shift chosen attIndex out of window
    attIndicesWindow[chosenIndex] = attIndicesWindow[windowSize - 1];
    attIndicesWindow[windowSize - 1] = attIndex;
    windowSize--;

    // new: 0.99
    double candidateSplit = distributionSequentialAtt( prop, dist,
            bestNegPosterior, attIndex,
            sortedIndices[attIndex], startAt, endAt );
    if ( Double.isNaN(candidateSplit) ) {
      continue;  // we did not improve over a previous attribute! "dist" is unchanged from before
    }
    // by this point we know we have an improvement, so we keep the new split point
    split = candidateSplit;
    bestAttIdx = attIndex;

    if ( Double.isNaN(prior) ) { // needs to be computed only once per branch - is same for all attributes (even regardless of missing values)
      prior = SplitCriteria.entropyOverColumns(dist);
    }
    double negPosterior = - SplitCriteria.entropyConditionedOnRows(dist);  // this is an updated dist
    if ( negPosterior > bestNegPosterior ) {
      bestNegPosterior = negPosterior;
    } else {
      throw new IllegalArgumentException("Very strange!");
    }

    val = prior - (-negPosterior); // we want the greatest reduction in entropy
    if ( val > 1e-2 ) {            // we allow some leeway here to compensate
      sensibleSplitFound = true;   // for imprecision in entropy computation
    }
  } // feature by feature in window

  if ( sensibleSplitFound ) {
    m_Attribute = bestAttIdx;   // find best attribute
    m_SplitPoint = split;
    m_Prop = prop;

    int belowTheSplitStartsAt = splitDataNew( m_Attribute, m_SplitPoint, sortedIndices, startAt, endAt );

    m_Successors = new FastRandomTree[dist.length];  // dist.length now always == 2
    for (int i = 0; i < dist.length; i++) {
      m_Successors[i] = new FastRandomTree();
      m_Successors[i].m_MotherForest = this.m_MotherForest;
      m_Successors[i].data = this.data;
      // new in 0.99 - used in distributionSequentialAtt()
      m_Successors[i].tempDists = this.tempDists;
      m_Successors[i].tempDistsOther = this.tempDistsOther;
      m_Successors[i].tempProps = this.tempProps;

      // index range (inclusive) of the instances that went into branch i
      int branchStart = (i == 0) ? startAt : belowTheSplitStartsAt;
      int branchEnd   = (i == 0) ? belowTheSplitStartsAt - 1 : endAt;

      // Check if THIS branch is about to be empty. The test is made per
      // branch: testing only the first branch's size for both successors
      // would pre-normalize the non-empty sibling (which then gets divided
      // by the instance count a second time in its leaf) and would never
      // detect an empty second branch.
      if ( branchEnd < branchStart ) {
        // in this case, modify dist[i] so that it contains the current,
        // before-split class probabilities, properly normalized by the
        // number of instances (as we won't be able to normalize after the
        // split)
        for ( int j = 0; j < dist[i].length; j++ ) {
          dist[i][j] = classProbs[j] / sortedIndicesLength;
        }
      }

      m_Successors[i].buildTree(sortedIndices, branchStart, branchEnd,
              dist[i], m_Debug, attIndicesWindow, depth + 1);

      dist[i] = null; // drop our reference before recursing into the sibling
    }
  } else { // ------ make leaf --------
    m_Attribute = -1;
    // normalize by dividing with the number of instances (as of ver. 0.97)
    // unless leaf is empty
    if ( sortedIndicesLength != 0 ) {
      for (int c = 0; c < classProbs.length; c++) {
        classProbs[c] /= sortedIndicesLength;
      }
    }
    m_ClassProbs = classProbs;
  }

  this.data = null; // dereference all pointers so data can be GC'd after tree is built
}
/**
 * Counts the nodes of the (sub)tree rooted at this node, leaves included.
 *
 * @return the number of nodes
 */
public int numNodes() {
  int total = 1; // this node itself
  if (m_Attribute != -1) {
    for (FastRandomTree child : m_Successors) {
      total += child.numNodes();
    }
  }
  return total;
}
/**
 * Splits instances into two subsets, writing freshly allocated index arrays
 * into subsetIndices. Not used anymore in 0.99. This is a derivative of the
 * splitData function from "weka.classifiers.trees.RandomTree", with the
 * following changes:
 *
 * - An instance with a missing value in the split attribute is assigned to
 * one of the branches at random, with bigger branches (as given by m_Prop)
 * having a higher probability of getting the instance.
 *
 * - The branch of every instance is decided only once and stored in the
 * DataCache's whatGoesWhere field, which is then read back when the
 * per-attribute subsetIndices arrays are filled.
 *
 * As a consequence, the exact branch sizes are known before the
 * subsetIndices arrays are allocated, so they never have to be resized.
 *
 * @param subsetIndices receives the sorted indices of each subset, indexed
 * by branch, then by attribute; the class attribute's entries are not touched
 * @param att the attribute index
 * @param splitPoint the splitpoint: threshold for numeric attributes, the
 * category sent to the first branch for nominal attributes
 * @param sortedIndices the sorted indices of the whole set
 */
protected void splitData( int[][][] subsetIndices,
        int att, double splitPoint,
        int[][] sortedIndices ) {
  final Random rng = data.reusableRandomGenerator;
  // 0.99: we have binary splits also for nominal data
  final int[] branchSizes = new int[2]; // how many instances go to each branch
  final boolean nominal = data.isAttrNominal(att);
  final int[] splitColumn = sortedIndices[att];

  // ---- pass 1: decide the branch of every instance, once ----
  for (int pos = 0; pos < splitColumn.length; pos++) {
    final int inst = splitColumn[pos];
    final int branch;
    if ( !data.isValueMissing(att, inst) ) {
      if (nominal) {
        // the category to "split out" goes above the split, all others below
        branch = ( data.vals[att][inst] == splitPoint ) ? 0 : 1;
      } else {
        branch = ( data.vals[att][inst] < splitPoint ) ? 0 : 1;
      }
    } else if (nominal) {
      // missing, nominal: walk the cumulative branch proportions
      double remaining = rng.nextDouble();
      int picked = -1;
      for (int b = 0; b < m_Prop.length; b++) {
        remaining -= m_Prop[b];
        if ( (remaining <= 0) || b == (m_Prop.length - 1) ) {
          picked = b;
          break;
        }
      }
      branch = picked;
    } else {
      // missing, numeric: instances with missing values get processed LAST
      // (sort order) so branch sizes are known by now (stored in m_Prop)
      branch = ( rng.nextDouble() > m_Prop[0] ) ? 1 : 0;
    }
    data.whatGoesWhere[ inst ] = branch;
    branchSizes[ branch ]++;
  }

  // ---- pass 2: allocate the subset arrays at their final size ----
  for (int a = 0; a < data.numAttributes; a++) {
    if ( a != data.classIndex ) { // class column needs no sorted indices
      for (int b = 0; b < branchSizes.length; b++) {
        subsetIndices[b][a] = new int[ branchSizes[b] ];
      }
    }
  }

  // ---- pass 3: distribute each attribute's sorted order over the branches ----
  final int[] cursor = new int[ branchSizes.length ];
  for (int a = 0; a < data.numAttributes; a++) {
    if ( a == data.classIndex ) {
      continue;
    }
    Arrays.fill(cursor, 0);
    for (int inst : sortedIndices[a]) {
      final int branch = data.whatGoesWhere[ inst ]; // can be 0 or 1
      subsetIndices[ branch ][ a ][ cursor[branch]++ ] = inst;
    }
  }
}
/**
 * Splits instances into subsets; new for FastRF 0.99. Does not create new
 * arrays with split indices, but rather reorganizes the indices within the
 * supplied sortedIndices to conform with the split: within [startAt, endAt]
 * of every non-class attribute, the instances of the first branch come first
 * and those of the second branch after them, each group keeping its relative
 * order. Works only within given boundaries.
 *
 * The branch of each instance is recorded in the DataCache's whatGoesWhere
 * field. Instances with a missing value in the split attribute are assigned
 * at random according to m_Prop.
 *
 * Note: as of 0.99, all splits (incl. categorical) are always binary.
 *
 * @param att the attribute index
 * @param splitPoint the splitpoint: threshold for numeric attributes, the
 * category sent to the first branch for nominal attributes
 * @param sortedIndices the sorted indices of the whole set - gets overwritten!
 * @param startAt Inclusive, 0-based index. Does not touch anything before this value.
 * @param endAt Inclusive, 0-based index. Does not touch anything after this value.
 *
 * @return the first index of the "below the split" instances
 */
protected int splitDataNew(
        int att, double splitPoint,
        int[][] sortedIndices, int startAt, int endAt ) {
  final Random rng = data.reusableRandomGenerator;
  // 0.99: we have binary splits also for nominal data
  final int[] branchSizes = new int[2]; // how many instances go to each branch
  final int span = endAt - startAt + 1;
  // we might possibly want to recycle this array for the whole tree
  final int[] scratch = new int[ span ];
  final boolean nominal = data.isAttrNominal(att);

  // ---- pass 1: decide the branch of every instance, once ----
  for (int pos = startAt; pos <= endAt; pos++) {
    final int inst = sortedIndices[att][pos];
    final int branch;
    if ( !data.isValueMissing(att, inst) ) {
      if (nominal) {
        // the category to "split out" goes above the split, all others below
        branch = ( data.vals[att][inst] == splitPoint ) ? 0 : 1;
      } else {
        branch = ( data.vals[att][inst] < splitPoint ) ? 0 : 1;
      }
    } else if (nominal) {
      // missing, nominal: walk the cumulative branch proportions
      double remaining = rng.nextDouble();
      int picked = -1;
      for (int b = 0; b < m_Prop.length; b++) {
        remaining -= m_Prop[b];
        if ( (remaining <= 0) || b == (m_Prop.length - 1) ) {
          picked = b;
          break;
        }
      }
      branch = picked;
    } else {
      // missing, numeric: instances with missing values get processed LAST
      // (sort order) so branch sizes are known by now (stored in m_Prop)
      branch = ( rng.nextDouble() > m_Prop[0] ) ? 1 : 0;
    }
    data.whatGoesWhere[ inst ] = branch;
    branchSizes[ branch ]++;
  }

  // ---- pass 2: stable partition of every attribute's window ----
  for (int a = 0; a < data.numAttributes; a++) {
    if ( a == data.classIndex ) {
      continue;
    }
    final int[] column = sortedIndices[a];
    int nextAbove = 0;              // write position for the first branch
    int nextBelow = branchSizes[0]; // second branch starts right after the first
    Arrays.fill(scratch, 0);
    for (int pos = startAt; pos <= endAt; pos++) {
      final int inst = column[pos];
      if ( data.whatGoesWhere[ inst ] == 0 ) { // can be only 0 or 1
        scratch[ nextAbove++ ] = inst;
      } else {
        scratch[ nextBelow++ ] = inst;
      }
    }
    // overwrite the window of this attribute with the partitioned order
    System.arraycopy( scratch, 0, column, startAt, span );
  }

  return startAt + branchSizes[0]; // the first index of "below the split" instances
}
/**
* Computes class distribution for an attribute and the split point to use
* for it. Not used anymore in 0.99.
* Based on code from "weka.classifiers.trees.RandomTree",
* with the following changes:
*
* - entropy pre-split is not computed at this point as the only thing
* relevant for the (comparative) goodness of a split is entropy after splitting
*
* - dist[][] is now computed only after the split point has been found,
* and not updated continually by copying from currDist
*
* - also, in Weka's RandomTree it was possible to create a split 'in the
* middle' of instance 0, which would result in empty nodes after the
* split; this is now fixed
*
* - instance 0 is now generally skipped when looking for split points,
* as the split point 'before instance 0' is not sensible; in versions
* prior to 0.96 this change introduced a bug where attributes with
* all missing values had their dists computed wrongly, which might
* result in useless (but harmless) branches being added to the tree
*
* Every scan over sortedIndices stops at the first instance whose value is
* missing; the weights of the instances from that position on are then
* spread over all branches in proportion to the branch sizes.
*
* @param props gets filled with relative sizes of branches (total = 1), indexed
* first per attribute; only the entry props[att] is replaced
* @param dists these are the contingency matrices, indexed first per attribute;
* only the entry dists[att] is replaced
* @param att the attribute index (which one to change)
* @param sortedIndices the sorted indices of the vals
* @return the split point: 0 for a nominal attribute; for a numeric attribute
* the midpoint between the two values around the best split, or
* -Double.MAX_VALUE when no valid split point was found
*/
protected double distribution( double[][] props, double[][][] dists,
int att, int[] sortedIndices ) {
double splitPoint = -Double.MAX_VALUE;
double[][] dist = null; // a contingency table of the split point vs class
// declared outside the loops on purpose: after the scans below, "i" marks
// where the instances with missing values start
int i;
if ( data.isAttrNominal(att) ) { // ====================== nominal attributes
// one row per nominal value of the attribute
dist = new double[data.attNumVals[att]][data.numClasses];
for (i = 0; i < sortedIndices.length; i++) {
int inst = sortedIndices[i];
if ( data.isValueMissing(att, inst) )
break;
dist[ (int)data.vals[att][inst] ][ data.instClassValues[inst] ] += data.instWeights[inst];
}
splitPoint = 0; // signals we've found a sensible split point; by
// definition, a split on a nominal attribute is sensible
} else { // ============================================ numeric attributes
double[][] currDist = new double[2][data.numClasses];
dist = new double[2][data.numClasses];
//begin with moving all instances into second subset
for (int j = 0; j < sortedIndices.length; j++) {
int inst = sortedIndices[j];
if ( data.isValueMissing(att, inst) )
break;
currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
}
// dist starts out as the default "everything in the second branch" table
copyDists(currDist, dist);
double currVal = -Double.MAX_VALUE; // current value of splitting criterion
double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed
for (i = 1; i < sortedIndices.length; i++) { // --- try all split points
int inst = sortedIndices[i];
if ( data.isValueMissing(att, inst) )
break;
// move the previous instance from the second branch into the first
int prevInst = sortedIndices[i-1];
currDist[0][ data.instClassValues[ prevInst ] ]
+= data.instWeights[ prevInst ] ;
currDist[1][ data.instClassValues[ prevInst ] ]
-= data.instWeights[ prevInst ] ;
// do not allow splitting between two instances with the same value
if ( data.vals[att][inst] > data.vals[att][prevInst] ) {
// we want the lowest impurity after split; at this point, we don't
// really care what we've had before spliting
currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
if (currVal > bestVal) {
bestVal = currVal;
bestI = i;
}
}
} // ------- end split points
/*
* Determine the best split point:
* bestI == 0 only if all instances had missing values, or there were
* less than 2 instances; splitPoint will remain set as -Double.MAX_VALUE.
* This is not really a useful split, as all of the instances are 'below'
* the split line, but at least it's formally correct. And the dists[]
* also has a default value set previously.
*/
if ( bestI > 0 ) { // ...at least one valid splitpoint was found
int instJustBeforeSplit = sortedIndices[bestI-1];
int instJustAfterSplit = sortedIndices[bestI];
splitPoint = ( data.vals[ att ][ instJustAfterSplit ]
+ data.vals[ att ][ instJustBeforeSplit ] ) / 2.0;
// Now make the correct dist[] from the default dist[] (all instances
// in the second branch, by iterating through instances until we reach
// bestI, and then stop.
for ( int ii = 0; ii < bestI; ii++ ) {
int inst = sortedIndices[ii];
dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ] ;
dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ] ;
}
}
} // ================================================== nominal or numeric?
// compute total weights for each branch (= props)
props[att] = countsToFreqs(dist);
// distribute counts of instances with missing values
// ver 0.96 - check for special case when *all* instances have missing vals
// (the numeric scan starts at index 1, so "i" would otherwise skip instance 0)
if ( data.isValueMissing(att, sortedIndices[0]) )
i = 0;
// from "i" onwards: add each remaining instance's weight to every branch,
// scaled by that branch's proportion
while (i < sortedIndices.length) {
int inst = sortedIndices[i];
for (int branch = 0; branch < dist.length; branch++) {
dist[ branch ][ data.instClassValues[inst] ]
+= props[ att ][ branch ] * data.instWeights[ inst ] ;
}
i++;
}
// return distribution after split and best split point
dists[att] = dist;
return splitPoint;
}
/**
 * Computes the class distribution of the best binary split on one attribute,
 * looking only at the instances referenced by
 * sortedIndicesOfAtt[startAt..endAt]. New in FastRF 0.99, main changes:
 *
 * - now reuses the temporary counting arrays (this.tempDists,
 *   this.tempDistsOther, this.tempProps) instead of creating/destroying arrays
 *
 * - does not create a new "dists" for each attribute it examines; instead
 *   it replaces the existing "dists" (supplied as a parameter) but only if the
 *   split is better than the previous best split
 *
 * - always creates binary splits, even for categorical variables; thus
 *   might give slightly different classification results than the old
 *   RandomForest
 *
 * The scans below rely on the indices being ordered by attribute value, with
 * instances that have a missing value placed at the end of the range.
 *
 * @param propsBestAtt gets filled with relative sizes of branches (total = 1)
 * for the best examined attribute so far; updated ONLY if the current
 * attribute is better than the previous best
 * @param distsBestAtt the contingency matrices for the best examined
 * attribute so far; updated ONLY if the current attribute is better than the
 * previous best
 * @param scoreBestAtt checked against the score of attToExamine to determine
 * whether propsBestAtt and distsBestAtt need to be updated
 * @param attToExamine the attribute index (which one to examine, and change
 * the above matrices if the attribute is better than the previous one)
 * @param sortedIndicesOfAtt the sorted indices of the vals for attToExamine
 * @param startAt index in sortedIndicesOfAtt; nothing below this index is touched
 * @param endAt index in sortedIndicesOfAtt; nothing after this index is touched
 * @return the split point (for a nominal attribute: the index of the level
 * that is split out) if this attribute scored better than scoreBestAtt;
 * Double.NaN if it did not, if no valid split point exists, or if all
 * examined instances have a missing value for the attribute
 */
protected double distributionSequentialAtt( double[] propsBestAtt, double[][] distsBestAtt,
    double scoreBestAtt, int attToExamine, int[] sortedIndicesOfAtt, int startAt, int endAt ) {

  double splitPoint = -Double.MAX_VALUE;

  // dist: contingency table (branch x class) of the best split on this attribute
  double[][] dist = this.tempDists;
  Arrays.fill( dist[0], 0.0 );
  Arrays.fill( dist[1], 0.0 );

  // currDist: scratch table for the candidate split currently being scored
  double[][] currDist = this.tempDistsOther;
  Arrays.fill( currDist[0], 0.0 );
  Arrays.fill( currDist[1], 0.0 );

  int i;

  // find how many missing values we have for this attribute (they're always at the end)
  int lastNonmissingValIdx = endAt;
  for (int j = endAt; j >= startAt; j-- ) {
    if ( data.isValueMissing(attToExamine, sortedIndicesOfAtt[j]) ) {
      lastNonmissingValIdx = j-1;
    } else {
      break;
    }
  }
  if ( lastNonmissingValIdx < startAt ) { // only missing values in this feature
    return Double.NaN; // we cannot split on it
  }

  if ( data.isAttrNominal(attToExamine) ) { // ====================== nominal attributes

    // 0.99: new routine - makes a one-vs-all split on categorical attributes
    int numLvls = data.attNumVals[attToExamine];
    int bestLvl = 0; // the index of the category which is best to "split out"

    // note: if we have only two levels, it doesn't matter which one we "split out"
    // we can thus safely check only the first one
    if ( numLvls <= 2 ) {

      bestLvl = 0; // this means that the category with index 0 always
                   // goes 'above' the split and category with index 1 goes 'below' the split
      for (i = startAt; i <= lastNonmissingValIdx; i++) {
        int inst = sortedIndicesOfAtt[i];
        dist[ (int)data.vals[attToExamine][inst] ][ data.instClassValues[inst] ] += data.instWeights[inst];
      }

    } else { // for >2 levels, we have to search different splits

      // begin with moving all instances into second subset ("below split")
      for (int j = startAt; j <= lastNonmissingValIdx; j++) {
        int inst = sortedIndicesOfAtt[j];
        currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
      }
      // create a default dist[] which we'll modify after we find the best class to split out
      copyDists(currDist, dist);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int lastSeen = startAt; // used to avoid looping through all instances for every lvl

      for ( int lvl = 0; lvl < numLvls; lvl++ ) {

        // reset the currDist to the default (everything "below split") - conveniently stored in dist[][]
        copyDists(dist, currDist);

        for (i = lastSeen; i <= lastNonmissingValIdx; i++) {
          lastSeen = i;
          int inst = sortedIndicesOfAtt[i];
          if ( (int)data.vals[attToExamine][inst] < lvl ) {
            continue;
          } else if ( (int)data.vals[attToExamine][inst] == lvl ) {
            // move to "above split" from "below split"
            currDist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ] ;
            currDist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ] ;
          } else {
            break; // no need to loop forward, no more instances of this category
          }
        }

        // we filled the "currDist" for the current level, find score and see if we like it
        currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
        if ( currVal > bestVal ) {
          bestVal = currVal;
          bestLvl = lvl;
        }

      } // examine how well "splitting out" of individual levels works for us

      // remember the contingency table from the best "lvl" and store it in "dist"
      for (i = startAt; i <= lastNonmissingValIdx; i++) {
        int inst = sortedIndicesOfAtt[i];
        int lvlOfInst = (int)data.vals[attToExamine][inst];
        if ( lvlOfInst < bestLvl ) {
          // Instances of lower levels come first in the ordering and must be
          // skipped, not treated as the end of the scan; otherwise nothing is
          // moved unless bestLvl happens to be the lowest level present.
          continue;
        } else if ( lvlOfInst == bestLvl ) {
          // move to "above split" from "below split"
          dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ] ;
          dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ] ;
        } else {
          break; // no need to loop forward, no more instances of this category
        }
      }

    }

    splitPoint = bestLvl; // signals we've found a sensible split point; by
                          // definition, a split on a nominal attribute
                          // will always be sensible

  } else { // ============================================ numeric attributes

    // begin with moving all instances into second subset ("below split")
    for (int j = startAt; j <= lastNonmissingValIdx; j++) {
      int inst = sortedIndicesOfAtt[j];
      currDist[1][ data.instClassValues[inst] ] += data.instWeights[inst];
    }
    copyDists(currDist, dist);

    double currVal = -Double.MAX_VALUE; // current value of splitting criterion
    double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
    int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

    for (i = startAt+1; i <= lastNonmissingValIdx; i++) { // --- try all split points

      int inst = sortedIndicesOfAtt[i];
      int prevInst = sortedIndicesOfAtt[i-1];

      // the previous instance crosses over to the first branch
      currDist[0][ data.instClassValues[ prevInst ] ]
              += data.instWeights[ prevInst ] ;
      currDist[1][ data.instClassValues[ prevInst ] ]
              -= data.instWeights[ prevInst ] ;

      // do not allow splitting between two instances with the same value
      if ( data.vals[attToExamine][inst] > data.vals[attToExamine][prevInst] ) {
        // we want the lowest impurity after split; at this point, we don't
        // really care what we've had before splitting
        currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
        if (currVal > bestVal) {
          bestVal = currVal;
          bestI = i;
        }
      }

    } // ------- end trying split points

    /*
     * Determine the best split point:
     * bestI keeps its initial value 0 when no candidate position was accepted
     * (fewer than 2 non-missing instances, or no two neighbours with different
     * values); splitPoint then remains -Double.MAX_VALUE and dist[] keeps the
     * default set above (all instances in the second branch).
     */
    if ( bestI > startAt ) { // ...at least one valid splitpoint was found

      int instJustBeforeSplit = sortedIndicesOfAtt[bestI-1];
      int instJustAfterSplit = sortedIndicesOfAtt[bestI];
      splitPoint = ( data.vals[ attToExamine ][ instJustAfterSplit ]
              + data.vals[ attToExamine ][ instJustBeforeSplit ] ) / 2.0;

      // now make the correct dist[] (for the best split point) from the
      // default dist[] (all instances in the second branch), by iterating
      // through instances until we reach bestI, and then stop.
      for ( int ii = startAt; ii < bestI; ii++ ) {
        int inst = sortedIndicesOfAtt[ii];
        dist[0][ data.instClassValues[ inst ] ] += data.instWeights[ inst ] ;
        dist[1][ data.instClassValues[ inst ] ] -= data.instWeights[ inst ] ;
      }

    }

  } // ================================================== nominal or numeric?

  // compute total weights for each branch (= props)
  // again, we reuse the tempProps of the tree not to create/destroy new arrays
  double[] props = this.tempProps;
  countsToFreqs(dist, props); // props gets overwritten, previous contents don't matter

  // distribute *counts* of instances with missing values using the "props"
  i = lastNonmissingValIdx + 1; // start 1 after the last non-missing val (if there is anything)
  while ( i <= endAt ) {
    int inst = sortedIndicesOfAtt[i];
    dist[ 0 ][ data.instClassValues[inst] ] += props[ 0 ] * data.instWeights[ inst ] ;
    dist[ 1 ][ data.instClassValues[inst] ] += props[ 1 ] * data.instWeights[ inst ] ;
    i++;
  }

  // update the distribution after split and best split point
  // but ONLY if better than the previous one -- we need to recalculate the
  // entropy (because this changes after redistributing the instances with
  // missing values in the current attribute). Also, for categorical variables
  // it was not calculated before.
  double curScore = -SplitCriteria.entropyConditionedOnRows(dist);
  if ( curScore > scoreBestAtt && splitPoint > -Double.MAX_VALUE ) {
    // overwrite the "distsBestAtt" and "propsBestAtt" with current values
    copyDists(dist, distsBestAtt);
    System.arraycopy( props, 0, propsBestAtt, 0, props.length );
    return splitPoint;
  } else {
    // returns a NaN instead of the splitpoint if the attribute was not better than a previous one.
    return Double.NaN;
  }

}
/**
 * Converts per-branch class counts ("dist") into relative branch sizes
 * ("props"). Allocates and returns a new double[] with one entry per row of
 * "dist"; if the branch totals sum to zero, every branch gets an equal share.
 *
 * @param dist the contingency table, one row per branch
 * @return a newly created array holding the relative size of each branch
 */
protected static double[] countsToFreqs( double[][] dist ) {
  final int numBranches = dist.length;
  final double[] props = new double[numBranches];

  // total weight that ended up in each branch
  for (int branch = 0; branch < numBranches; branch++) {
    props[branch] = Utils.sum(dist[branch]);
  }

  if (!Utils.eq(Utils.sum(props), 0)) {
    FastRfUtils.normalize(props);
    return props;
  }

  // nothing was counted in any branch: fall back to equal shares
  Arrays.fill(props, 1.0 / (double) numBranches);
  return props;
}
/**
 * Converts per-branch class counts ("dist") into relative branch sizes,
 * writing them into the caller-supplied "props" instead of allocating.
 *
 * Overwrites the supplied "props"!
 *
 * props.length must be == dist.length.
 *
 * @param dist the contingency table, one row per branch
 * @param props receives the relative size of each branch; if the branch
 * totals sum to zero, every entry is set to 1 / props.length
 */
protected static void countsToFreqs( double[][] dist, double[] props ) {
  final int numBranches = props.length;

  // total weight that ended up in each branch
  for (int branch = 0; branch < numBranches; branch++) {
    props[branch] = Utils.sum(dist[branch]);
  }

  if (!Utils.eq(Utils.sum(props), 0)) {
    FastRfUtils.normalize(props);
    return;
  }

  // nothing was counted in any branch: fall back to equal shares
  Arrays.fill(props, 1.0 / (double) numBranches);
}
/**
 * Copies the contents of a "dists" array, which is a 2 x numClasses array,
 * into another one. Only rows 0 and 1 are copied; the row arrays of the
 * destination are reused, not replaced.
 *
 * @param distFrom the table to copy from; left unchanged
 * @param distTo gets overwritten; each of its two rows must be at least as
 * long as the corresponding row of distFrom
 */
protected static void copyDists( double[][] distFrom, double[][] distTo ) {
  // bulk copy instead of element-by-element loops
  System.arraycopy( distFrom[0], 0, distTo[0], 0, distFrom[0].length );
  System.arraycopy( distFrom[1], 0, distTo[1], 0, distFrom[1].length );
}
/**
 * Command-line entry point: creates a FastRandomTree and passes it, together
 * with the command-line arguments, to runClassifier.
 *
 * @param argv the commandline parameters
 */
public static void main(String[] argv) {
  final FastRandomTree tree = new FastRandomTree();
  runClassifier(tree, argv);
}
/**
 * Returns the result of RevisionUtils.extract applied to this class's
 * revision tag.
 *
 * @return the extracted revision string
 */
@Override
public String getRevision() {
  final String revisionTag = "$Revision: 0.99$";
  return RevisionUtils.extract(revisionTag);
}
}