weka.core.neighboursearch.KDTree Maven / Gradle / Ivy
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
/*
* KDTree.java
* Copyright (C) 2000-2012 University of Waikato
*
*/
package weka.core.neighboursearch;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.neighboursearch.kdtrees.KDTreeNode;
import weka.core.neighboursearch.kdtrees.KDTreeNodeSplitter;
import weka.core.neighboursearch.kdtrees.SlidingMidPointOfWidestSide;
/**
* Class implementing the KDTree search algorithm for nearest neighbour search.
* The connection to dataset is only a reference. For the tree structure the indexes are stored in an array.
* Building the tree:
* If a node has <maximal-inst-number> (option -L) instances no further splitting is done. Also if the split would leave one side empty, the branch is not split any further even if the instances in the resulting node are more than <maximal-inst-number> instances.
* **PLEASE NOTE:** The algorithm can not handle missing values, so it is advisable to run ReplaceMissingValues filter if there are any missing values in the dataset.
*
* For more information see:
*
* Jerome H. Friedman, Jon Luis Bentley, Raphael Ari Finkel (1977). An Algorithm for Finding Best Matches in Logarithmic Expected Time. ACM Transactions on Mathematics Software. 3(3):209-226.
*
* Andrew Moore (1991). A tutorial on kd-trees.
*
*
* BibTeX:
*
* @article{Friedman1977,
* author = {Jerome H. Friedman and Jon Luis Bentley and Raphael Ari Finkel},
* journal = {ACM Transactions on Mathematics Software},
* month = {September},
* number = {3},
* pages = {209-226},
* title = {An Algorithm for Finding Best Matches in Logarithmic Expected Time},
* volume = {3},
* year = {1977}
* }
*
* @techreport{Moore1991,
* author = {Andrew Moore},
* booktitle = {University of Cambridge Computer Laboratory Technical Report No. 209},
* howpublished = {Extract from PhD Thesis},
* title = {A tutorial on kd-trees},
* year = {1991},
* HTTP = {Available from http://www.autonlab.org/autonweb/14665.html}
* }
*
*
*
* Valid options are:
*
* -S <classname and options>
* Node splitting method to use.
* (default: weka.core.neighboursearch.kdtrees.SlidingMidPointOfWidestSide)
*
* -W <value>
* Set minimal width of a box
* (default: 1.0E-2).
*
* -L
* Maximal number of instances in a leaf
* (default: 40).
*
* -N
* Normalizing will be done
* (Select dimension for split, with normalising to universe).
*
*
* @author Gabi Schmidberger (gabi[at-the-rate]cs[dot]waikato[dot]ac[dot]nz)
* @author Malcolm Ware (mfw4[at-the-rate]cs[dot]waikato[dot]ac[dot]nz)
* @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz)
* @version $Revision: 12479 $
*/
public class KDTree
extends NearestNeighbourSearch
implements TechnicalInformationHandler {
/** For serialization. */
private static final long serialVersionUID = 1505717283763272533L;
/**
* Array holding the distances of the nearest neighbours. It is filled up both
* by nearestNeighbour() and kNearestNeighbours().
*/
protected double[] m_DistanceList;
/**
* Indexlist of the instances of this kdtree. Instances get sorted according
* to the splits. the nodes of the KDTree just hold their start and end
* indices
*/
protected int[] m_InstList;
/** The root node of the tree. */
protected KDTreeNode m_Root;
/** The node splitter. */
protected KDTreeNodeSplitter m_Splitter = new SlidingMidPointOfWidestSide();
/** Tree stats. */
protected int m_NumNodes, m_NumLeaves, m_MaxDepth;
/** Tree Stats variables. */
protected TreePerformanceStats m_TreeStats = null;
// Constants
/** The index of MIN value in attributes' range array. */
public static final int MIN = EuclideanDistance.R_MIN;
/** The index of MAX value in attributes' range array. */
public static final int MAX = EuclideanDistance.R_MAX;
/** The index of WIDTH (MAX-MIN) value in attributes' range array. */
public static final int WIDTH = EuclideanDistance.R_WIDTH;
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.ARTICLE);
result.setValue(Field.AUTHOR, "Jerome H. Friedman and Jon Luis Bentley and Raphael Ari Finkel");
result.setValue(Field.YEAR, "1977");
result.setValue(Field.TITLE, "An Algorithm for Finding Best Matches in Logarithmic Expected Time");
result.setValue(Field.JOURNAL, "ACM Transactions on Mathematics Software");
result.setValue(Field.PAGES, "209-226");
result.setValue(Field.MONTH, "September");
result.setValue(Field.VOLUME, "3");
result.setValue(Field.NUMBER, "3");
additional = result.add(Type.TECHREPORT);
additional.setValue(Field.AUTHOR, "Andrew Moore");
additional.setValue(Field.YEAR, "1991");
additional.setValue(Field.TITLE, "A tutorial on kd-trees");
additional.setValue(Field.HOWPUBLISHED, "Extract from PhD Thesis");
additional.setValue(Field.BOOKTITLE, "University of Cambridge Computer Laboratory Technical Report No. 209");
additional.setValue(Field.HTTP, "Available from http://www.autonlab.org/autonweb/14665.html");
return result;
}
/**
* Creates a new instance of KDTree.
*/
public KDTree() {
super();
if (getMeasurePerformance())
m_Stats = m_TreeStats = new TreePerformanceStats();
}
/**
* Creates a new instance of KDTree.
* It also builds the tree on supplied set of Instances.
* @param insts The instances/points on which the BallTree
* should be built on.
*/
public KDTree(Instances insts) {
super(insts);
if (getMeasurePerformance())
m_Stats = m_TreeStats = new TreePerformanceStats();
}
/**
* Builds the KDTree on the supplied set of instances/points. It
* is adviseable to run the replace missing attributes filter
* on the passed instances first.
* NOTE: This method should not be called from outside this
* class. Outside classes should call setInstances(Instances)
* instead.
*
* @param instances The instances to build the tree on
* @throws Exception if something goes wrong
*/
protected void buildKDTree(Instances instances) throws Exception {
checkMissing(instances);
if (m_EuclideanDistance == null)
m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance(
instances);
else
m_EuclideanDistance.setInstances(instances);
m_Instances = instances;
int numInst = m_Instances.numInstances();
// Make the global index list
m_InstList = new int[numInst];
for (int i = 0; i < numInst; i++) {
m_InstList[i] = i;
}
double[][] universe = m_EuclideanDistance.getRanges();
// initializing internal fields of KDTreeSplitter
m_Splitter.setInstances(m_Instances);
m_Splitter.setInstanceList(m_InstList);
m_Splitter.setEuclideanDistanceFunction(m_EuclideanDistance);
m_Splitter.setNodeWidthNormalization(m_NormalizeNodeWidth);
// building tree
m_NumNodes = m_NumLeaves = 1;
m_MaxDepth = 0;
m_Root = new KDTreeNode(m_NumNodes, 0, m_Instances.numInstances() - 1,
universe);
splitNodes(m_Root, universe, m_MaxDepth + 1);
}
/**
* Recursively splits nodes of a tree starting from the supplied node.
* The splitting stops for any node for which the number of instances/points
* falls below a given threshold (given by m_MaxInstInLeaf), or if the
* maximum relative width/range of the instances/points
* (i.e. max_i(max(att_i) - min(att_i)) ) falls below a given threshold
* (given by m_MinBoxRelWidth).
*
* @param node The node to start splitting from.
* @param universe The attribute ranges of the whole dataset.
* @param depth The depth of the supplied node.
* @throws Exception If there is some problem
* splitting.
*/
protected void splitNodes(KDTreeNode node, double[][] universe,
int depth) throws Exception {
double[][] nodeRanges = m_EuclideanDistance.initializeRanges(m_InstList,
node.m_Start, node.m_End);
if (node.numInstances() <= m_MaxInstInLeaf
|| getMaxRelativeNodeWidth(nodeRanges, universe) <= m_MinBoxRelWidth)
return;
// splitting a node so it is no longer a leaf
m_NumLeaves--;
if (depth > m_MaxDepth)
m_MaxDepth = depth;
m_Splitter.splitNode(node, m_NumNodes, nodeRanges, universe);
m_NumNodes += 2;
m_NumLeaves += 2;
splitNodes(node.m_Left, universe, depth + 1);
splitNodes(node.m_Right, universe, depth + 1);
}
/**
* Returns (in the supplied heap object) the k nearest
* neighbours of the given instance starting from the give
* tree node. >k neighbours are returned if there are more than
* one neighbours at the kth boundary. NOTE: This method should
* not be used from outside this class. Outside classes should
* call kNearestNeighbours(Instance, int).
*
* @param target The instance to find the nearest neighbours for.
* @param node The KDTreeNode to start the search from.
* @param k The number of neighbours to find.
* @param heap The MyHeap object to store/update the kNNs found
* during the search.
* @param distanceToParents The distance of the supplied target
* to the parents of the supplied tree node.
* @throws Exception if the nearest neighbour could not be found.
*/
protected void findNearestNeighbours(Instance target, KDTreeNode node, int k,
MyHeap heap, double distanceToParents) throws Exception {
if (node.isALeaf()) {
if (m_TreeStats != null) {
m_TreeStats.updatePointCount(node.numInstances());
m_TreeStats.incrLeafCount();
}
double distance;
// look at all the instances in this leaf
for (int idx = node.m_Start; idx <= node.m_End; idx++) {
if (target == m_Instances.instance(m_InstList[idx])) // for
// hold-one-out
// cross-validation
continue;
if (heap.size() < k) {
distance = m_EuclideanDistance.distance(target, m_Instances
.instance(m_InstList[idx]), Double.POSITIVE_INFINITY, m_Stats);
heap.put(m_InstList[idx], distance);
} else {
MyHeapElement temp = heap.peek();
distance = m_EuclideanDistance.distance(target, m_Instances
.instance(m_InstList[idx]), temp.distance, m_Stats);
if (distance < temp.distance) {
heap.putBySubstitute(m_InstList[idx], distance);
} else if (distance == temp.distance) {
heap.putKthNearest(m_InstList[idx], distance);
}
}// end else heap.size==k
}// end for
} else {
if (m_TreeStats != null) {
m_TreeStats.incrIntNodeCount();
}
KDTreeNode nearer, further;
boolean targetInLeft = m_EuclideanDistance.valueIsSmallerEqual(target,
node.m_SplitDim, node.m_SplitValue);
if (targetInLeft) {
nearer = node.m_Left;
further = node.m_Right;
} else {
nearer = node.m_Right;
further = node.m_Left;
}
findNearestNeighbours(target, nearer, k, heap, distanceToParents);
// ... now look in further half if maxDist reaches into it
if (heap.size() < k) { // if haven't found the first k
double distanceToSplitPlane = distanceToParents
+ m_EuclideanDistance.sqDifference(node.m_SplitDim, target
.value(node.m_SplitDim), node.m_SplitValue);
findNearestNeighbours(target, further, k, heap, distanceToSplitPlane);
return;
} else { // else see if ball centered at query intersects with the other
// side.
double distanceToSplitPlane = distanceToParents
+ m_EuclideanDistance.sqDifference(node.m_SplitDim, target
.value(node.m_SplitDim), node.m_SplitValue);
if (heap.peek().distance >= distanceToSplitPlane) {
findNearestNeighbours(target, further, k, heap, distanceToSplitPlane);
}
}// end else
}// end else_if an internal node
}
/**
* Returns the k nearest neighbours of the supplied instance.
* >k neighbours are returned if there are more than one
* neighbours at the kth boundary.
*
* @param target The instance to find the nearest neighbours for.
* @param k The number of neighbours to find.
* @return The k nearest neighbours (or >k if more there are than
* one neighbours at the kth boundary).
* @throws Exception if the nearest neighbour could not be found.
*/
public Instances kNearestNeighbours(Instance target, int k) throws Exception {
checkMissing(target);
if (m_Stats != null)
m_Stats.searchStart();
MyHeap heap = new MyHeap(k);
findNearestNeighbours(target, m_Root, k, heap, 0.0);
if (m_Stats != null)
m_Stats.searchFinish();
Instances neighbours = new Instances(m_Instances, (heap.size() + heap
.noOfKthNearest()));
m_DistanceList = new double[heap.size() + heap.noOfKthNearest()];
int[] indices = new int[heap.size() + heap.noOfKthNearest()];
int i = indices.length - 1;
MyHeapElement h;
while (heap.noOfKthNearest() > 0) {
h = heap.getKthNearest();
indices[i] = h.index;
m_DistanceList[i] = h.distance;
i--;
}
while (heap.size() > 0) {
h = heap.get();
indices[i] = h.index;
m_DistanceList[i] = h.distance;
i--;
}
m_DistanceFunction.postProcessDistances(m_DistanceList);
for (int idx = 0; idx < indices.length; idx++) {
neighbours.add(m_Instances.instance(indices[idx]));
}
return neighbours;
}
/**
* Returns the nearest neighbour of the supplied target
* instance.
*
* @param target The instance to find the nearest neighbour for.
* @return The nearest neighbour from among the previously
* supplied training instances.
* @throws Exception if the neighbours could not be found.
*/
public Instance nearestNeighbour(Instance target) throws Exception {
return (kNearestNeighbours(target, 1)).instance(0);
}
/**
* Returns the distances to the kNearest or 1 nearest neighbour currently
* found with either the kNearestNeighbours or the nearestNeighbour method.
*
* @return array containing the distances of the
* nearestNeighbours. The length and ordering of the array
* is the same as that of the instances returned by
* nearestNeighbour functions.
* @throws Exception if called before calling kNearestNeighbours or
* nearestNeighbours.
*/
public double[] getDistances() throws Exception {
if (m_Instances == null || m_DistanceList == null)
throw new Exception("The tree has not been supplied with a set of "
+ "instances or getDistances() has been called "
+ "before calling kNearestNeighbours().");
return m_DistanceList;
}
/**
* Builds the KDTree on the given set of instances.
* @param instances The insts on which the KDTree is to be
* built.
* @throws Exception If some error occurs while
* building the KDTree
*/
public void setInstances(Instances instances) throws Exception {
super.setInstances(instances);
buildKDTree(instances);
}
/**
* Adds one instance to the KDTree. This updates the KDTree structure to take
* into account the newly added training instance.
*
* @param instance the instance to be added. Usually the newly added instance in the
* training set.
* @throws Exception If the instance cannot be added.
*/
public void update(Instance instance) throws Exception { // better to change
// to addInstance
if (m_Instances == null)
throw new Exception("No instances supplied yet. Have to call "
+ "setInstances(instances) with a set of Instances " + "first.");
addInstanceInfo(instance);
addInstanceToTree(instance, m_Root);
}
/**
* Recursively adds an instance to the tree starting from
* the supplied KDTreeNode.
* NOTE: This should not be called by outside classes,
* outside classes should instead call update(Instance)
* method.
*
* @param inst The instance to add to the tree
* @param node The node to start the recursive search
* from, for the leaf node where the supplied instance
* would go.
* @throws Exception If some error occurs while adding
* the instance.
*/
protected void addInstanceToTree(Instance inst, KDTreeNode node)
throws Exception {
if (node.isALeaf()) {
int instList[] = new int[m_Instances.numInstances()];
try {
System.arraycopy(m_InstList, 0, instList, 0, node.m_End + 1); // m_InstList.squeezeIn(m_End,
// index);
if (node.m_End < m_InstList.length - 1)
System.arraycopy(m_InstList, node.m_End + 1, instList,
node.m_End + 2, m_InstList.length - node.m_End - 1);
instList[node.m_End + 1] = m_Instances.numInstances() - 1;
} catch (ArrayIndexOutOfBoundsException ex) {
System.err.println("m_InstList.length: " + m_InstList.length
+ " instList.length: " + instList.length + "node.m_End+1: "
+ (node.m_End + 1) + "m_InstList.length-node.m_End+1: "
+ (m_InstList.length - node.m_End - 1));
throw ex;
}
m_InstList = instList;
node.m_End++;
node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst,
node.m_NodeRanges);
m_Splitter.setInstanceList(m_InstList);
// split this leaf node if necessary
double[][] universe = m_EuclideanDistance.getRanges();
if (node.numInstances() > m_MaxInstInLeaf
&& getMaxRelativeNodeWidth(node.m_NodeRanges, universe) > m_MinBoxRelWidth) {
m_Splitter.splitNode(node, m_NumNodes, node.m_NodeRanges, universe);
m_NumNodes += 2;
}
}// end if node is a leaf
else {
if (m_EuclideanDistance.valueIsSmallerEqual(inst, node.m_SplitDim,
node.m_SplitValue)) {
addInstanceToTree(inst, node.m_Left);
afterAddInstance(node.m_Right);
} else
addInstanceToTree(inst, node.m_Right);
node.m_End++;
node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst,
node.m_NodeRanges);
}
}
/**
* Corrects the start and end indices of a
* KDTreeNode after an instance is added to
* the tree. The start and end indices for
* the master index array (m_InstList)
* stored in the nodes need to be updated
* for all nodes in the subtree on the
* right of a node where the instance
* was added.
* NOTE: No outside class should call this
* method.
*
* @param node KDTreeNode whose start and end indices
* need to be updated.
*/
protected void afterAddInstance(KDTreeNode node) {
node.m_Start++;
node.m_End++;
if (!node.isALeaf()) {
afterAddInstance(node.m_Left);
afterAddInstance(node.m_Right);
}
}
/**
* Adds one instance to KDTree loosly. It only changes the ranges in
* EuclideanDistance, and does not affect the structure of the KDTree.
*
* @param instance the new instance. Usually this is the test instance
* supplied to update the range of attributes in the distance function.
*/
public void addInstanceInfo(Instance instance) {
m_EuclideanDistance.updateRanges(instance);
}
/**
* Checks if there is any instance with missing values. Throws an exception if
* there is, as KDTree does not handle missing values.
*
* @param instances the instances to check
* @throws Exception if missing values are encountered
*/
protected void checkMissing(Instances instances) throws Exception {
for (int i = 0; i < instances.numInstances(); i++) {
Instance ins = instances.instance(i);
for (int j = 0; j < ins.numValues(); j++) {
if (ins.index(j) != ins.classIndex())
if (ins.isMissingSparse(j)) {
throw new Exception("ERROR: KDTree can not deal with missing "
+ "values. Please run ReplaceMissingValues filter "
+ "on the dataset before passing it on to the KDTree.");
}
}
}
}
/**
* Checks if there is any missing value in the given
* instance.
* @param ins The instance to check missing values in.
* @throws Exception If there is a missing value in the
* instance.
*/
protected void checkMissing(Instance ins) throws Exception {
for (int j = 0; j < ins.numValues(); j++) {
if (ins.index(j) != ins.classIndex())
if (ins.isMissingSparse(j)) {
throw new Exception("ERROR: KDTree can not deal with missing "
+ "values. Please run ReplaceMissingValues filter "
+ "on the dataset before passing it on to the KDTree.");
}
}
}
/**
* Returns the maximum attribute width of instances/points
* in a KDTreeNode relative to the whole dataset.
*
* @param nodeRanges The attribute ranges of the
* KDTreeNode whose maximum relative width is to be
* determined.
* @param universe The attribute ranges of the whole
* dataset (training instances + test instances so
* far encountered).
* @return The maximum relative width
*/
protected double getMaxRelativeNodeWidth(double[][] nodeRanges,
double[][] universe) {
int widest = widestDim(nodeRanges, universe);
if(widest < 0)
return 0.0;
else
return nodeRanges[widest][WIDTH] / universe[widest][WIDTH];
}
/**
* Returns the widest dimension/attribute in a
* KDTreeNode (widest after normalizing).
* @param nodeRanges The attribute ranges of
* the KDTreeNode.
* @param universe The attribute ranges of the
* whole dataset (training instances + test
* instances so far encountered).
* @return The index of the widest
* dimension/attribute.
*/
protected int widestDim(double[][] nodeRanges, double[][] universe) {
final int classIdx = m_Instances.classIndex();
double widest = 0.0;
int w = -1;
if (m_NormalizeNodeWidth) {
for (int i = 0; i < nodeRanges.length; i++) {
double newWidest = nodeRanges[i][WIDTH] / universe[i][WIDTH];
if (newWidest > widest) {
if (i == classIdx)
continue;
widest = newWidest;
w = i;
}
}
} else {
for (int i = 0; i < nodeRanges.length; i++) {
if (nodeRanges[i][WIDTH] > widest) {
if (i == classIdx)
continue;
widest = nodeRanges[i][WIDTH];
w = i;
}
}
}
return w;
}
/**
* Returns the size of the tree.
*
* @return the size of the tree
*/
public double measureTreeSize() {
return m_NumNodes;
}
/**
* Returns the number of leaves.
*
* @return the number of leaves
*/
public double measureNumLeaves() {
return m_NumLeaves;
}
/**
* Returns the depth of the tree.
*
* @return The depth of the tree
*/
public double measureMaxDepth() {
return m_MaxDepth;
}
/**
* Returns an enumeration of the additional measure names.
*
* @return an enumeration of the measure names
*/
public Enumeration enumerateMeasures() {
Vector newVector = new Vector();
newVector.addElement("measureTreeSize");
newVector.addElement("measureNumLeaves");
newVector.addElement("measureMaxDepth");
if (m_Stats != null) {
newVector.addAll(Collections.list(m_Stats.enumerateMeasures()));
}
return newVector.elements();
}
/**
* Returns the value of the named measure.
*
* @param additionalMeasureName the name of
* the measure to query for its value.
* @return The value of the named measure
* @throws IllegalArgumentException If the named measure
* is not supported.
*/
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.compareToIgnoreCase("measureMaxDepth") == 0) {
return measureMaxDepth();
} else if (additionalMeasureName.compareToIgnoreCase("measureTreeSize") == 0) {
return measureTreeSize();
} else if (additionalMeasureName.compareToIgnoreCase("measureNumLeaves") == 0) {
return measureNumLeaves();
} else if (m_Stats != null) {
return m_Stats.getMeasure(additionalMeasureName);
} else {
throw new IllegalArgumentException(additionalMeasureName
+ " not supported (KDTree)");
}
}
/**
* Sets whether to calculate the performance statistics or not.
* @param measurePerformance Should be true if performance
* statistics are to be measured.
*/
public void setMeasurePerformance(boolean measurePerformance) {
m_MeasurePerformance = measurePerformance;
if (m_MeasurePerformance) {
if (m_Stats == null)
m_Stats = m_TreeStats = new TreePerformanceStats();
} else
m_Stats = m_TreeStats = null;
}
/**
* Assigns instances to centers using KDTree.
*
* @param centers the current centers
* @param assignments the centerindex for each instance
* @param pc the threshold value for pruning.
* @throws Exception If there is some problem
* assigning instances to centers.
*/
public void centerInstances(Instances centers, int[] assignments, double pc)
throws Exception {
int[] centList = new int[centers.numInstances()];
for (int i = 0; i < centers.numInstances(); i++)
centList[i] = i;
determineAssignments(m_Root, centers, centList, assignments, pc);
}
/**
* Assigns instances to the current centers called candidates.
*
* @param node The node to start assigning the instances from.
* @param centers all the current centers.
* @param candidates the current centers the method works on.
* @param assignments the center index for each instance.
* @param pc the threshold value for pruning.
* @throws Exception If there is some problem assigning
* instances to centers.
*/
protected void determineAssignments(KDTreeNode node, Instances centers,
int[] candidates, int[] assignments, double pc) throws Exception {
// reduce number of owners for current hyper rectangle
int[] owners = refineOwners(node, centers, candidates);
// only one owner
if (owners.length == 1) {
// all instances of this node are owned by one center
for (int i = node.m_Start; i <= node.m_End; i++) {
assignments[m_InstList[i]] // the assignment of this instance
= owners[0]; // is the current owner
}
} else if (!node.isALeaf()) {
// more than one owner and it is not a leaf
determineAssignments(node.m_Left, centers, owners, assignments, pc);
determineAssignments(node.m_Right, centers, owners, assignments, pc);
} else {
// this is a leaf and there are more than 1 owner
// XMeans.
assignSubToCenters(node, centers, owners, assignments);
}
}
/**
* Refines the ownerlist.
*
* @param node The current tree node.
* @param centers all centers
* @param candidates the indexes of those centers that are candidates.
* @return list of owners
* @throws Exception If some problem occurs in refining.
*/
protected int[] refineOwners(KDTreeNode node, Instances centers,
int[] candidates) throws Exception {
int[] owners = new int[candidates.length];
double minDistance = Double.POSITIVE_INFINITY;
int ownerIndex = -1;
Instance owner;
int numCand = candidates.length;
double[] distance = new double[numCand];
boolean[] inside = new boolean[numCand];
for (int i = 0; i < numCand; i++) {
distance[i] = distanceToHrect(node, centers.instance(candidates[i]));
inside[i] = (distance[i] == 0.0);
if (distance[i] < minDistance) {
minDistance = distance[i];
ownerIndex = i;
}
}
owner = (Instance)centers.instance(candidates[ownerIndex]).copy();
// are there other owners
// loop also goes over already found owner, keeps order
// in owner list
int index = 0;
for (int i = 0; i < numCand; i++) {
// 1. all centers that are points within rectangle are owners
if ((inside[i])
// 2. take all points with same distance to the rect. as the owner
|| (distance[i] == distance[ownerIndex])) {
// add competitor to owners list
owners[index++] = candidates[i];
} else {
Instance competitor = (Instance)centers.instance(candidates[i]).copy();
if
// 3. point has larger distance to rectangle but still can compete
// with owner for some points in the rectangle
(!candidateIsFullOwner(node, owner, competitor))
{
// also add competitor to owners list
owners[index++] = candidates[i];
}
}
}
int[] result = new int[index];
for (int i = 0; i < index; i++)
result[i] = owners[i];
return result;
}
/**
* Returns the distance between a point and an hyperrectangle.
*
* @param node The current node from whose hyperrectangle
* the distance is to be measured.
* @param x the point
* @return the distance
* @throws Exception If some problem occurs in determining
* the distance to the hyperrectangle.
*/
protected double distanceToHrect(KDTreeNode node, Instance x) throws Exception {
double distance = 0.0;
Instance closestPoint = (Instance)x.copy();
boolean inside;
inside = clipToInsideHrect(node, closestPoint);
if (!inside)
distance = m_EuclideanDistance.distance(closestPoint, x);
return distance;
}
/**
* Finds the closest point in the hyper rectangle to a given point. Change the
* given point to this closest point by clipping of at all the dimensions to
* be clipped of. If the point is inside the rectangle it stays unchanged. The
* return value is true if the point was not changed, so the the return value
* is true if the point was inside the rectangle.
*
* @param node The current KDTreeNode in whose hyperrectangle the closest
* point is to be found.
* @param x a point
* @return true if the input point stayed unchanged.
*/
protected boolean clipToInsideHrect(KDTreeNode node, Instance x) {
boolean inside = true;
for (int i = 0; i < m_Instances.numAttributes(); i++) {
// TODO treat nominals differently!??
if (x.value(i) < node.m_NodeRanges[i][MIN]) {
x.setValue(i, node.m_NodeRanges[i][MIN]);
inside = false;
} else if (x.value(i) > node.m_NodeRanges[i][MAX]) {
x.setValue(i, node.m_NodeRanges[i][MAX]);
inside = false;
}
}
return inside;
}
/**
* Returns true if candidate is a full owner in respect to a competitor.
*
*
* The candidate has been the closer point to the current rectangle or even
* has been a point within the rectangle. The competitor is competing with the
* candidate for a few points out of the rectangle although it is a point
* further away from the rectangle then the candidate. The extrem point is the
* corner of the rectangle that is furthest away from the candidate towards
* the direction of the competitor.
*
* If the distance candidate to this extreme point is smaller then the
* distance competitor to this extreme point, then it is proven that none of
* the points in the rectangle can be owned be the competitor and the
* candidate is full owner of the rectangle in respect to this competitor. See
* also D. Pelleg and A. Moore's paper 'Accelerating exact k-means Algorithms
* with Geometric Reasoning'.
*
*
* @param node The current KDTreeNode / hyperrectangle.
* @param candidate instance that is candidate to be owner
* @param competitor instance that competes against the candidate
* @return true if candidate is full owner
* @throws Exception If some problem occurs.
*/
protected boolean candidateIsFullOwner(KDTreeNode node, Instance candidate,
Instance competitor) throws Exception {
// get extreme point
double[] extreme = new double[m_Instances.numAttributes()];
for (int i = 0; i < m_Instances.numAttributes(); i++) {
if ((competitor.value(i) - candidate.value(i)) > 0) {
extreme[i] = node.m_NodeRanges[i][MAX];
} else {
extreme[i] = node.m_NodeRanges[i][MIN];
}
}
Instance extremeI = candidate.copy(extreme);
boolean isFullOwner = m_EuclideanDistance.distance(extremeI, candidate) < m_EuclideanDistance
.distance(extremeI, competitor);
return isFullOwner;
}
/**
* Assigns instances of this node to center. Center to be assign to is decided
* by the distance function.
*
* @param node The KDTreeNode whose instances are to be assigned.
* @param centers all the input centers
* @param centList the list of centers to work with
* @param assignments index list of last assignments
* @throws Exception If there is error assigning the instances.
*/
public void assignSubToCenters(KDTreeNode node, Instances centers,
int[] centList, int[] assignments) throws Exception {
// todo: undecided situations
// WARNING: assignments is "input/output-parameter"
// should not be null and the following should not happen
if (assignments == null) {
assignments = new int[m_Instances.numInstances()];
for (int i = 0; i < assignments.length; i++) {
assignments[i] = -1;
}
}
// set assignments for all instances of this node
for (int i = node.m_Start; i <= node.m_End; i++) {
int instIndex = m_InstList[i];
Instance inst = m_Instances.instance(instIndex);
// if (instList[i] == 664) System.out.println("664***");
int newC = m_EuclideanDistance.closestPoint(inst, centers, centList);
// int newC = clusterProcessedInstance(inst, centers);
assignments[instIndex] = newC;
}
}
/**
* Properties' variables =====================================================
*/
/** flag for normalizing. */
boolean m_NormalizeNodeWidth = true;
/** The euclidean distance function to use. */
protected EuclideanDistance m_EuclideanDistance;
{ // to make sure we have only one object of EuclideanDistance
if (m_DistanceFunction instanceof EuclideanDistance)
m_EuclideanDistance = (EuclideanDistance) m_DistanceFunction;
else
m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance();
}
/** minimal relative width of a KDTree rectangle. */
protected double m_MinBoxRelWidth = 1.0E-2;
/** maximal number of instances in a leaf. */
protected int m_MaxInstInLeaf = 40;
/**
* the GET and SET - functions ===============================================
*/
/**
* Tip text for this property.
*
* @return the tip text for this property
*/
public String minBoxRelWidthTipText() {
return "The minimum relative width of the box. A node is only made a leaf "
+ "if the width of the split dimension of the instances in a node "
+ "normalized over the width of the split dimension of all the "
+ "instances is less than or equal to this minimum relative width.";
}
/**
* Sets the minimum relative box width.
*
* @param i the minimum relative box width
*/
public void setMinBoxRelWidth(double i) {
m_MinBoxRelWidth = i;
}
/**
* Gets the minimum relative box width.
*
* @return the minimum relative box width
*/
public double getMinBoxRelWidth() {
return m_MinBoxRelWidth;
}
/**
* Tip text for this property.
*
* @return the tip text for this property
*/
public String maxInstInLeafTipText() {
return "The max number of instances in a leaf.";
}
/**
* Sets the maximum number of instances in a leaf.
*
* @param i the maximum number of instances in a leaf
*/
public void setMaxInstInLeaf(int i) {
m_MaxInstInLeaf = i;
}
/**
* Get the maximum number of instances in a leaf.
*
* @return the maximum number of instances in a leaf
*/
public int getMaxInstInLeaf() {
return m_MaxInstInLeaf;
}
/**
* Tip text for this property.
*
* @return the tip text for this property
*/
public String normalizeNodeWidthTipText() {
return "Whether if the widths of the KDTree node should be normalized "
+ "by the width of the universe or not. "
+ "Where, width of the node is the range of the split attribute "
+ "based on the instances in that node, and width of the "
+ "universe is the range of the split attribute based on all the "
+ "instances (default: false).";
}
/**
* Sets the flag for normalizing the widths of a KDTree Node by the width of
* the dimension in the universe.
*
* @param n true to use normalizing.
*/
public void setNormalizeNodeWidth(boolean n) {
m_NormalizeNodeWidth = n;
}
/**
* Gets the normalize flag.
*
* @return True if normalizing
*/
public boolean getNormalizeNodeWidth() {
return m_NormalizeNodeWidth;
}
/**
* returns the distance function currently in use.
*
* @return the distance function
*/
public DistanceFunction getDistanceFunction() {
return (DistanceFunction) m_EuclideanDistance;
}
/**
* sets the distance function to use for nearest neighbour search.
*
* @param df the distance function to use
* @throws Exception if not EuclideanDistance
*/
public void setDistanceFunction(DistanceFunction df) throws Exception {
if (!(df instanceof EuclideanDistance))
throw new Exception("KDTree currently only works with "
+ "EuclideanDistanceFunction.");
m_DistanceFunction = m_EuclideanDistance = (EuclideanDistance) df;
}
/**
* Returns the tip text for this property.
*
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String nodeSplitterTipText() {
return "The the splitting method to split the nodes of the KDTree.";
}
/**
* Returns the splitting method currently in use to split the nodes of the
* KDTree.
*
* @return The KDTreeNodeSplitter currently in use.
*/
public KDTreeNodeSplitter getNodeSplitter() {
return m_Splitter;
}
/**
* Sets the splitting method to use to split the nodes of the KDTree.
*
* @param splitter The KDTreeNodeSplitter to use.
*/
public void setNodeSplitter(KDTreeNodeSplitter splitter) {
m_Splitter = splitter;
}
/**
* Returns a string describing this nearest neighbour search algorithm.
*
* @return a description of the algorithm for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return
"Class implementing the KDTree search algorithm for nearest "
+ "neighbour search.\n"
+ "The connection to dataset is only a reference. For the tree "
+ "structure the indexes are stored in an array. \n"
+ "Building the tree:\n"
+ "If a node has (option -L) instances no "
+ "further splitting is done. Also if the split would leave one "
+ "side empty, the branch is not split any further even if the "
+ "instances in the resulting node are more than "
+ " instances.\n"
+ "**PLEASE NOTE:** The algorithm can not handle missing values, so it "
+ "is advisable to run ReplaceMissingValues filter if there are any "
+ "missing values in the dataset.\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
public Enumeration