
moa.classifiers.lazy.neighboursearch.kdtrees.KMeansInpiredMethod Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of moa Show documentation
Show all versions of moa Show documentation
Massive On-line Analysis is an environment for massive data mining. MOA
provides a framework for data stream mining and includes tools for evaluation
and a collection of machine learning algorithms. Related to the WEKA project,
also written in Java, while scaling to more demanding problems.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* KMeansInpiredMethod.java
* Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
*/
package moa.classifiers.lazy.neighboursearch.kdtrees;
import com.yahoo.labs.samoa.instances.Instance;
import com.yahoo.labs.samoa.instances.Instances;
/**
 * The class that splits a node into two such that the overall sum of squared
 * distances of points to their centres on both sides of the (axis-parallel)
 * splitting plane is minimum.
 * <p>
 * For more information see also:
 * <p>
 * Ashraf Masood Kibriya (2007). Fast Algorithms for Nearest Neighbour Search.
 * Hamilton, New Zealand.
 * <p>
 * BibTeX:
 * <pre>
 * &#64;mastersthesis{Kibriya2007,
 *    address = {Hamilton, New Zealand},
 *    author = {Ashraf Masood Kibriya},
 *    school = {Department of Computer Science, School of Computing and Mathematical Sciences, University of Waikato},
 *    title = {Fast Algorithms for Nearest Neighbour Search},
 *    year = {2007}
 * }
 * </pre>
 *
 * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz)
 * @version $Revision: 8034 $
 */
public class KMeansInpiredMethod
    extends KDTreeNodeSplitter {

  /** for serialization. */
  private static final long serialVersionUID = -866783749124714304L;

  /**
   * Returns a string describing this nearest neighbour search algorithm.
   *
   * @return a description of the algorithm for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "The class that splits a node into two such that the overall sum "
      + "of squared distances of points to their centres on both sides "
      + "of the (axis-parallel) splitting plane is minimum.\n\n"
      + "For more information see also:\n\n"
      // The text above announces a reference, so actually name it.
      + "Ashraf Masood Kibriya (2007). Fast Algorithms for Nearest "
      + "Neighbour Search. Hamilton, New Zealand.";
  }

  /**
   * Splits a node into two such that the overall sum of squared distances
   * of points to their centres on both sides of the (axis-parallel)
   * splitting plane is minimum. On success {@code node.m_SplitDim},
   * {@code node.m_SplitValue}, {@code node.m_Left} and {@code node.m_Right}
   * are set.
   * <p>
   * For every non-class dimension with non-zero width the node's portion of
   * the master index array is sorted along that dimension, and every
   * boundary between two consecutive points is evaluated as a candidate
   * split. Note that this re-orders the node's portion of the master index
   * array.
   *
   * @param node            The node to split.
   * @param numNodesCreated The number of nodes that so far have been
   *                        created for the tree, so that the newly created
   *                        nodes are assigned correct/meaningful node
   *                        numbers/ids.
   * @param nodeRanges      The attributes' range for the points inside
   *                        the node that is to be split. Not read by this
   *                        implementation, which uses
   *                        {@code node.m_NodeRanges} instead.
   * @param universe        The attributes' range for the whole
   *                        point-space; only read when node-width
   *                        normalization is enabled.
   * @throws Exception      If no split could be determined, or if the
   *                        chosen split leaves one of the children empty.
   */
  public void splitNode(KDTreeNode node, int numNodesCreated,
      double[][] nodeRanges, double[][] universe) throws Exception {

    correctlyInitialized();

    final int numAtts = m_Instances.numAttributes();
    final int classIdx = m_Instances.classIndex();

    int splitDim = -1;
    double splitVal = Double.NEGATIVE_INFINITY;
    double minSum = Double.POSITIVE_INFINITY;

    // Per-attribute running sums / sums of squares of the points currently
    // assigned to the left and to the right of the candidate split.
    final double[] leftAttSum = new double[numAtts];
    final double[] rightAttSum = new double[numAtts];
    final double[] leftAttSqSum = new double[numAtts];
    final double[] rightAttSqSum = new double[numAtts];

    for (int dim = 0; dim < numAtts; dim++) {
      // Original author's note: m_MaxRelativeWidth in KDTree ensures there
      // will be at least one dim with width > 0.0.
      if (node.m_NodeRanges[dim][WIDTH] == 0.0 || dim == classIdx) {
        continue;
      }

      quickSort(m_Instances, m_InstList, dim, node.m_Start, node.m_End);

      // Start with every point on the right-hand side.
      for (int j = 0; j < numAtts; j++) {
        leftAttSum[j] = 0.0;
        rightAttSum[j] = 0.0;
        leftAttSqSum[j] = 0.0;
        rightAttSqSum[j] = 0.0;
      }
      for (int i = node.m_Start; i <= node.m_End; i++) {
        final Instance inst = m_Instances.instance(m_InstList[i]);
        for (int j = 0; j < numAtts; j++) {
          if (j == classIdx) {
            continue;
          }
          final double val = nodeValue(inst, j, universe);
          rightAttSum[j] += val;
          rightAttSqSum[j] += val * val;
        }
      }

      // Move the points one by one (in sorted order) to the left-hand side
      // and score the split that falls between point i and point i + 1.
      for (int i = node.m_Start; i < node.m_End; i++) {
        final Instance inst = m_Instances.instance(m_InstList[i]);
        final int leftCount = i - node.m_Start + 1;
        final int rightCount = node.m_End - i;
        double leftSqSum = 0.0;
        double rightSqSum = 0.0;

        for (int j = 0; j < numAtts; j++) {
          if (j == classIdx) {
            continue;
          }
          final double val = nodeValue(inst, j, universe);

          leftAttSum[j] += val;
          rightAttSum[j] -= val;
          leftAttSqSum[j] += val * val;
          rightAttSqSum[j] -= val * val;

          // sum((x - mean)^2) == sum(x^2) - n * mean^2
          final double leftMean = leftAttSum[j] / leftCount;
          final double rightMean = rightAttSum[j] / rightCount;
          leftSqSum += leftAttSqSum[j] - leftCount * (leftMean * leftMean);
          rightSqSum += rightAttSqSum[j] - rightCount * (rightMean * rightMean);
        }

        if (minSum > (leftSqSum + rightSqSum)) {
          minSum = leftSqSum + rightSqSum;
          // i < node.m_End always holds here, so point i + 1 exists; the
          // split value is the midpoint of the two neighbouring raw values.
          splitVal = (m_Instances.instance(m_InstList[i]).value(dim)
              + m_Instances.instance(m_InstList[i + 1]).value(dim)) / 2;
          splitDim = dim;
        }
      } // end for instance i
    } // end for attribute dim

    if (splitDim < 0) {
      // No candidate was ever accepted (e.g. fewer than two points, every
      // dimension has zero width, or the sums were NaN). Fail explicitly
      // instead of using -1 as an attribute/array index below.
      throw new Exception("Could not determine a split for node "
          + node.m_NodeNumber + " (node.m_Start: " + node.m_Start
          + " node.m_End: " + node.m_End + "). Please check code.");
    }

    final int rightStart = rearrangePoints(m_InstList, node.m_Start,
        node.m_End, splitDim, splitVal);

    if (rightStart == node.m_Start || rightStart > node.m_End) {
      final String details = describeSplit(node, splitDim, splitVal);
      if (rightStart == node.m_Start) {
        throw new Exception("Left child is empty in node " + node.m_NodeNumber
            + ". Not possible with "
            + "KMeansInspiredMethod splitting method. Please check code. "
            + details);
      } else {
        throw new Exception("Right child is empty in node " + node.m_NodeNumber
            + ". Not possible with "
            + "KMeansInspiredMethod splitting method. Please check code. "
            + details);
      }
    }

    node.m_SplitDim = splitDim;
    node.m_SplitValue = splitVal;
    node.m_Left = new KDTreeNode(numNodesCreated + 1, node.m_Start,
        rightStart - 1, m_EuclideanDistance.initializeRanges(m_InstList,
            node.m_Start, rightStart - 1));
    node.m_Right = new KDTreeNode(numNodesCreated + 2, rightStart, node.m_End,
        m_EuclideanDistance.initializeRanges(m_InstList, rightStart,
            node.m_End));
  }

  /**
   * Returns the value of an instance for one attribute as it enters the
   * sum-of-squares computation: the raw value, or - when node-width
   * normalization is enabled - the value rescaled by the universe range
   * (0.0 if that range is undefined or has zero extent).
   *
   * @param inst     The instance to read the value from.
   * @param att      The attribute/dimension index.
   * @param universe The attributes' range for the whole point-space.
   * @return the (possibly normalized) attribute value
   */
  private double nodeValue(Instance inst, int att, double[][] universe) {
    final double raw = inst.value(att);
    if (!m_NormalizeNodeWidth) {
      return raw;
    }
    if (Double.isNaN(universe[att][MIN])
        || universe[att][MIN] == universe[att][MAX]) {
      return 0.0;
    }
    return (raw - universe[att][MIN]) / universe[att][WIDTH];
  }

  /**
   * Builds the diagnostic text attached to the "empty child" exceptions.
   *
   * @param node     The node that was being split.
   * @param splitDim The chosen split dimension (must be a valid index).
   * @param splitVal The chosen split value.
   * @return a one-line description of the node and the attempted split
   */
  private static String describeSplit(KDTreeNode node, int splitDim,
      double splitVal) {
    return "node.m_Start: " + node.m_Start + " node.m_End: "
        + node.m_End + " splitDim: " + splitDim + " splitVal: " + splitVal
        + " node.min: " + node.m_NodeRanges[splitDim][MIN] + " node.max: "
        + node.m_NodeRanges[splitDim][MAX] + " node.numInstances: "
        + node.numInstances();
  }

  /**
   * Partitions the instances around a pivot. Used by quicksort and
   * kthSmallestValue.
   *
   * @param insts  The instances on which the tree is (or is
   *               to be) built.
   * @param index  The master index array containing indices
   *               of the instances.
   * @param attidx The attribute/dimension based on which
   *               the instances should be partitioned.
   * @param l      The beginning index of the portion of master index
   *               array that should be partitioned.
   * @param r      The end index of the portion of master index array
   *               that should be partitioned.
   * @return the index of the middle element
   */
  protected static int partition(Instances insts, int[] index, int attidx, int l, int r) {
    // Unsigned shift keeps the midpoint correct even if l + r overflows int.
    double pivot = insts.instance(index[(l + r) >>> 1]).value(attidx);
    int help;

    while (l < r) {
      while ((insts.instance(index[l]).value(attidx) < pivot) && (l < r)) {
        l++;
      }
      while ((insts.instance(index[r]).value(attidx) > pivot) && (l < r)) {
        r--;
      }
      if (l < r) {
        help = index[l];
        index[l] = index[r];
        index[r] = help;
        l++;
        r--;
      }
    }
    if ((l == r) && (insts.instance(index[r]).value(attidx) > pivot)) {
      r--;
    }

    return r;
  }

  /**
   * Sorts the instances according to the given attribute/dimension.
   * The sorting is done on the master index array and not on the
   * actual instances object.
   *
   * @param insts   The instances on which the tree is (or is
   *                to be) built.
   * @param indices The master index array containing indices
   *                of the instances.
   * @param attidx  The dimension/attribute based on which
   *                the instances should be sorted.
   * @param left    The beginning index of the portion of the master
   *                index array that needs to be sorted.
   * @param right   The end index of the portion of the master index
   *                array that needs to be sorted.
   */
  protected static void quickSort(Instances insts, int[] indices, int attidx, int left, int right) {
    if (left < right) {
      int middle = partition(insts, indices, attidx, left, right);
      quickSort(insts, indices, attidx, left, middle);
      quickSort(insts, indices, attidx, middle + 1, right);
    }
  }

  /**
   * Method to validate the sorting done by quickSort(). Debugging aid; it is
   * not called from within this class.
   *
   * @param insts   The instances on which the tree is (or is
   *                to be) built.
   * @param indices The master index array containing indices
   *                of the instances.
   * @param attidx  The dimension/attribute based on which
   *                the instances should be sorted.
   * @param start   The start of the portion in master index
   *                array that needs to be sorted.
   * @param end     The end of the portion in master index
   *                array that needs to be sorted.
   * @throws Exception If the indices of the instances
   *                   are not in sorted order.
   */
  private static void checkSort(Instances insts, int[] indices, int attidx,
      int start, int end) throws Exception {
    for (int i = start + 1; i <= end; i++) {
      final double prev = insts.instance(indices[i - 1]).value(attidx);
      final double curr = insts.instance(indices[i]).value(attidx);
      if (prev > curr) {
        // Report the offending pair in the exception itself.
        throw new Exception("Indices not sorted correctly."
            + " value[i-1]: " + prev
            + " value[i]: " + curr
            + " indices[i-1]: " + indices[i - 1]
            + " indices[i]: " + indices[i]
            + " i: " + i);
      }
    }
  }

  /**
   * Re-arranges the indices array so that in the portion of the array
   * belonging to the node to be split, the points {@literal <=} to the splitVal
   * are on the left of the portion and those {@literal >} the splitVal are on the right.
   *
   * @param indices  The master index array.
   * @param startidx The beginning index of portion of indices that needs
   *                 re-arranging.
   * @param endidx   The end index of portion of indices that needs
   *                 re-arranging.
   * @param splitDim The split dimension/attribute.
   * @param splitVal The split value.
   * @return The startIdx of the points {@literal >} the splitVal (the points
   *         belonging to the right child of the node).
   */
  protected int rearrangePoints(int[] indices, final int startidx, final int endidx,
      final int splitDim, final double splitVal) {

    int tmp;
    // Index of the last slot known to hold a point <= splitVal.
    int left = startidx - 1;
    for (int i = startidx; i <= endidx; i++) {
      if (m_EuclideanDistance.valueIsSmallerEqual(m_Instances
          .instance(indices[i]), splitDim, splitVal)) {
        left++;
        tmp = indices[left];
        indices[left] = indices[i];
        indices[i] = tmp;
      } // end valueIsSmallerEqual
    } // endfor
    return left + 1;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy