All Downloads are FREE. Search and download functionalities are using the official Maven repository.

moa.cluster.CFCluster Maven / Gradle / Ivy

Go to download

Massive On-line Analysis is an environment for massive data mining. MOA provides a framework for data stream mining and includes tools for evaluation and a collection of machine learning algorithms. It is related to the WEKA project and is also written in Java, while scaling to more demanding problems.

There is a newer version: 2024.07.0
Show newest version
/*
 *    CFCluster.java
 *    Copyright (C) 2010 RWTH Aachen University, Germany
 *    @author Jansen ([email protected])
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 *    
 *    
 */

package moa.cluster;
import java.util.Arrays;
import com.yahoo.labs.samoa.instances.Instance;

/* Micro-cluster, as defined by Aggarwal et al., "On Clustering Massive Data Streams: A Summarization Paradigm",
 * in the book "Data Streams: Models and Algorithms", by Charu C. Aggarwal
 *  @article{
	title = {Data Streams: Models and Algorithms},
	author = {Aggarwal, Charu C.},
	year = {2007},
	publisher = {Springer Science+Business Media, LLC},
	url = {http://ebooks.ulb.tu-darmstadt.de/11157/},
	institution = {eBooks [http://ebooks.ulb.tu-darmstadt.de/perl/oai2] (Germany)},
}

DEFINITION: A micro-cluster for a set of d-dimensional points X_{i_1}, ..., X_{i_n}
with time stamps T_{i_1}, ..., T_{i_n} is the (2 \cdot d + 3)-tuple (CF2^x, CF1^x, CF2^t, CF1^t, n),
wherein CF2^x and CF1^x each correspond to a vector of d entries. The definition of each of these entries is as follows:

o For each dimension, the sum of the squares of the data values is maintained
in CF2^x. Thus, CF2^x contains d values. The p-th entry of CF2^x is equal to
\sum_{j=1}^{n} (x_{i_j}^p)^2

o For each dimension, the sum of the data values is maintained in CF1^x.
Thus, CF1^x contains d values. The p-th entry of CF1^x is equal to
\sum_{j=1}^{n} x_{i_j}^p

o The sum of the squares of the time stamps T_{i_1}, ..., T_{i_n} is maintained in CF2^t.

o The sum of the time stamps T_{i_1}, ..., T_{i_n} is maintained in CF1^t.

o The number of data points is maintained in n.

 */
/**
 * Abstract base class for cluster-feature (CF) micro-clusters. It maintains
 * three summary statistics of the points absorbed so far: the count
 * {@link #N}, the per-dimension linear sum {@link #LS} and the per-dimension
 * squared sum {@link #SS}. Subclasses supply the radius, the inclusion
 * probability and {@link #getCF()}.
 */
public abstract class CFCluster extends SphereCluster {

	private static final long serialVersionUID = 1L;

	/**
	 * Not read anywhere in this class; presumably a multiplier that subclasses
	 * apply when computing their radius (verify against the subclasses).
	 */
	protected double radiusFactor = 1.8;

	/**
	 * Number of points in the cluster. Stored as a double; it can be
	 * overwritten with an arbitrary value through {@link #setN(double)}.
	 */
	protected double N;
	/**
	 * Linear sum of all the points added to the cluster, one entry per
	 * dimension.
	 */
	public double[] LS;
	/**
	 * Squared sum of all the points added to the cluster, one entry per
	 * dimension.
	 */
	public double[] SS;

	/**
	 * Instantiates a kernel that contains exactly one point, taken from the
	 * given instance via {@code instance.toDoubleArray()}.
	 *
	 * @param instance The instance whose values form the first point.
	 * @param dimensions The number of dimensions for which squared sums are
	 * kept.
	 */
	public CFCluster(Instance instance, int dimensions) {
		this(instance.toDoubleArray(), dimensions);
	}

	/**
	 * Instantiates an empty kernel with the given dimensionality.
	 *
	 * @param dimensions The number of dimensions of the points that can be in
	 * this kernel.
	 */
	protected CFCluster(int dimensions) {
		this.N = 0;
		// Newly allocated double arrays are already zero-filled by the JVM.
		this.LS = new double[dimensions];
		this.SS = new double[dimensions];
	}

	/**
	 * Instantiates a kernel that contains exactly one point.
	 * <p>
	 * The given array is copied, so later updates of this kernel (for example
	 * through {@link #add(CFCluster)}) never modify the caller's array.
	 *
	 * @param center The coordinates of the first point. The linear sum keeps
	 * all of its entries, even if there are more than {@code dimensions}.
	 * @param dimensions The length of the squared-sum vector; must not exceed
	 * {@code center.length}.
	 * @throws NullPointerException if {@code center} is {@code null}.
	 * @throws ArrayIndexOutOfBoundsException if {@code dimensions} is larger
	 * than {@code center.length}.
	 */
	public CFCluster(double[] center, int dimensions) {
		this.N = 1;
		// Defensive copy: LS is updated in place by add(), which would
		// otherwise write into the array owned by the caller.
		this.LS = Arrays.copyOf(center, center.length);
		this.SS = new double[dimensions];
		for (int i = 0; i < SS.length; i++) {
			SS[i] = Math.pow(center[i], 2);
		}
	}

	/**
	 * Copy constructor. The count is copied and both sum vectors are
	 * duplicated, so the new kernel shares no arrays with the given one.
	 *
	 * @param cluster The kernel to copy.
	 */
	public CFCluster(CFCluster cluster) {
		this.N = cluster.N;
		this.LS = Arrays.copyOf(cluster.LS, cluster.LS.length);
		this.SS = Arrays.copyOf(cluster.SS, cluster.SS.length);
	}

	/**
	 * Merges the statistics of another kernel into this one: the counts are
	 * summed and the linear and squared sums are added element by element.
	 * The given kernel is not modified.
	 *
	 * @param cluster The kernel whose statistics are added to this kernel.
	 */
	public void add(CFCluster cluster) {
		this.N += cluster.N;
		addVectors(this.LS, cluster.LS);
		addVectors(this.SS, cluster.SS);
	}

	/**
	 * @return A cluster feature for this kernel; the exact contract is
	 * defined by the implementing subclass.
	 */
	public abstract CFCluster getCF();

	/**
	 * Computes the centroid as the linear sum divided by the number of points.
	 * A new array is returned on every call.
	 *
	 * @return this kernel's center
	 */
	@Override
	public double[] getCenter() {
		assert (this.N > 0);
		double[] res = new double[this.LS.length];
		for (int i = 0; i < res.length; i++) {
			res[i] = this.LS[i] / N;
		}
		return res;
	}

	/**
	 * See interface Cluster
	 * @param instance The instance to evaluate.
	 * @return The inclusion probability of the instance, as defined by the
	 * implementing subclass.
	 */
	@Override
	public abstract double getInclusionProbability(Instance instance);

	/**
	 * See interface Cluster
	 * @return The radius of the cluster.
	 */
	@Override
	public abstract double getRadius();

	/**
	 * See interface Cluster
	 * @return The weight, which is the number of points {@code N}.
	 * @see Cluster#getWeight()
	 */
	@Override
	public double getWeight() {
		return N;
	}

	/**
	 * Overwrites the number of points; the sum vectors are left untouched.
	 *
	 * @param N The new number of points.
	 */
	public void setN(double N) {
		this.N = N;
	}

	/**
	 * @return The number of points in the cluster.
	 */
	public double getN() {
		return N;
	}

	/**
	 * Adds the second array to the first array element by element. The arrays
	 * must have the same length; this is only checked when assertions are
	 * enabled.
	 * @param a1 Vector to which the second vector is added.
	 * @param a2 Vector to be added. This vector does not change.
	 */
	public static void addVectors(double[] a1, double[] a2) {
		assert (a1 != null);
		assert (a2 != null);
		assert (a1.length == a2.length) : "Adding two arrays of different "
			+ "length";

		for (int i = 0; i < a1.length; i++) {
			a1[i] += a2[i];
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy