All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.filters.unsupervised.attribute.AddCluster Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

The newest version!
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    AddCluster.java
 *    Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.attribute;

import weka.clusterers.AbstractClusterer;
import weka.clusterers.Clusterer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.core.WeightedAttributesHandler;
import weka.core.WeightedInstancesHandler;
import weka.core.WekaException;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;

/**
 *  A filter that adds a new nominal attribute
 * representing the cluster assigned to each instance by the specified
 * clustering algorithm.
* Either the clustering algorithm gets built with the first batch of data or * one specifies are serialized clusterer model file to use instead. *

* * * Valid options are: *

* *

 * -W <clusterer specification>
 *  Full class name of clusterer to use, followed
 *  by scheme options. eg:
 *   "weka.clusterers.SimpleKMeans -N 3"
 *  (default: weka.clusterers.SimpleKMeans)
 * 
* *
 * -serialized <file>
 *  Instead of building a clusterer on the data, one can also provide
 *  a serialized model and use that for adding the clusters.
 * 
* *
 * -I <att1,att2-att4,...>
 *  The range of attributes the clusterer should ignore.
 * 
* * * * @author Richard Kirkby ([email protected]) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 15203 $ */ public class AddCluster extends Filter implements UnsupervisedFilter, OptionHandler, WeightedAttributesHandler, WeightedInstancesHandler { /** for serialization. */ static final long serialVersionUID = 7414280611943807337L; /** The clusterer used to do the cleansing. */ protected Clusterer m_Clusterer = new weka.clusterers.SimpleKMeans(); /** The file from which to load a serialized clusterer. */ protected File m_SerializedClustererFile = new File( System.getProperty("user.dir")); /** The actual clusterer used to do the clustering. */ protected Clusterer m_ActualClusterer = null; /** Range of attributes to ignore. */ protected Range m_IgnoreAttributesRange = null; /** Filter for removing attributes. */ protected Filter m_removeAttributes = new Remove(); /** * Returns the Capabilities of this filter, makes sure that the class is never * set (for the clusterer). * * @param data the data to use for customization * @return the capabilities of this object, based on the data * @see #getCapabilities() */ @Override public Capabilities getCapabilities(Instances data) { Instances newData; newData = new Instances(data, 0); newData.setClassIndex(-1); return super.getCapabilities(newData); } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ @Override public Capabilities getCapabilities() { Capabilities result = m_Clusterer.getCapabilities(); result.enableAllClasses(); result.setMinimumNumberInstances(0); return result; } /** * tests the data whether the filter can actually handle it. * * @param instanceInfo the data to test * @throws Exception if the test fails */ @Override protected void testInputFormat(Instances instanceInfo) throws Exception { getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo)); } /** * Sets the format of the input instances. 
* * @param instanceInfo an Instances object containing the input instance * structure (any instances contained in the object are ignored - * only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the inputFormat can't be set successfully */ @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_removeAttributes = null; return false; } /** * filters all attributes that should be ignored. * * @param data the data to filter * @return the filtered data * @throws Exception if filtering fails */ protected Instances removeIgnored(Instances data) throws Exception { Instances result = data; if (m_IgnoreAttributesRange != null || data.classIndex() >= 0) { m_removeAttributes = new Remove(); String rangeString = ""; if (m_IgnoreAttributesRange != null) { rangeString += m_IgnoreAttributesRange.getRanges(); } if (data.classIndex() >= 0) { if (rangeString.length() > 0) { rangeString += "," + (data.classIndex() + 1); } else { rangeString = "" + (data.classIndex() + 1); } } ((Remove) m_removeAttributes).setAttributeIndices(rangeString); ((Remove) m_removeAttributes).setInvertSelection(false); m_removeAttributes.setInputFormat(data); result = Filter.useFilter(data, m_removeAttributes); } return result; } /** * Signify that this batch of input to the filter is finished. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ @Override public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } Instances toFilter = getInputFormat(); if (!isFirstBatchDone()) { // serialized model or build clusterer from scratch? 
File file = getSerializedClustererFile(); if (!file.isDirectory()) { int[] attsToIgnore = null; ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file)); m_ActualClusterer = (Clusterer) ois.readObject(); Instances header = null; // let's see whether there's an Instances header stored as well try { header = (Instances) ois.readObject(); // ignored atts attsToIgnore = (int[]) ois.readObject(); } catch (Exception e) { // ignored } ois.close(); if (attsToIgnore != null && attsToIgnore.length > 0) { m_removeAttributes = new Remove(); ((Remove) m_removeAttributes).setAttributeIndicesArray(attsToIgnore); ((Remove) m_removeAttributes).setInvertSelection(false); m_removeAttributes.setInputFormat(toFilter); } // same dataset format? if ((header != null) && (!header.equalHeaders(toFilter))) { throw new WekaException( "Training header of clusterer and filter dataset don't match:\n" + header.equalHeadersMsg(toFilter)); } } else { // filter out attributes if necessary Instances toFilterIgnoringAttributes = removeIgnored(toFilter); m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer); m_ActualClusterer.buildClusterer(toFilterIgnoringAttributes); } // create output dataset with new attribute Instances filtered = new Instances(toFilter, 0); ArrayList nominal_values = new ArrayList( m_ActualClusterer.numberOfClusters()); for (int i = 0; i < m_ActualClusterer.numberOfClusters(); i++) { nominal_values.add("cluster" + (i + 1)); } filtered.insertAttributeAt(new Attribute("cluster", nominal_values), filtered.numAttributes()); setOutputFormat(filtered); } // build new dataset for (int i = 0; i < toFilter.numInstances(); i++) { convertInstance(toFilter.instance(i)); } flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Input an instance for filtering. Ordinarily the instance is processed and * made available for output immediately. Some filters require all instances * be read before producing output. 
* * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input format has been defined. */ @Override public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (outputFormatPeek() != null) { convertInstance(instance); return true; } bufferInput(instance); return false; } /** * Convert a single instance over. The converted instance is added to the end * of the output queue. * * @param instance the instance to convert * @throws Exception if something goes wrong */ protected void convertInstance(Instance instance) throws Exception { Instance original, processed; original = instance; // copy values double[] instanceVals = new double[instance.numAttributes() + 1]; for (int j = 0; j < instance.numAttributes(); j++) { instanceVals[j] = original.value(j); } Instance filteredI = null; if (m_removeAttributes != null) { m_removeAttributes.input(instance); filteredI = m_removeAttributes.output(); } else { filteredI = instance; } // add cluster to end try { instanceVals[instance.numAttributes()] = m_ActualClusterer .clusterInstance(filteredI); } catch (Exception e) { // clusterer couldn't cluster instance -> missing instanceVals[instance.numAttributes()] = Utils.missingValue(); } // create new instance if (original instanceof SparseInstance) { processed = new SparseInstance(original.weight(), instanceVals); } else { processed = new DenseInstance(original.weight(), instanceVals); } copyValues(processed, false, instance.dataset(), outputFormatPeek()); push(processed); // No need to copy instance } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy