/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CanopyMapTask.java
* Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
*
*/
package weka.distributed;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;
import weka.clusterers.Canopy;
import weka.clusterers.Clusterer;
import weka.clusterers.PreconstructedFilteredClusterer;
import weka.core.Attribute;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.NormalizableDistance;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.StreamableFilterHelper;
import weka.core.Utils;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NumericStats;
import weka.filters.Filter;
import weka.filters.PreconstructedFilter;
import weka.filters.StreamableFilter;
import weka.filters.unsupervised.attribute.PreconstructedMissingValuesReplacer;
import distributed.core.DistributedJobConfig;
/**
* Map task for building partial canopies
*
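* <p>
* Typical usage (a sketch; {@code options}, {@code headerWithSummary} and
* {@code trainingInstances} are placeholders for caller-supplied values):
*
* <pre>
* CanopyMapTask task = new CanopyMapTask();
* task.setOptions(options); // must be called before init()
* task.init(headerWithSummary);
* for (Instance inst : trainingInstances) {
*   task.update(inst);
* }
* task.updateFinished();
* Clusterer partial = task.getFinalizedClusterer();
* </pre>
*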
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 11431 $
*/
public class CanopyMapTask implements OptionHandler, EnvironmentHandler,
Serializable {
/** For serialization */
private static final long serialVersionUID = 5107020708019202338L;
/** Environment variables */
protected transient Environment m_env;
/** Training data header */
protected Instances m_header;
/** Canopy clusterer */
protected Canopy m_canopy;
/** The list of filters to use */
protected List<Filter> m_filtersToUse;
/** The missing values replacer to use */
protected PreconstructedFilter m_missingValuesReplacer;
/**
* The final pre-processing filter to use (encapsulating all specified filters
* and the missing values replacer). This can be null if all we are using is
* missing values replacement, in which case the missing values replacer gets
* set directly on the canopy clusterer
*/
protected PreconstructedFilter m_finalFullPreprocess;
/** User-supplied T1 (a negative value is interpreted as a multiplier of T2) */
protected String m_userT1 = "" + Canopy.DEFAULT_T1;
/** User-supplied T2 (a value <= 0 means use the heuristic T2) */
protected String m_userT2 = "" + Canopy.DEFAULT_T2;
/** Requested number of clusters */
protected String m_numClusters = "2";
/**
* Prune low-density candidate canopies after every x instances have been seen
*/
protected String m_periodicPruningRate = "10000";
/**
* The minimum cluster density (according to T2 distance) allowed. Used when
* periodically pruning candidate canopies
*/
protected String m_minClusterDensity = "2";
/** The maximum number of candidate canopies to hold in memory at any one time */
protected String m_maxCanopyCandidates = "100";
/** If true then don't replace missing values with global means/modes */
protected boolean m_dontReplaceMissing;
/** heuristic value for T1 */
public double m_hT1 = -1;
/** heuristic value for T2 */
public double m_hT2 = -1;
/** True once all updates are completed and updateFinished() has been called */
protected boolean m_finalized;
/**
* Substitute environment variables in the supplied string.
*
* @param orig the string to modify
* @return the string with environment variables resolved
*/
public String environmentSubstitute(String orig) {
if (m_env == null) {
m_env = Environment.getSystemWide();
}
if (m_env != null) {
try {
orig = m_env.substitute(orig);
} catch (Exception ex) {
// ignore - just return the original string if substitution fails
}
}
return orig;
}
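/**
* Initialize the map task. To be called after setOptions(). Strips the
* summary attributes from the supplied header, computes the heuristic T2,
* sets up missing values replacement (unless turned off) and any other
* pre-processing filters, and configures the canopy clusterer.
*
* @param headerWithSummary the header of the training data, with summary
* attributes
* @throws DistributedWekaException if a problem occurs
*/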
public void init(Instances headerWithSummary) throws DistributedWekaException {
// to be called after setOptions();
m_header = headerWithSummary;
Instances headerNoSummary =
CSVToARFFHeaderReduceTask.stripSummaryAtts(m_header);
Instances dummyDistancePrimer =
CanopyReduceTask.getPrimingDataForDistanceFunction(m_header);
// heuristic T2
m_hT2 = getHeuristicT2(headerWithSummary);
// deal with filters
if (!m_dontReplaceMissing) {
try {
m_missingValuesReplacer =
new PreconstructedMissingValuesReplacer(m_header);
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
configureFilters(headerNoSummary);
configureCanopyClusterer(headerNoSummary, dummyDistancePrimer);
}
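/**
* Process the supplied instance, passing it through the pre-processing
* filter (if any) and updating the canopy clusterer.
*
* @param inst the instance to process
* @throws DistributedWekaException if the task has not been initialized,
* has already been finalized, or a problem occurs during the update
*/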
public void update(Instance inst) throws DistributedWekaException {
if (m_canopy == null) {
throw new DistributedWekaException(
"CanopyMapTask has not been initialized yet!");
}
if (m_finalized) {
throw new DistributedWekaException(
"This map task has been finalized - can't process any more updates");
}
Instance toProcess = inst;
if (m_finalFullPreprocess != null) {
try {
((Filter) m_finalFullPreprocess).input(toProcess);
toProcess = ((Filter) m_finalFullPreprocess).output();
if (toProcess == null) {
throw new Exception(
"Preprocessing filter did not make instance available immediately!");
}
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
try {
m_canopy.updateClusterer(toProcess);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
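/**
* Signal that there will be no more updates. Finalizes the underlying
* canopy clusterer. Subsequent calls have no effect.
*/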
public void updateFinished() {
if (m_canopy != null && !m_finalized) {
m_canopy.updateFinished();
m_finalized = true;
}
}
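/**
* Get the finalized clusterer. If pre-processing filters beyond missing
* values replacement are in play, the canopy clusterer is wrapped in a
* PreconstructedFilteredClusterer.
*
* @return the finalized clusterer
* @throws DistributedWekaException if the task has not been initialized or
* has not been finalized yet
*/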
public Clusterer getFinalizedClusterer() throws DistributedWekaException {
if (m_canopy == null) {
throw new DistributedWekaException(
"CanopyMapTask has not been initialized yet!");
}
if (!m_finalized) {
throw new DistributedWekaException(
"This map task has note been finalized yet!");
}
if (m_finalFullPreprocess == null) {
return m_canopy;
}
PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer();
fc.setFilter((Filter) m_finalFullPreprocess);
fc.setClusterer(m_canopy);
return fc;
}
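/**
* Construct the final pre-processing filter from the user-specified filters
* (with missing values replacement first, unless turned off) and initialize
* it with the supplied header.
*
* @param headerNoSummary the header of the training data, without summary
* attributes
* @throws DistributedWekaException if a specified filter is not a
* StreamableFilter, or if a problem occurs
*/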
protected void configureFilters(Instances headerNoSummary)
throws DistributedWekaException {
// setOptions() will have set up the pre-processing filters. Now
// we just adjust the final set depending on whether missing values
// are to be replaced as well. We always want missing values replacement
// first in the list so that it operates on the original data values
if (m_filtersToUse != null && m_filtersToUse.size() > 0) {
List<StreamableFilter> filters = new ArrayList<StreamableFilter>();
if (!getDontReplaceMissingValues()) {
filters.add((StreamableFilter) m_missingValuesReplacer);
}
for (Filter f : m_filtersToUse) {
if (!(f instanceof StreamableFilter)) {
throw new DistributedWekaException("Filter " + f.getClass().getName()
+ " is not a StreamableFilter!");
}
filters.add((StreamableFilter) f);
}
try {
m_finalFullPreprocess =
StreamableFilterHelper.wrapStreamableFilters(filters);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
if (m_finalFullPreprocess != null) {
try {
((Filter) m_finalFullPreprocess).setInputFormat(headerNoSummary);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
}
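/**
* Configure the canopy clusterer from the current option settings, using
* the heuristic T2 unless a positive user-supplied T2 is in effect, and
* prime its distance function with the supplied min/max data when no
* pre-processing filters (beyond missing values replacement) are in use.
*
* @param headerNoSummary the header of the training data, without summary
* attributes
* @param dummyDistancePrimer priming data (global min/max values) for the
* distance function
* @throws DistributedWekaException if a problem occurs
*/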
protected void configureCanopyClusterer(Instances headerNoSummary,
Instances dummyDistancePrimer) throws DistributedWekaException {
m_canopy = new ECanopy();
if (!DistributedJobConfig.isEmpty(getMaxNumCanopies())) {
String nC = environmentSubstitute(getMaxNumCanopies());
System.err.println("[CanopyMap] max canopy clusters: " + nC);
try {
m_canopy.setNumClusters(Integer.parseInt(nC));
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
if (!DistributedJobConfig
.isEmpty(getMaxNumCandidateCanopiesToHoldInMemory())) {
m_canopy
.setMaxNumCandidateCanopiesToHoldInMemory(Integer
.parseInt(environmentSubstitute(getMaxNumCandidateCanopiesToHoldInMemory())));
}
if (!DistributedJobConfig.isEmpty(getPeriodicPruningRate())) {
m_canopy.setPeriodicPruningRate(Integer
.parseInt(environmentSubstitute(getPeriodicPruningRate())));
}
if (!DistributedJobConfig.isEmpty(getMinimumCanopyDensity())) {
m_canopy.setMinimumCanopyDensity(Double
.parseDouble(environmentSubstitute(getMinimumCanopyDensity())));
}
double userT2 = Double.parseDouble(environmentSubstitute(m_userT2));
if (userT2 > 0) {
m_hT2 = userT2;
}
m_canopy.setT2(m_hT2);
double userT1 = Double.parseDouble(environmentSubstitute(m_userT1));
m_hT1 = userT1 > 0 ? userT1 : -userT1 * m_hT2;
m_canopy.setT1(m_hT1);
// Set missing values replacer directly on the canopy clusterer
// if there is no combined pre-processing filter in play
if (m_finalFullPreprocess == null && m_missingValuesReplacer != null) {
m_canopy.setMissingValuesReplacer((Filter) m_missingValuesReplacer);
}
try {
Instances initInsts = headerNoSummary;
if (m_finalFullPreprocess != null) {
initInsts = ((Filter) m_finalFullPreprocess).getOutputFormat();
}
m_canopy.buildClusterer(initInsts);
// if there are any other filters (besides missing values)
// in play then we can't initialize the distance function
// with min/max dummy data (since we'd need the min/max
// attribute info from the transformed data)
if (m_finalFullPreprocess == null) {
m_canopy.initializeDistanceFunction(dummyDistancePrimer);
}
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
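/**
* Compute a heuristic T2 distance from the summary attributes: the square
* root of the sum over all attributes of 0.5 * stdDev / range for numeric
* attributes (0.25 is contributed for each nominal attribute).
*
* @param headerWithSummary the header of the training data, with summary
* attributes
* @return the heuristic T2 value
* @throws DistributedWekaException if a problem occurs
*/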
public static double getHeuristicT2(Instances headerWithSummary)
throws DistributedWekaException {
Instances headerNoSummary =
CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
double[] mins = new double[headerNoSummary.numAttributes()];
double[] maxes = new double[headerNoSummary.numAttributes()];
double normalizedStdDevSum = 0;
for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
Attribute orig = headerNoSummary.attribute(i);
Attribute summary =
headerWithSummary
.attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
+ orig.name());
if (orig.isNumeric()) {
// number of non-missing values
double count =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.COUNT
.ordinal()];
if (count > 2) {
mins[i] =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MIN
.ordinal()];
maxes[i] =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MAX
.ordinal()];
double stdD =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.STDDEV
.ordinal()];
if (!Utils.isMissingValue(stdD) && maxes[i] - mins[i] > 0) {
stdD = 0.5 * stdD / (maxes[i] - mins[i]);
normalizedStdDevSum += stdD;
}
}
} else if (orig.isNominal()) {
// min/max don't matter for non-numeric attributes
mins[i] = Utils.missingValue();
maxes[i] = Utils.missingValue();
normalizedStdDevSum += 0.25;
}
}
normalizedStdDevSum = Math.sqrt(normalizedStdDevSum);
return normalizedStdDevSum > 0 ? normalizedStdDevSum : 0;
}
@Override
public Enumeration<Option> listOptions() {