/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CanopyMapTask.java
* Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
*
*/
package weka.distributed;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;
import weka.clusterers.Canopy;
import weka.clusterers.Clusterer;
import weka.clusterers.PreconstructedFilteredClusterer;
import weka.core.Attribute;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.NormalizableDistance;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.StreamableFilterHelper;
import weka.core.Utils;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NumericStats;
import weka.filters.Filter;
import weka.filters.PreconstructedFilter;
import weka.filters.StreamableFilter;
import weka.filters.unsupervised.attribute.PreconstructedMissingValuesReplacer;
import distributed.core.DistributedJobConfig;
/**
* Map task for building partial canopies
*
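* <p>
* Typical usage (a sketch; {@code options}, {@code headerWithSummary} and
* {@code trainingInstances} are placeholders for caller-supplied values):
*
* <pre>
* CanopyMapTask task = new CanopyMapTask();
* task.setOptions(options); // must be called before init()
* task.init(headerWithSummary);
* for (Instance inst : trainingInstances) {
*   task.update(inst);
* }
* task.updateFinished();
* Clusterer partial = task.getFinalizedClusterer();
* </pre>
*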
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 11431 $
*/
public class CanopyMapTask implements OptionHandler, EnvironmentHandler,
Serializable {
/** For serialization */
private static final long serialVersionUID = 5107020708019202338L;
/** Environment variables */
protected transient Environment m_env;
/** Training data header */
protected Instances m_header;
/** Canopy clusterer */
protected Canopy m_canopy;
/** The list of filters to use */
protected List<Filter> m_filtersToUse;
/** The missing values replacer to use */
protected PreconstructedFilter m_missingValuesReplacer;
/**
* The final pre-processing filter to use (encapsulating all specified filters
* and the missing values replacer). This can be null if all we are using is
* missing values replacement, in which case the missing values replacer gets
* set directly on the canopy clusterer
*/
protected PreconstructedFilter m_finalFullPreprocess;
/** User-supplied T1 (a negative value is interpreted as a multiplier of T2) */
protected String m_userT1 = "" + Canopy.DEFAULT_T1;
/** User-supplied T2 (a value <= 0 means use the heuristic T2) */
protected String m_userT2 = "" + Canopy.DEFAULT_T2;
/** Requested number of clusters */
protected String m_numClusters = "2";
/**
* Prune low-density candidate canopies after every x instances have been seen
*/
protected String m_periodicPruningRate = "10000";
/**
* The minimum cluster density (according to T2 distance) allowed. Used when
* periodically pruning candidate canopies
*/
protected String m_minClusterDensity = "2";
/** The maximum number of candidate canopies to hold in memory at any one time */
protected String m_maxCanopyCandidates = "100";
/** If true then don't replace missing values with global means/modes */
protected boolean m_dontReplaceMissing;
/** heuristic value for T1 */
public double m_hT1 = -1;
/** heuristic value for T2 */
public double m_hT2 = -1;
/** True once all updates are completed and updateFinished() has been called */
protected boolean m_finalized;
/**
* Substitute environment variables in the supplied string.
*
* @param orig the string to modify
* @return the string with environment variables resolved
*/
public String environmentSubstitute(String orig) {
if (m_env == null) {
m_env = Environment.getSystemWide();
}
if (m_env != null) {
try {
orig = m_env.substitute(orig);
} catch (Exception ex) {
// ignore - just return the original string if substitution fails
}
}
return orig;
}
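/**
* Initialize the map task. To be called after setOptions(). Strips the
* summary attributes from the supplied header, computes the heuristic T2,
* sets up missing values replacement (unless turned off) and any other
* pre-processing filters, and configures the canopy clusterer.
*
* @param headerWithSummary the header of the training data, with summary
* attributes
* @throws DistributedWekaException if a problem occurs
*/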
public void init(Instances headerWithSummary) throws DistributedWekaException {
// to be called after setOptions();
m_header = headerWithSummary;
Instances headerNoSummary =
CSVToARFFHeaderReduceTask.stripSummaryAtts(m_header);
Instances dummyDistancePrimer =
CanopyReduceTask.getPrimingDataForDistanceFunction(m_header);
// heuristic T2
m_hT2 = getHeuristicT2(headerWithSummary);
// deal with filters
if (!m_dontReplaceMissing) {
try {
m_missingValuesReplacer =
new PreconstructedMissingValuesReplacer(m_header);
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
configureFilters(headerNoSummary);
configureCanopyClusterer(headerNoSummary, dummyDistancePrimer);
}
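/**
* Process the supplied instance, passing it through the pre-processing
* filter (if any) and updating the canopy clusterer.
*
* @param inst the instance to process
* @throws DistributedWekaException if the task has not been initialized,
* has already been finalized, or a problem occurs during the update
*/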
public void update(Instance inst) throws DistributedWekaException {
if (m_canopy == null) {
throw new DistributedWekaException(
"CanopyMapTask has not been initialized yet!");
}
if (m_finalized) {
throw new DistributedWekaException(
"This map task has been finalized - can't process any more updates");
}
Instance toProcess = inst;
if (m_finalFullPreprocess != null) {
try {
((Filter) m_finalFullPreprocess).input(toProcess);
toProcess = ((Filter) m_finalFullPreprocess).output();
if (toProcess == null) {
throw new Exception(
"Preprocessing filter did not make instance available immediately!");
}
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
try {
m_canopy.updateClusterer(toProcess);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
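/**
* Signal that there will be no more updates. Finalizes the underlying
* canopy clusterer. Subsequent calls have no effect.
*/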
public void updateFinished() {
if (m_canopy != null && !m_finalized) {
m_canopy.updateFinished();
m_finalized = true;
}
}
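/**
* Get the finalized clusterer. If pre-processing filters beyond missing
* values replacement are in play, the canopy clusterer is wrapped in a
* PreconstructedFilteredClusterer.
*
* @return the finalized clusterer
* @throws DistributedWekaException if the task has not been initialized or
* has not been finalized yet
*/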
public Clusterer getFinalizedClusterer() throws DistributedWekaException {
if (m_canopy == null) {
throw new DistributedWekaException(
"CanopyMapTask has not been initialized yet!");
}
if (!m_finalized) {
throw new DistributedWekaException(
"This map task has note been finalized yet!");
}
if (m_finalFullPreprocess == null) {
return m_canopy;
}
PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer();
fc.setFilter((Filter) m_finalFullPreprocess);
fc.setClusterer(m_canopy);
return fc;
}
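/**
* Construct the final pre-processing filter from the user-specified filters
* (with missing values replacement first, unless turned off) and initialize
* it with the supplied header.
*
* @param headerNoSummary the header of the training data, without summary
* attributes
* @throws DistributedWekaException if a specified filter is not a
* StreamableFilter, or if a problem occurs
*/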
protected void configureFilters(Instances headerNoSummary)
throws DistributedWekaException {
// setOptions() will have set up the pre-processing filters. Now
// we just adjust the final set depending on whether missing values
// are to be replaced as well. We always want missing values replacement
// first in the list so that it operates on the original data values
if (m_filtersToUse != null && m_filtersToUse.size() > 0) {
List<StreamableFilter> filters = new ArrayList<StreamableFilter>();
if (!getDontReplaceMissingValues()) {
filters.add((StreamableFilter) m_missingValuesReplacer);
}
for (Filter f : m_filtersToUse) {
if (!(f instanceof StreamableFilter)) {
throw new DistributedWekaException("Filter " + f.getClass().getName()
+ " is not a StreamableFilter!");
}
filters.add((StreamableFilter) f);
}
try {
m_finalFullPreprocess =
StreamableFilterHelper.wrapStreamableFilters(filters);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
if (m_finalFullPreprocess != null) {
try {
((Filter) m_finalFullPreprocess).setInputFormat(headerNoSummary);
} catch (Exception e) {
throw new DistributedWekaException(e);
}
}
}
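/**
* Configure the canopy clusterer from the current option settings, using
* the heuristic T2 unless a positive user-supplied T2 is in effect, and
* prime its distance function with the supplied min/max data when no
* pre-processing filters (beyond missing values replacement) are in use.
*
* @param headerNoSummary the header of the training data, without summary
* attributes
* @param dummyDistancePrimer priming data (global min/max values) for the
* distance function
* @throws DistributedWekaException if a problem occurs
*/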
protected void configureCanopyClusterer(Instances headerNoSummary,
Instances dummyDistancePrimer) throws DistributedWekaException {
m_canopy = new ECanopy();
if (!DistributedJobConfig.isEmpty(getMaxNumCanopies())) {
String nC = environmentSubstitute(getMaxNumCanopies());
System.err.println("[CanopyMap] max canopy clusters: " + nC);
try {
m_canopy.setNumClusters(Integer.parseInt(nC));
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
if (!DistributedJobConfig
.isEmpty(getMaxNumCandidateCanopiesToHoldInMemory())) {
m_canopy
.setMaxNumCandidateCanopiesToHoldInMemory(Integer
.parseInt(environmentSubstitute(getMaxNumCandidateCanopiesToHoldInMemory())));
}
if (!DistributedJobConfig.isEmpty(getPeriodicPruningRate())) {
m_canopy.setPeriodicPruningRate(Integer
.parseInt(environmentSubstitute(getPeriodicPruningRate())));
}
if (!DistributedJobConfig.isEmpty(getMinimumCanopyDensity())) {
m_canopy.setMinimumCanopyDensity(Double
.parseDouble(environmentSubstitute(getMinimumCanopyDensity())));
}
double userT2 = Double.parseDouble(environmentSubstitute(m_userT2));
if (userT2 > 0) {
m_hT2 = userT2;
}
m_canopy.setT2(m_hT2);
double userT1 = Double.parseDouble(environmentSubstitute(m_userT1));
m_hT1 = userT1 > 0 ? userT1 : -userT1 * m_hT2;
m_canopy.setT1(m_hT1);
// Set missing values replacer directly on the canopy clusterer
// if there is no combined pre-processing filter in play
if (m_finalFullPreprocess == null && m_missingValuesReplacer != null) {
m_canopy.setMissingValuesReplacer((Filter) m_missingValuesReplacer);
}
try {
Instances initInsts = headerNoSummary;
if (m_finalFullPreprocess != null) {
initInsts = ((Filter) m_finalFullPreprocess).getOutputFormat();
}
m_canopy.buildClusterer(initInsts);
// if there are any other filters (besides missing values)
// in play then we can't initialize the distance function
// with min/max dummy data (since we'd need the min/max
// attribute info from the transformed data)
if (m_finalFullPreprocess == null) {
m_canopy.initializeDistanceFunction(dummyDistancePrimer);
}
} catch (Exception ex) {
throw new DistributedWekaException(ex);
}
}
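/**
* Compute a heuristic T2 distance from the summary attributes: the square
* root of the sum over all attributes of 0.5 * stdDev / range for numeric
* attributes (0.25 is contributed for each nominal attribute).
*
* @param headerWithSummary the header of the training data, with summary
* attributes
* @return the heuristic T2 value
* @throws DistributedWekaException if a problem occurs
*/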
public static double getHeuristicT2(Instances headerWithSummary)
throws DistributedWekaException {
Instances headerNoSummary =
CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
double[] mins = new double[headerNoSummary.numAttributes()];
double[] maxes = new double[headerNoSummary.numAttributes()];
double normalizedStdDevSum = 0;
for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
Attribute orig = headerNoSummary.attribute(i);
Attribute summary =
headerWithSummary
.attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
+ orig.name());
if (orig.isNumeric()) {
// number of non-missing values
double count =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.COUNT
.ordinal()];
if (count > 2) {
mins[i] =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MIN
.ordinal()];
maxes[i] =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MAX
.ordinal()];
double stdD =
NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.STDDEV
.ordinal()];
if (!Utils.isMissingValue(stdD) && maxes[i] - mins[i] > 0) {
stdD = 0.5 * stdD / (maxes[i] - mins[i]);
normalizedStdDevSum += stdD;
}
}
} else if (orig.isNominal()) {
// min/max don't matter for non-numeric attributes
mins[i] = Utils.missingValue();
maxes[i] = Utils.missingValue();
normalizedStdDevSum += 0.25;
}
}
normalizedStdDevSum = Math.sqrt(normalizedStdDevSum);
return normalizedStdDevSum > 0 ? normalizedStdDevSum : 0;
}
@Override
public Enumeration<Option> listOptions() {