/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVToARFFHeaderMapTask.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed;

import au.com.bytecode.opencsv.CSVParser;
import com.clearspring.analytics.stream.quantile.TDigest;
import distributed.core.DistributedJobConfig;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NominalStats;
import weka.core.stats.NumericStats;
import weka.core.stats.Stats;
import weka.core.stats.StringStats;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.Vector;

/**
 * A map task that processes incoming lines in CSV format and builds up header
 * information. Can be configured with information on which columns to force to
 * be nominal, string, date etc. Nominal values can be determined automatically
 * or pre-supplied by the user. In addition to determining the format of the
 * columns in the data, it can also compute metadata such as means, modes,
 * counts, standard deviations etc. These statistics are encoded in special
 * "summary" attributes in the header file - one for each numeric or nominal
 * attribute in the data.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 12441 $
 */
public class CSVToARFFHeaderMapTask implements OptionHandler, Serializable {

  /** Attribute name prefix for a summary statistics attribute */
  public static final String ARFF_SUMMARY_ATTRIBUTE_PREFIX = "arff_summary_";
  /** The maximum number of parsing errors to tolerate */
  public static final int MAX_PARSING_ERRORS = 50;
  /**
   * For serialization
   */
  private static final long serialVersionUID = -3949274571568175413L;
  /** Attribute types for the incoming CSV columns */
  protected TYPE[] m_attributeTypes;

  /** A range of columns to force to be of type String */
  protected Range m_forceString = new Range();

  /** A range of columns to force to be of type Nominal */
  protected Range m_forceNominal = new Range();

  /** A range of columns to force to be of type Date */
  protected Range m_forceDate = new Range();

  /**
   * User supplied ranges to force to be string (passed to Range objects at init
   * time)
   */
  protected String m_stringRange = "";

  /**
   * User supplied ranges to force to be nominal (passed to Range objects at
   * init time)
   */
  protected String m_nominalRange = "";

  /**
   * User supplied ranges to force to be date (passed to Range objects at init
   * time)
   */
  protected String m_dateRange = "";

  /**
   * Holds the names of the incoming columns/attributes. Names will be generated
   * if not supplied by the user
   */
  protected List<String> m_attributeNames = new ArrayList<String>();
  /** The formatting string to use to parse dates */
  protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss";
  /** The formatter to use on dates */
  protected SimpleDateFormat m_formatter;
  /** The user-supplied legal nominal values - each entry in the list is a spec */
  protected List<String> m_nominalLabelSpecs = new ArrayList<String>();
  /**
   * The user-supplied default nominal values - each entry in the list is a spec
   */
  protected List<String> m_nominalDefaultLabelSpecs = new ArrayList<String>();
  /** Lookup for nominal values */
  protected Map<Integer, TreeSet<String>> m_nominalVals =
    new HashMap<Integer, TreeSet<String>>();
  /**
   * Default labels (if any) to use with nominal attributes. These are like a
   * "catch-all" and can be used when you are explicitly specifying labels but
   * don't want to specify all of them. One use-case is to convert a
   * multi-class problem into a binary one by simply specifying the positive
   * class label.
   */
  protected Map<Integer, String> m_nominalDefaultVals =
    new HashMap<Integer, String>();
  /** The placeholder for missing values. */
  protected String m_MissingValue = "?";
  /** enclosure character to use for strings - opencsv only allows one */
  protected String m_Enclosures = "\'";
  /** the field separator. */
  protected String m_FieldSeparator = ",";
  /** The CSV parser (unfortunately, the parser does not implement Serializable) */
  protected transient CSVParser m_parser;
  /** Whether to compute summary statistics or not */
  protected boolean m_computeSummaryStats = true;
  /** A map of attribute names to summary statistics */
  protected Map<String, Stats> m_summaryStats =
    new HashMap<String, Stats>();
  /**
   * Whether to treat zeros as missing values when computing summary stats for
   * numeric attributes
   */
  protected boolean m_treatZeroAsMissing;

  /** Whether to suppress command line options relating to quantile estimation */
  protected boolean m_suppressQuantileOptions;

  /** Whether to perform quantile estimation too */
  protected boolean m_estimateQuantiles = false;
  /** The compression level for the TDigest quantile estimator */
  protected double m_quantileCompression = NumericStats.Q_COMPRESSION;
  /** The number of parsing errors encountered so far */
  protected int m_parsingErrors;

  /**
   * Constructor
   */
  public CSVToARFFHeaderMapTask() {
    this(false);
  }

  /**
   * Constructor
   * 
   * @param suppressQuantileOptions true if commandline options relating to
   *          quantile estimation are to be suppressed
   */
  public CSVToARFFHeaderMapTask(boolean suppressQuantileOptions) {
    m_suppressQuantileOptions = suppressQuantileOptions;
  }

  /**
   * Update the summary statistics for a given attribute with the given value
   *
   * @param summaryStats the map of summary statistics
   * @param attName the name of the attribute being updated
   * @param value the value to update with (if the attribute is numeric)
   * @param nominalLabel holds the label/string for the attribute (if it is
   *          nominal or string)
   * @param isNominal true if the attribute is nominal
   * @param isString true if the attribute is a string attribute
   * @param treatZeroAsMissing true to treat zeros as missing values for
   *          numeric attributes
   * @param estimateQuantiles true if we should estimate quantiles too
   * @param quantileCompression the compression level to use in the TDigest
   *          estimators
   */
  public static void updateSummaryStats(Map<String, Stats> summaryStats,
    String attName, double value, String nominalLabel, boolean isNominal,
    boolean isString, boolean treatZeroAsMissing, boolean estimateQuantiles,
    double quantileCompression) {
    Stats s = summaryStats.get(attName);

    if (!isNominal && !isString) {
      // numeric attribute
      if (s == null) {
        s = new NumericStats(attName, quantileCompression);
        summaryStats.put(attName, s);
      }

      NumericStats ns = (NumericStats) s;
      ns.update(value, 1.0, treatZeroAsMissing, estimateQuantiles);
      // if (Utils.isMissingValue(value) || (treatZeroAsMissing && value == 0))
      // {
      // ns.m_stats[ArffSummaryNumericMetric.MISSING.ordinal()]++;
      // } else {
      // ns.m_stats[ArffSummaryNumericMetric.COUNT.ordinal()]++;
      // ns.m_stats[ArffSummaryNumericMetric.SUM.ordinal()] += value;
      // ns.m_stats[ArffSummaryNumericMetric.SUMSQ.ordinal()] += value * value;
      // if (Double.isNaN(ns.m_stats[ArffSummaryNumericMetric.MIN.ordinal()])) {
      // ns.m_stats[ArffSummaryNumericMetric.MIN.ordinal()] =
      // ns.m_stats[ArffSummaryNumericMetric.MAX.ordinal()] = value;
      // } else if (value < ns.m_stats[ArffSummaryNumericMetric.MIN.ordinal()])
      // {
      // ns.m_stats[ArffSummaryNumericMetric.MIN.ordinal()] = value;
      // } else if (value > ns.m_stats[ArffSummaryNumericMetric.MAX.ordinal()])
      // {
      // ns.m_stats[ArffSummaryNumericMetric.MAX.ordinal()] = value;
      // }
      // }
    } else if (isNominal) {
      // nominal attribute

      if (s == null) {
        s = new NominalStats(attName);
        summaryStats.put(attName, s);
      }

      // check to see if the type is correct - it
      // might not be if the first row(s) processed contain
      // missing values. In this case the TYPE would have
      // been undetermined (unless explicitly specified
      // by the user). The default is to assume the
      // attribute is numeric, so a NumericStats object
      // (initialized with only the missing count) would
      // have been created.

      if (s instanceof NumericStats) {
        double missing =
          ((NumericStats) s).getStats()[ArffSummaryNumericMetric.MISSING
            .ordinal()];

        // need to replace this with NominalStats and transfer over the missing
        // count
        s = new NominalStats(attName);
        ((NominalStats) s).add(null, missing);
        summaryStats.put(attName, s);
      }

      NominalStats ns = (NominalStats) s;
      ns.add(nominalLabel, 1.0);
      // if (Utils.isMissingValue(value) && nominalLabel == null) {
      // ns.add(nominalLabel, 1.0);
      // } else {
      //
      // NominalStats.Count c = ns.m_counts.get(nominalLabel);
      // if (c == null) {
      // c = new NominalStats.Count();
      // ns.m_counts.put(nominalLabel, c);
      // }
      // c.m_count += value;
      // }
    } else if (isString) {
      if (s == null) {
        s = new StringStats(attName);
        summaryStats.put(attName, s);
      }

      StringStats ss = (StringStats) s;
      ss.update(nominalLabel, 1.0);
    }
  }
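
  // Illustrative sketch (not part of the original class): how a caller might
  // drive updateSummaryStats() for one numeric value and one nominal label.
  // The attribute names and values below are hypothetical.
  //
  // Map<String, Stats> stats = new HashMap<String, Stats>();
  // // numeric column: pass the parsed value; nominalLabel is null
  // updateSummaryStats(stats, "petalwidth", 0.2, null, false, false, false,
  //   true, NumericStats.Q_COMPRESSION);
  // // nominal column: the numeric value is unused; pass the label instead
  // updateSummaryStats(stats, "class", 0, "Iris-setosa", true, false, false,
  //   true, NumericStats.Q_COMPRESSION);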

  /**
   * Utility method that converts the attributes of an Instances header into a
   * list of attribute names.
   *
   * @param header the header to extract attribute names from
   * @return the attribute names, in order of occurrence
   */
  public static List<String> instanceHeaderToAttributeNameList(
    Instances header) {
    List<String> attNames = new ArrayList<String>();

    for (int i = 0; i < header.numAttributes(); i++) {
      attNames.add(header.attribute(i).name());
    }

    return attNames;
  }
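
  // e.g. (hypothetical usage): List<String> names =
  //   instanceHeaderToAttributeNameList(task.getHeader());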

  /**
   * Main method for testing this class from the command line.
   *
   * @param args the path to a CSV file to process, followed by any options
   */
  public static void main(String[] args) {
    try {
      CSVToARFFHeaderMapTask task = new CSVToARFFHeaderMapTask();
      task.setOptions(args);
      // task.setComputeSummaryStats(true);

      BufferedReader br = new BufferedReader(new FileReader(args[0]));
      String line = br.readLine();
      String[] names = line.split(",");
      List<String> attNames = new ArrayList<String>();
      for (String s : names) {
        attNames.add(s);
      }

      while ((line = br.readLine()) != null) {
        task.processRow(line, attNames);
      }

      br.close();

      System.err.println(task.getHeader());

      CSVToARFFHeaderReduceTask arffReduce = new CSVToARFFHeaderReduceTask();
      List<Instances> instList = new ArrayList<Instances>();
      instList.add(task.getHeader());
      Instances withSummary = arffReduce.aggregate(instList);

      System.err.println(withSummary);

    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }

  /**
   * Performs a "combine" operation using the supplied partial
   * CSVToARFFHeaderMapTask tasks. This is essentially a reduce operation, but
   * returns a single CSVToARFFHeaderMapTask object (rather than the final
   * header that is produced by CSVToARFFHeaderReduceTask). This allows several
   * reduce stages to be implemented (if desired) or partial reduces to occur in
   * parallel.
   *
   * @param tasks a list of CSVToARFFHeaderMapTasks to "combine"
   * @return a CSVToARFFHeaderMapTask with the merged state
   * @throws DistributedWekaException if a problem occurs
   */
  public static CSVToARFFHeaderMapTask combine(
    List<CSVToARFFHeaderMapTask> tasks) throws DistributedWekaException {
    if (tasks == null || tasks.size() == 0) {
      throw new DistributedWekaException(
        "[CSVToARFFHeaderMapTask:combine] no tasks to combine!");
    }
    if (tasks.size() == 1) {
      return tasks.get(0);
    }

    Instances combinedHeaders = null;
    CSVToARFFHeaderMapTask master = tasks.get(0);
    List<Instances> toCombine = new ArrayList<Instances>();
    for (int i = 0; i < tasks.size(); i++) {
      toCombine.add(tasks.get(i).getHeader());
    }
    combinedHeaders = CSVToARFFHeaderReduceTask.aggregate(toCombine);

    Map<String, TDigest> mergedDigests = new HashMap<String, TDigest>();
    if (master.getComputeQuartilesAsPartOfSummaryStats()) {
      Instances headerNoSummary =
        CSVToARFFHeaderReduceTask.stripSummaryAtts(combinedHeaders);

      for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
        List<TDigest> digestsToMerge = new ArrayList<TDigest>();
        String attName = headerNoSummary.attribute(i).name();

        for (CSVToARFFHeaderMapTask t : tasks) {
          Stats ns = t.m_summaryStats.get(attName);
          if (ns instanceof NumericStats) {
            TDigest partialEstimator =
              ((NumericStats) ns).getQuantileEstimator();
            if (partialEstimator != null) {
              digestsToMerge.add(partialEstimator);
            }
          }

          // HeaderAndQuantileDataHolder h =
          // t.getHeaderAndQuantileEstimators();
          // TDigest partialEstimator =
          // h.getQuantileEstimator(attName);
          // if (partialEstimator != null) {
          // digestsToMerge.add(partialEstimator);
          // }
        }

        if (digestsToMerge.size() > 0) {
          TDigest mergedForAtt =
            TDigest.merge(digestsToMerge.get(0).compression(), digestsToMerge);
          mergedDigests.put(attName, mergedForAtt);
        }
      }
    }

    // need to re-construct master now that we've (potentially) resolved
    // type conflicts within this combine operation
    master.fromHeader(combinedHeaders, mergedDigests);

    return master;
  }
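
  // Illustrative sketch (not part of the original class): combining partial
  // tasks built over separate chunks of a CSV file before the final reduce.
  // Variable names (partials, chunks, attNames) are hypothetical; option
  // configuration is elided.
  //
  // List<CSVToARFFHeaderMapTask> partials =
  //   new ArrayList<CSVToARFFHeaderMapTask>();
  // for (List<String> chunk : chunks) {
  //   CSVToARFFHeaderMapTask t = new CSVToARFFHeaderMapTask();
  //   for (String row : chunk) {
  //     t.processRow(row, attNames);
  //   }
  //   partials.add(t);
  // }
  // CSVToARFFHeaderMapTask merged = CSVToARFFHeaderMapTask.combine(partials);
  // Instances mergedHeader = merged.getHeader();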

  @Override
  public Enumeration<Option> listOptions()



