All downloads are free. Search and download functionality uses the official Maven repository.

weka.knowledgeflow.steps.Appender Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Appender.java
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.knowledgeflow.steps;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.WekaException;
import weka.core.converters.SerializedInstancesLoader;
import weka.gui.knowledgeflow.KFGUIConsts;
import weka.knowledgeflow.Data;
import weka.knowledgeflow.StepManager;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * A bean that appends multiple incoming data connections into a single data
 * set. The incoming connections can be either all instance connections or all
 * batch-oriented connections (i.e. data set, training set and test set).
 * Instance and batch connections can't be mixed. An amalgamated output is
 * created that is a combination of all the incoming attributes. Missing values
 * are used to fill columns that don't exist in a particular incoming data set.
 * If all incoming connections are instance connections, then the outgoing
 * connection must be an instance connection (and vice versa for incoming batch
 * connections).
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: $
 */
@KFStep(name = "Appender", category = "Flow",
  toolTipText = "Append multiple sets of instances",
  iconPath = KFGUIConsts.BASE_ICON_PATH + "Appender.png")
public class Appender extends BaseStep {

  private static final long serialVersionUID = -3003135257112845998L;

  /**
   * Used to keep track of how many upstream steps have sent us complete data
   * sets (batch) or headers (incremental) so far.
   */
  protected Map m_completed;

  /** Handles on temp files used to store batches of instances in batch mode */
  protected Map m_tempBatchFiles;

  /** Used to hold the final header in the case of incremental operation */
  protected Instances m_completeHeader;

  /** Gets decremented for each incoming instance stream that has finished */
  protected AtomicInteger m_streamingCountDown;

  /**
   * Holds savers used for incrementally saving incoming instance streams. After
   * we've seen the structure from each incoming connection we can create the
   * final output structure, pull any saved instances from the temp files and
   * discard these savers as they will no longer be needed.
   */
  protected transient Map m_incrementalSavers;

  /** Holds a files in play for incremental incoming connections */
  protected transient Map m_incrementalFiles;

  /** Re-usable data object for streaming mode */
  protected Data m_streamingData;

  /** True if this step has been reset */
  protected boolean m_isReset;

  /**
   * Initialize the step
   *
   * @throws WekaException if a problem occurs
   */
  @Override
  public void stepInit() throws WekaException {
    m_isReset = true;
    m_completed = new HashMap();
    m_tempBatchFiles = new HashMap();
    m_completeHeader = null;
    m_incrementalSavers = new HashMap();
    m_incrementalFiles = new HashMap();
    m_streamingCountDown = new AtomicInteger(
      getStepManager().numIncomingConnectionsOfType(StepManager.CON_INSTANCE));
    m_streamingData = new Data(StepManager.CON_INSTANCE);
  }

  /**
   * Get the incoming connection types accepted by this step at this time
   *
   * @return a list of incoming connection types
   */
  @Override
  public List getIncomingConnectionTypes() {
    List result = new ArrayList();
    if (getStepManager().numIncomingConnections() == 0 || getStepManager()
      .numIncomingConnectionsOfType(StepManager.CON_INSTANCE) == 0) {
      result.addAll(Arrays.asList(StepManager.CON_DATASET,
        StepManager.CON_TRAININGSET, StepManager.CON_TESTSET));
    }

    if (getStepManager().numIncomingConnections() == 0 || getStepManager()
      .numIncomingConnectionsOfType(StepManager.CON_INSTANCE) > 0) {
      result.add(StepManager.CON_INSTANCE);
    }

    return result;
  }

  /**
   * Get a list of outgoing connection types that this step can produce at this
   * time
   * 
   * @return a list of outgoing connection types
   */
  @Override
  public List getOutgoingConnectionTypes() {
    List result = new ArrayList();

    if (getStepManager()
      .numIncomingConnectionsOfType(StepManager.CON_INSTANCE) > 0) {
      result.add(StepManager.CON_INSTANCE);
    } else {
      result.addAll(Arrays.asList(StepManager.CON_DATASET,
        StepManager.CON_TRAININGSET, StepManager.CON_TESTSET));
    }

    return result;
  }

  /**
   * Process an incoming data payload (if the step accepts incoming connections)
   *
   * @param data the data to process
   * @throws WekaException if a problem occurs
   */
  @Override
  public void processIncoming(Data data) throws WekaException {
    if (m_isReset
      && !data.getConnectionName().equals(StepManager.CON_INSTANCE)) {
      getStepManager().processing();
      m_isReset = false;
    }

    if (data.getConnectionName().equals(StepManager.CON_INSTANCE)) {
      processStreaming(data);

      if (m_streamingCountDown.get() == 0) {
        // all done
        m_streamingData.clearPayload();
        getStepManager().throughputFinished(m_streamingData);
      }
    } else {
      processBatch(data);
      if (m_completed.size() == getStepManager().numIncomingConnections()) {
        // done
        getStepManager().finished();
        // save memory
        m_completed.clear();
        m_tempBatchFiles.clear();
      }
    }

    if (isStopRequested()) {
      getStepManager().interrupted();
      // save memory
      m_completed.clear();
      m_tempBatchFiles.clear();
      m_incrementalSavers.clear();
      m_incrementalFiles.clear();
    }
  }

  /**
   * Process batch data
   *
   * @param data the data to process
   * @throws WekaException if a problem occurs
   */
  protected synchronized void processBatch(Data data) throws WekaException {
    Integer setNum =
      data.getPayloadElement(StepManager.CON_AUX_DATA_SET_NUM, 1);
    Integer maxSetNum =
      data.getPayloadElement(StepManager.CON_AUX_DATA_MAX_SET_NUM, 1);
    Instances insts = data.getPrimaryPayload();

    if (setNum > 1 || maxSetNum > 1) {
      // can't accept more than one dataset/batch from a particular source
      throw new WekaException("Source " + data.getSourceStep().getName() + " "
        + "is generating more than one " + data.getConnectionName() + " "
        + "in a batch");
    }

    Instances header = new Instances(insts, 0);
    m_completed.put(data.getSourceStep(), header);
    // write these instances (serialized) to a temp file
    try {
      File tmpF =
        File.createTempFile("weka", SerializedInstancesLoader.FILE_EXTENSION);
      // tmpF.deleteOnExit();
      ObjectOutputStream oos = new ObjectOutputStream(
        new BufferedOutputStream(new FileOutputStream(tmpF)));
      oos.writeObject(insts);
      oos.flush();
      oos.close();

      m_tempBatchFiles.put(data.getSourceStep(), tmpF);
    } catch (IOException e1) {
      throw new WekaException(e1);
    }

    if (isStopRequested()) {
      return;
    }

    // have we seen a dataset from every incoming connection?
    if (m_completed.size() == getStepManager().numIncomingConnections()) {
      // process all headers and create mongo header for new output.
      // missing values will fill columns that don't exist in particular data
      // sets
      Instances output = makeOutputHeader();
      getStepManager().logDetailed("Making output header structure");

      try {
        for (File f : m_tempBatchFiles.values()) {
          ObjectInputStream ois = new ObjectInputStream(
            new BufferedInputStream(new FileInputStream(f)));
          Instances temp = (Instances) ois.readObject();
          ois.close();

          // copy each instance over
          for (int i = 0; i < temp.numInstances(); i++) {
            Instance converted = makeOutputInstance(output, temp.instance(i));
            output.add(converted);
          }
        }

        Data outputD = new Data(data.getConnectionName(), output);
        outputD.setPayloadElement(StepManager.CON_AUX_DATA_SET_NUM, 1);
        outputD.setPayloadElement(StepManager.CON_AUX_DATA_MAX_SET_NUM, 1);
        getStepManager().outputData(outputD);
      } catch (Exception ex) {
        throw new WekaException(ex);
      }
    }
  }

  /**
   * Process streaming data
   *
   * @param data the data to process
   * @throws WekaException if a problem occurs
   */
  protected synchronized void processStreaming(Data data) throws WekaException {
    if (isStopRequested()) {
      return;
    }

    Step source = data.getSourceStep();
    Instance inst = data.getPrimaryPayload();
    if (!m_completed.containsKey(source)) {
      m_completed.put(source, inst.dataset());
    }

    if (m_completed.size() == getStepManager().numIncomingConnections()
      && m_completeHeader == null) {
      // create mondo header...
      getStepManager().logDetailed("Creating output header structure");
      m_completeHeader = makeOutputHeader();

      // now check for any buffered instances
      if (m_incrementalSavers.size() > 0) {
        // read in and convert these instances now
        for (Map.Entry e : m_incrementalSavers
          .entrySet()) {
          // for (ObjectOutputStream s : m_incrementalSavers.values()) {
          ObjectOutputStream s = e.getValue();
          // finish off the saving process first
          try {
            // s.writeIncremental(null);
            s.flush();
            s.close();

            // File tmpFile = s.retrieveFile();
            File tmpFile = m_incrementalFiles.get(e.getKey());
            ObjectInputStream ois = new ObjectInputStream(
              new BufferedInputStream(new FileInputStream(tmpFile)));
            Instance tmpLoaded = null;
            do {
              try {
                tmpLoaded = (Instance) ois.readObject();
                Instance converted =
                  makeOutputInstance(m_completeHeader, tmpLoaded);
                m_streamingData.setPayloadElement(StepManager.CON_INSTANCE,
                  converted);
                getStepManager().outputData(m_streamingData);
              } catch (Exception ex) {
                // EOF
                ois.close();
                break;
              }
            } while (tmpLoaded != null);

            /*
             * ArffLoader loader = new ArffLoader(); loader.setFile(tmpFile);
             * Instances tempStructure = loader.getStructure(); Instance
             * tempLoaded = loader.getNextInstance(tempStructure); while
             * (tempLoaded != null) { Instance converted =
             * makeOutputInstance(m_completeHeader, tempLoaded);
             * m_streamingData.setPayloadElement(StepManager.CON_INSTANCE,
             * converted); getStepManager().outputData(data);
             * 
             * tempLoaded = loader.getNextInstance(tempStructure); }
             */
          } catch (Exception ex) {
            throw new WekaException(ex);
          }
        }
        m_incrementalSavers.clear();
        m_incrementalFiles.clear();
      }
    }

    if (isStopRequested()) {
      return;
    }

    if (getStepManager().isStreamFinished(data)) {
      m_streamingCountDown.decrementAndGet();
      return;
    }

    if (m_completeHeader == null) {

      ObjectOutputStream saver = m_incrementalSavers.get(data.getSourceStep());
      if (saver == null) {
        try {
          File tmpFile = File.createTempFile("weka", ".arff");
          saver = new ObjectOutputStream(
            new BufferedOutputStream(new FileOutputStream(tmpFile)));
          m_incrementalSavers.put(data.getSourceStep(), saver);
          m_incrementalFiles.put(data.getSourceStep(), tmpFile);
        } catch (IOException ex) {
          throw new WekaException(ex);
        }
      }

      // ArffSaver saver = m_incrementalSavers.get(data.getSourceStep());
      // if (saver == null) {
      /*
       * saver = new ArffSaver(); try { File tmpFile =
       * File.createTempFile("weka", ".arff"); saver.setFile(tmpFile);
       * saver.setRetrieval(weka.core.converters.Saver.INCREMENTAL);
       * saver.setInstances(new Instances(inst.dataset(), 0));
       * m_incrementalSavers.put(data.getSourceStep(), saver); } catch
       * (IOException e1) { throw new WekaException(e1); }
       */

      try {
        // saver.writeIncremental(inst);
        saver.writeObject(inst);
      } catch (IOException e1) {
        throw new WekaException(e1);
      }
      // }
    } else {
      Instance newI = makeOutputInstance(m_completeHeader, inst);
      m_streamingData.setPayloadElement(StepManager.CON_INSTANCE, newI);
      getStepManager().outputData(m_streamingData);
    }
  }

  /**
   * Makes an output instance
   *
   * @param output the structure of the output
   * @param source the source instance
   * @return an output instance
   */
  private Instance makeOutputInstance(Instances output, Instance source) {

    double[] newVals = new double[output.numAttributes()];
    for (int i = 0; i < newVals.length; i++) {
      newVals[i] = Utils.missingValue();
    }

    for (int i = 0; i < source.numAttributes(); i++) {
      if (!source.isMissing(i)) {
        Attribute s = source.attribute(i);
        int outputIndex = output.attribute(s.name()).index();
        if (s.isNumeric()) {
          newVals[outputIndex] = source.value(s);
        } else if (s.isString()) {
          String sVal = source.stringValue(s);
          newVals[outputIndex] =
            output.attribute(outputIndex).addStringValue(sVal);
        } else if (s.isRelationValued()) {
          Instances rVal = source.relationalValue(s);
          newVals[outputIndex] =
            output.attribute(outputIndex).addRelation(rVal);
        } else if (s.isNominal()) {
          String nomVal = source.stringValue(s);
          newVals[outputIndex] =
            output.attribute(outputIndex).indexOfValue(nomVal);
        }
      }
    }

    Instance newInst = new DenseInstance(source.weight(), newVals);
    newInst.setDataset(output);

    return newInst;
  }

  /**
   * Create the structure of the output
   *
   * @return the structure of the output as a header-only set of instances
   * @throws WekaException if a problem occurs
   */
  protected Instances makeOutputHeader() throws WekaException {
    return makeOutputHeader(m_completed.values());
  }

  /**
   * Create the structure of the output given a collection of input structures
   *
   * @param headers a collection of incoming instance structures
   * @return the structure of the output as a header-only set of instances
   * @throws WekaException if a problem occurs
   */
  protected Instances makeOutputHeader(Collection headers)
    throws WekaException {
    // process each header in turn...
    Map attLookup = new HashMap();
    List attList = new ArrayList();
    Map> nominalLookups =
      new HashMap>();
    for (Instances h : headers) {
      for (int i = 0; i < h.numAttributes(); i++) {
        Attribute a = h.attribute(i);
        if (!attLookup.containsKey(a.name())) {
          attLookup.put(a.name(), a);
          attList.add(a);
          if (a.isNominal()) {
            TreeSet nVals = new TreeSet();
            for (int j = 0; j < a.numValues(); j++) {
              nVals.add(a.value(j));
            }
            nominalLookups.put(a.name(), nVals);
          }
        } else {
          Attribute storedVersion = attLookup.get(a.name());
          // mismatched types between headers - can't continue
          if (storedVersion.type() != a.type()) {
            throw new WekaException("Conflicting types for attribute "
              + "name '" + a.name() + "' between incoming " + "instance sets");
          }

          if (storedVersion.isNominal()) {
            Set storedVals = nominalLookups.get(a.name());
            for (int j = 0; j < a.numValues(); j++) {
              storedVals.add(a.value(j));
            }
          }
        }
      }
    }

    ArrayList finalAttList = new ArrayList();
    for (Attribute a : attList) {
      Attribute newAtt = null;
      if (a.isDate()) {
        newAtt = new Attribute(a.name(), a.getDateFormat());
      } else if (a.isNumeric()) {
        newAtt = new Attribute(a.name());
      } else if (a.isRelationValued()) {
        newAtt = new Attribute(a.name(), a.relation());
      } else if (a.isNominal()) {
        Set vals = nominalLookups.get(a.name());
        List newVals = new ArrayList();
        for (String v : vals) {
          newVals.add(v);
        }
        newAtt = new Attribute(a.name(), newVals);
      } else if (a.isString()) {
        newAtt = new Attribute(a.name(), (List) null);
      }

      finalAttList.add(newAtt);
    }

    return new Instances(
      "Appended_" + getStepManager().numIncomingConnections() + "_sets",
      finalAttList, 0);
  }

  /**
   * If possible, get the output structure for the named connection type as a
   * header-only set of instances. Can return null if the specified connection
   * type is not representable as Instances or cannot be determined at present.
   * 
   * @param connectionName the name of the connection to get the output structure for
   * @return the output structure or null if it can't be produced
   * @throws WekaException if a problem occurs
   */
  @Override
  public Instances outputStructureForConnectionType(String connectionName)
    throws WekaException {

    if (getStepManager().numIncomingConnections() > 0) {
      List incomingHeaders = new ArrayList();
      for (Map.Entry> e : getStepManager()
        .getIncomingConnections().entrySet()) {
        if (e.getValue().size() > 0) {
          String incomingConType = e.getKey();
          for (StepManager sm : e.getValue()) {
            Instances incomingStruc = getStepManager()
              .getIncomingStructureFromStep(sm, incomingConType);
            if (incomingStruc == null) {
              // can't determine final output structure if any incoming
              // structures are null at present
              return null;
            }
            incomingHeaders.add(incomingStruc);
          }
        }
      }
      if (incomingHeaders.size() > 0) {
        return makeOutputHeader(incomingHeaders);
      }
    }

    return null;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy