All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.knowledgeflow.steps.Loader Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    Join.java
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.knowledgeflow.steps;

import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializedObject;
import weka.core.WekaException;
import weka.core.converters.FileSourcedConverter;
import weka.gui.ProgrammaticProperty;
import weka.gui.beans.StreamThroughput;
import weka.gui.knowledgeflow.StepVisual;
import weka.knowledgeflow.Data;
import weka.knowledgeflow.StepManager;
import weka.knowledgeflow.StepManagerImpl;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Knowledge Flow step that wraps {@code weka.core.converters.Loader}s.
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: $
 */
@KFStep(name = "Loader", category = "DataSources",
  toolTipText = "Weka loader wrapper", iconPath = "")
public class Loader extends WekaAlgorithmWrapper implements Serializable {

  private static final long serialVersionUID = -788869066035779154L;

  /**
   * Global info for the wrapped loader (if it exists).
   */
  protected String m_globalInfo;

  /** True if we're going to be streaming instance objects */
  protected boolean m_instanceGeneration;

  /** True if there are no outgoing connections */
  protected boolean m_noOutputs;

  /** Reusable data container */
  protected Data m_instanceData;

  /** For measuring the overall flow throughput */
  protected StreamThroughput m_flowThroughput;

  /**
   * Get the class of the wrapped algorithm
   *
   * @return the class of the wrapped algorithm
   */
  @Override
  public Class getWrappedAlgorithmClass() {
    return weka.core.converters.Loader.class;
  }

  /**
   * Set the wrapped algorithm to use
   *
   * @param algo the algorithm to use
   */
  @Override
  public void setWrappedAlgorithm(Object algo) {
    super.setWrappedAlgorithm(algo);
    m_defaultIconPath = StepVisual.BASE_ICON_PATH + "DefaultDataSource.gif";
  }

  /**
   * Convenience method - calls {@code getWrappedAlgorithm()}
   *
   * @return the wrapped loader
   */
  public weka.core.converters.Loader getLoader() {
    return (weka.core.converters.Loader) getWrappedAlgorithm();
  }

  /**
   * Convenience method - calls {@code setWrappedAlgorithm()}
   *
   * @param loader the loader to use
   */
  @ProgrammaticProperty
  public void setLoader(weka.core.converters.Loader loader) {
    setWrappedAlgorithm(loader);
  }

  /**
   * Initialize the step.
   *
   * @throws WekaException if a problem occurs during initialization
   */
  @Override
  public void stepInit() throws WekaException {

    if (!(getWrappedAlgorithm() instanceof weka.core.converters.Loader)) {
      throw new WekaException("Incorrect type of algorithm");
    }

    int numDatasetOutputs =
      getStepManager().numOutgoingConnectionsOfType(StepManager.CON_DATASET);
    int numInstanceOutputs =
      getStepManager().numOutgoingConnectionsOfType(StepManager.CON_INSTANCE);

    m_noOutputs = numInstanceOutputs == 0 && numDatasetOutputs == 0;

    if (numDatasetOutputs > 0 && numInstanceOutputs > 0) {
      throw new WekaException(
        "Can't have both instance and dataSet outgoing connections!");
    }

    if (getWrappedAlgorithm() instanceof EnvironmentHandler) {
      ((EnvironmentHandler) getWrappedAlgorithm())
        .setEnvironment(getStepManager().getExecutionEnvironment()
          .getEnvironmentVariables());
    }

    m_instanceGeneration = numInstanceOutputs > 0;
    m_instanceData = new Data(StepManager.CON_INSTANCE);
  }

  /**
   * Start executing
   *
   * @throws WekaException if a problem occurs
   */
  @Override
  public void start() throws WekaException {
    if (m_noOutputs) {
      return;
    }
    getStepManager().processing();

    weka.core.converters.Loader theLoader =
      (weka.core.converters.Loader) getWrappedAlgorithm();

    String startMessage =
      (theLoader instanceof FileSourcedConverter) ? "Loading "
        + environmentSubstitute(((FileSourcedConverter) theLoader).retrieveFile().toString())
        : "Loading...";

    getStepManager().logBasic(startMessage);
    getStepManager().statusMessage(startMessage);

    if (!m_instanceGeneration) {
      try {
        theLoader.reset();
        theLoader.setRetrieval(weka.core.converters.Loader.BATCH);
        Instances dataset = theLoader.getDataSet();
        getStepManager().logBasic("Loaded " + dataset.relationName());
        Data data = new Data();
        data.setPayloadElement(StepManager.CON_DATASET, dataset);
        getStepManager().outputData(StepManager.CON_DATASET, data);
      } catch (Exception ex) {
        throw new WekaException(ex);
      } finally {
        getStepManager().finished();
      }
    } else {
      String stm =
        getName() + "$" + hashCode() + 99 + "| overall flow throughput -|";
      m_flowThroughput =
        new StreamThroughput(stm, "Starting flow...",
          ((StepManagerImpl) getStepManager()).getLog());

      Instance nextInstance = null;
      Instances structure = null;
      Instances structureCopy = null;
      Instances currentStructure = null;
      boolean stringAttsPresent = false;

      try {
        theLoader.reset();
        theLoader.setRetrieval(weka.core.converters.Loader.INCREMENTAL);
        structure = theLoader.getStructure();
        if (structure.checkForStringAttributes()) {
          structureCopy =
            (Instances) (new SerializedObject(structure).getObject());
          stringAttsPresent = true;
        }
        currentStructure = structure;
      } catch (Exception ex) {
        throw new WekaException(ex);
      }

      if (isStopRequested()) {
        return;
      }

      try {
        nextInstance = theLoader.getNextInstance(structure);
      } catch (Exception ex) {
        // getStepManager().throughputFinished(m_instanceData);
        throw new WekaException(ex);
      }

      while (!isStopRequested() && nextInstance != null) {
        m_flowThroughput.updateStart();
        getStepManager().throughputUpdateStart();

        if (stringAttsPresent) {
          if (currentStructure == structure) {
            currentStructure = structureCopy;
          } else {
            currentStructure = structure;
          }
        }

        m_instanceData
          .setPayloadElement(StepManager.CON_INSTANCE, nextInstance);

        try {
          nextInstance = theLoader.getNextInstance(currentStructure);
        } catch (Exception ex) {
          getStepManager().throughputFinished(m_instanceData);
          throw new WekaException(ex);
        }
        getStepManager().throughputUpdateEnd(); // finished read operation
        getStepManager().outputData(StepManager.CON_INSTANCE, m_instanceData);

        m_flowThroughput.updateEnd(((StepManagerImpl) getStepManager())
          .getLog());
      }

      if (isStopRequested()) {
        ((StepManagerImpl) getStepManager()).getLog().statusMessage(
          stm + "remove");
        return;
      }
      m_flowThroughput.finished(((StepManagerImpl) getStepManager()).getLog());

      // signal end of input
      m_instanceData.clearPayload();
      getStepManager().throughputFinished(m_instanceData);
      // int flowSpeed = m_flowThroughput.getAverageInstancesPerSecond();
      // String finalMessage += ("" + flowSpeed +
      // " insts/sec (flow throughput)");
    }
  }

  /**
   * If possible, get the output structure for the named connection type as a
   * header-only set of instances. Can return null if the specified connection
   * type is not representable as Instances or cannot be determined at present.
   *
   * @param connectionName the name of the connection type to get the output
   *          structure for
   * @param env Environment to use
   * @return the output structure as a header-only Instances object
   * @throws WekaException if a problem occurs
   */
  @Override
  public Instances outputStructureForConnectionType(String connectionName,
    Environment env) throws WekaException {

    // can't reset the laoder to get the structure if we're actively
    // processing...
    if (getStepManager().isStepBusy()) {
      return null;
    }

    try {
      weka.core.converters.Loader theLoader =
        (weka.core.converters.Loader) getWrappedAlgorithm();
      theLoader.reset();
      if (theLoader instanceof EnvironmentHandler) {
        Environment toUse = env != null ? env : Environment.getSystemWide();
        ((EnvironmentHandler) theLoader).setEnvironment(toUse);
      }
      return theLoader.getStructure();
    } catch (Exception ex) {
      getStepManager().logError(ex.getMessage(), ex);
    }

    return null;
  }

  /**
   * Get a list of incoming connection types that this step can accept. Ideally
   * (and if appropriate), this should take into account the state of the step
   * and any existing incoming connections. E.g. a step might be able to accept
   * one (and only one) incoming batch data connection.
   *
   * @return a list of incoming connections that this step can accept given its
   *         current state
   */
  @Override
  public List getIncomingConnectionTypes() {
    // doesn't accept incoming connections
    return null;
  }

  /**
   * Get a list of outgoing connection types that this step can produce. Ideally
   * (and if appropriate), this should take into account the state of the step
   * and the incoming connections. E.g. depending on what incoming connection is
   * present, a step might be able to produce a trainingSet output, a testSet
   * output or neither, but not both.
   *
   * @return a list of outgoing connections that this step can produce
   */
  @Override
  public List getOutgoingConnectionTypes() {
    List outgoing = new ArrayList();
    int numDatasetOutputs =
      getStepManager().numOutgoingConnectionsOfType(StepManager.CON_DATASET);
    int numInstanceOutputs =
      getStepManager().numOutgoingConnectionsOfType(StepManager.CON_INSTANCE);

    if (numDatasetOutputs == 0 && numInstanceOutputs == 0) {
      outgoing.add(StepManager.CON_DATASET);
      outgoing.add(StepManager.CON_INSTANCE);
    } else if (numDatasetOutputs > 0) {
      outgoing.add(StepManager.CON_DATASET);
    } else if (numInstanceOutputs > 0) {
      outgoing.add(StepManager.CON_INSTANCE);
    }

    return outgoing;
  }

  /**
   * Return the fully qualified name of a custom editor component (JComponent)
   * to use for editing the properties of the step. This method can return null,
   * in which case the system will dynamically generate an editor using the
   * GenericObjectEditor
   *
   * @return the fully qualified name of a step editor component
   */
  @Override
  public String getCustomEditorForStep() {
    return "weka.gui.knowledgeflow.steps.LoaderStepEditorDialog";
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy