All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.knowledgeflow.steps.PairedDataHelper Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    PairedDataHelper.java
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.knowledgeflow.steps;

import weka.core.WekaException;
import weka.knowledgeflow.Data;
import weka.knowledgeflow.StepManager;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * 

* A helper class that Step implementations can use when processing paired data * (e.g. train and test sets). Has the concept of a primary and secondary * connection/data type, where the secondary connection/data for a given set * number typically needs to be processed using a result generated from the * corresponding primary connection/data. This class takes care of ensuring that * the secondary connection/data is only processed once the primary has * completed. Users of this helper need to provide an implementation of the * PairedProcessor inner interface, where the processPrimary() method will be * called to process the primary data/connection (and return a result), and * processSecondary() called to deal with the secondary connection/data. The * result of execution on a particular primary data set number can be retrieved * by calling the getIndexedPrimaryResult() method, passing in the set number of * the primary result to retrieve. *

* * This class also provides an arbitrary storage mechanism for additional * results beyond the primary type of result. It also takes care of invoking * processing() and finished() on the client step's StepManager.
*
* *
 *     public class MyFunkyStep extends BaseStep
 *       implements PairedDataHelper.PairedProcessor {
 *       ...
 *       protected PairedDataHelper m_helper;
 *       ...
 *       public void stepInit() {
 *         m_helper = new PairedDataHelper(this, this,
 *         StepManager.[CON_WHATEVER_YOUR_PRIMARY_CONNECTION_IS],
 *         StepManager.[CON_WHATEVER_YOUR_SECONDARY_CONNECTION_IS]);
 * 
 *         ...
 *       }
 * 
 *       public void processIncoming(Data data) throws WekaException {
 *         // delegate to our helper to handle primary/secondary synchronization
 *         // issues
 *         m_helper.process(data);
 *       }
 * 
 *       public MyFunkyMainResult processPrimary(Integer setNum, Integer maxSetNun,
 *         Data data, PairedDataHelper helper) throws WekaException {
 *           SomeDataTypeToProcess someData = data.getPrimaryPayload();
 * 
 *           MyFunkyMainResult processor = new MyFunkyMainResult();
 *           // do some processing using MyFunkyMainResult and SomeDataToProcess
 *           ...
 *           // output some data to downstream steps if necessary
 *           ...
 * 
 *           return processor;
 *       }
 * 
 *       public void processSecondary(Integer setNum, Integer maxSetNum, Data data,
 *         PairedDataHelper helper) throws WekaException {
 *         SomeDataTypeToProcess someData = data.getPrimaryPayload();
 * 
 *         // get the MyFunkyMainResult for this set number
 *         MyFunkyMainResult result = helper.getIndexedPrimaryResult(setNum);
 * 
 *         // do some stuff with the result and the secondary data
 *         ...
 *         // output some data to downstream steps if necessary
 *       }
 *     }
 * 
* * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: $ */ public class PairedDataHelper

implements java.io.Serializable { /** For serialization */ private static final long serialVersionUID = -7813465607881227514L; /** * Storage of arbitrary indexed results computed during execution of * PairedProcessor.processPrimary() */ protected Map> m_namedIndexedStore = new ConcurrentHashMap>(); /** Storage of the indexed primary result */ protected Map m_primaryResultMap = new ConcurrentHashMap(); /** * Holds the secondary data objects, if they arrive before the corresponding * primary has been computed */ protected Map m_secondaryDataMap = new ConcurrentHashMap(); /** The type of connection to route to PairedProcessor.processPrimary() */ protected String m_primaryConType; /** The type of connection to route to PairedProcessor.processSecondary() */ protected String m_secondaryConType; /** The PairedProcessor implementation that will do the actual work */ protected transient PairedProcessor m_processor; /** The step that owns this helper */ protected transient Step m_ownerStep; /** Keep track of completed primary/secondary pairs */ protected transient AtomicInteger m_setCount; /** * Constructor * * @param owner the owner step * @param processor the PairedProcessor implementation * @param primaryConType the primary connection type * @param secondaryConType the secondary connection type */ public PairedDataHelper(Step owner, PairedProcessor processor, String primaryConType, String secondaryConType) { m_primaryConType = primaryConType; m_secondaryConType = secondaryConType; m_ownerStep = owner; m_processor = processor; } /** * Initiate routing and processing for a particular data object * * @param data the data object to process * @throws WekaException if a problem occurs */ public void process(Data data) throws WekaException { if (m_ownerStep.getStepManager().isStopRequested()) { m_ownerStep.getStepManager().interrupted(); return; } String connType = data.getConnectionName(); if (connType.equals(m_primaryConType)) { processPrimary(data); } else if (m_secondaryConType != null && connType.equals(m_secondaryConType)) { processSecondary(data); } else { throw new WekaException("Illegal connection/data type: " + connType); } if (!m_ownerStep.getStepManager().isStopRequested()) { if (m_setCount != null && m_setCount.get() == 0) { m_ownerStep.getStepManager().finished(); // save memory m_primaryResultMap.clear(); m_secondaryDataMap.clear(); m_namedIndexedStore.clear(); } } else { m_ownerStep.getStepManager().interrupted(); } } /** * Handle the processing of the primary data/connection. Performs * initialization in the case of receiving the first data object in a batch. * Delegates actual processing work to the PairedProcessor. * * @param data the data to process * @throws WekaException if a problem occurs */ @SuppressWarnings("unchecked") private void processPrimary(Data data) throws WekaException { Integer setNum = data.getPayloadElement(StepManager.CON_AUX_DATA_SET_NUM, 1); Integer maxSetNum = data.getPayloadElement(StepManager.CON_AUX_DATA_MAX_SET_NUM, 1); if (m_setCount == null) { m_setCount = new AtomicInteger(maxSetNum); } if (setNum == 1) { m_ownerStep.getStepManager().processing(); m_ownerStep.getStepManager().statusMessage( "Processing set/fold " + setNum + " out of " + maxSetNum); } if (!m_ownerStep.getStepManager().isStopRequested()) { P result = (P) m_processor.processPrimary(setNum, maxSetNum, data, this); if (result != null) { m_primaryResultMap.put(setNum, result); } } else { m_ownerStep.getStepManager().interrupted(); return; } Data waitingSecondary = m_secondaryDataMap.get(setNum); if (waitingSecondary != null) { processSecondary(waitingSecondary); } else if (m_secondaryConType == null) { // no secondary connection m_setCount.decrementAndGet(); } } /** * Handle processing of the secondary data/connection. Stores the secondary if * there is no corresponding primary result generated yet. Delegates actual * processing work to the PairedProcessor * * @param data the data to process * @throws WekaException if a problem occurs */ @SuppressWarnings("unchecked") private synchronized void processSecondary(Data data) throws WekaException { Integer setNum = data.getPayloadElement(StepManager.CON_AUX_DATA_SET_NUM, 1); Integer maxSetNum = data.getPayloadElement(StepManager.CON_AUX_DATA_MAX_SET_NUM, 1); P primaryData = m_primaryResultMap.get(setNum); if (primaryData == null) { // store, ready for the arrival of the matching primary data m_secondaryDataMap.put(setNum, data); return; } if (!m_ownerStep.getStepManager().isStopRequested()) { m_processor.processSecondary(setNum, maxSetNum, data, this); } else { m_ownerStep.getStepManager().interrupted(); return; } m_setCount.decrementAndGet(); } /** * Retrieve the primary result corresponding to a given set number * * @param index the set number of the result to get * @return the primary result */ public P getIndexedPrimaryResult(int index) { return m_primaryResultMap.get(index); } /** * Reset the helper. The helper must be reset between runs if it is being * re-used (as opposed to a new helper instance being created). */ public void reset() { // dont' reset if we're still processing! if (m_setCount != null && m_setCount.get() > 0 && !m_ownerStep.getStepManager().isStopRequested()) { return; } m_setCount = null; } /** * Return true if there is no further processing to be done * * @return true if processing is done */ public boolean isFinished() { return m_setCount.get() == 0; } /** * Create a indexed store with a given name * * @param name the name of the store to create */ public void createNamedIndexedStore(String name) { m_namedIndexedStore.put(name, new ConcurrentHashMap()); } /** * Gets an indexed value from a named store * * @param storeName the name of the store to retrieve from * @param index the index of the value to get * @param the type of the value * @return the requested value or null if either the store does not exist or * the value does not exist in the store. */ @SuppressWarnings("unchecked") public T getIndexedValueFromNamedStore(String storeName, Integer index) { Map store = m_namedIndexedStore.get(storeName); if (store != null) { return (T) store.get(index); } return null; } /** * Adds a value to a named store with the given index. Creates the named store * if it doesn't already exist. * * @param storeName the name of the store to add to * @param index the index to associate with the value * @param value the value to store */ public synchronized void addIndexedValueToNamedStore(String storeName, Integer index, Object value) { Map store = m_namedIndexedStore.get(storeName); if (store == null) { createNamedIndexedStore(storeName); store = m_namedIndexedStore.get(storeName); } store.put(index, value); } /** * Interface for processors of paired data to implement. See the description * in the class documentation of PairedDataHelper. */ public interface PairedProcessor

{ P processPrimary(Integer setNum, Integer maxSetNum, Data data, PairedDataHelper

helper) throws WekaException; void processSecondary(Integer setNum, Integer maxSetNum, Data data, PairedDataHelper

helper) throws WekaException; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy