All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pentaho.di.trans.steps.multimerge.MultiMergeJoin Maven / Gradle / Ivy

The newest version!
/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.multimerge;

import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;

import org.pentaho.di.core.RowSet;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransHopMeta;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepIOMetaInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;

/**
 * Merge rows from 2 sorted streams and output joined rows with matched key fields. Use this instead of hash join if
 * both your input streams are too big to fit in memory. Note that both the inputs must be sorted on the join key.
 *
 * This is a first prototype implementation that only handles two streams and inner join. It also always outputs all
 * values from both streams. Ideally, we should: 1) Support any number of incoming streams 2) Allow user to choose the
 * join type (inner, outer) for each stream 3) Allow user to choose which fields to push to next step 4) Have multiple
 * output ports as follows: a) Containing matched records b) Unmatched records for each input port 5) Support incoming
 * rows to be sorted either on ascending or descending order. The current implementation only supports ascending order.
 *
 * @author Biswapesh
 * @since 24-nov-2006
 */

public class MultiMergeJoin extends BaseStep implements StepInterface {
  // Class handed to BaseMessages.getString(...) to resolve this step's message keys.
  private static Class<?> PKG = MultiMergeJoinMeta.class; // for i18n purposes, needed by Translator2!!

  // Step metadata; re-assigned from the arguments at the start of processRow() and init().
  private MultiMergeJoinMeta meta;
  // Step runtime state (row sets, queue, buffered results); re-assigned alongside meta.
  private MultiMergeJoinData data;

  /**
   * Creates the step instance. All arguments are forwarded unchanged to the {@link BaseStep} constructor; no
   * additional state is set up here.
   *
   * @param stepMeta
   *          forwarded to the superclass constructor
   * @param stepDataInterface
   *          forwarded to the superclass constructor
   * @param copyNr
   *          forwarded to the superclass constructor
   * @param transMeta
   *          forwarded to the superclass constructor
   * @param trans
   *          forwarded to the superclass constructor
   */
  public MultiMergeJoin( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  private boolean processFirstRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    TransMeta transMeta = getTransMeta();
    TransHopMeta transHopMeta;

    StepIOMetaInterface stepIOMeta = meta.getStepIOMeta();
    List infoStreams = stepIOMeta.getInfoStreams();
    StreamInterface stream;
    StepMeta toStepMeta = meta.getParentStepMeta();
    StepMeta fromStepMeta;

    ArrayList inputStepNameList = new ArrayList();
    String[] inputStepNames = meta.getInputSteps();
    String inputStepName;

    for ( int i = 0; i < infoStreams.size(); i++ ) {
      inputStepName = inputStepNames[i];
      stream = infoStreams.get( i );
      fromStepMeta = stream.getStepMeta();
      if ( fromStepMeta == null ) {
        //should not arrive here, shoud typically have been caught by init.
        throw new KettleException(
          BaseMessages.getString( PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepName ) );
      }
      //check the hop
      transHopMeta = transMeta.findTransHop( fromStepMeta,  toStepMeta, true );
      //there is no hop: this is unexpected.
      if ( transHopMeta == null ) {
        //should not arrive here, shoud typically have been caught by init.
        throw new KettleException(
          BaseMessages.getString( PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepName ) );
      } else if ( transHopMeta.isEnabled() ) {
        inputStepNameList.add( inputStepName );
      } else {
        logDetailed( BaseMessages.getString( PKG, "MultiMergeJoin.Log.IgnoringStep", inputStepName ) );
      }
    }

    int streamSize = inputStepNameList.size();
    if ( streamSize == 0 ) {
      return false;
    }

    String keyField;
    String[] keyFields;

    data.rowSets = new RowSet[streamSize];
    RowSet rowSet;
    Object[] row;
    data.rows = new Object[streamSize][];
    data.metas = new RowMetaInterface[streamSize];
    data.rowLengths = new int[streamSize];
    MultiMergeJoinData.QueueComparator comparator = new MultiMergeJoinData.QueueComparator( data );
    data.queue = new PriorityQueue( streamSize, comparator );
    data.results = new ArrayList>( streamSize );
    MultiMergeJoinData.QueueEntry queueEntry;
    data.queueEntries = new MultiMergeJoinData.QueueEntry[streamSize];
    data.drainIndices = new int[streamSize];
    data.keyNrs = new int[streamSize][];
    data.dummy = new Object[streamSize][];

    RowMetaInterface rowMeta;
    data.outputRowMeta = new RowMeta();
    for ( int i = 0, j = 0; i < inputStepNames.length; i++ ) {
      inputStepName = inputStepNames[i];
      if ( !inputStepNameList.contains( inputStepName ) ) {
        //ignore step with disabled hop.
        continue;
      }

      queueEntry = new MultiMergeJoinData.QueueEntry();
      queueEntry.index = j;
      data.queueEntries[j] = queueEntry;

      data.results.add( new ArrayList() );

      rowSet = findInputRowSet( inputStepName );
      if ( rowSet == null ) {
        throw new KettleException( BaseMessages.getString(
          PKG, "MultiMergeJoin.Exception.UnableToFindSpecifiedStep", inputStepName ) );
      }
      data.rowSets[j] = rowSet;

      row = getRowFrom( rowSet );
      data.rows[j] = row;
      if ( row == null ) {
        rowMeta = getTransMeta().getStepFields( inputStepName );
        data.metas[j] = rowMeta;
      } else {
        queueEntry.row = row;
        rowMeta = rowSet.getRowMeta();

        keyField = meta.getKeyFields()[i];
        String[] keyFieldParts = keyField.split( "," );
        String keyFieldPart;
        data.keyNrs[j] = new int[keyFieldParts.length];
        for ( int k = 0; k < keyFieldParts.length; k++ ) {
          keyFieldPart = keyFieldParts[k];
          data.keyNrs[j][k] = rowMeta.indexOfValue( keyFieldPart );
          if ( data.keyNrs[j][k] < 0 ) {
            String message =
              BaseMessages.getString( PKG, "MultiMergeJoin.Exception.UnableToFindFieldInReferenceStream", keyFieldPart, inputStepName );
            logError( message );
            throw new KettleStepException( message );
          }
        }
        data.metas[j] = rowMeta;
        data.queue.add( data.queueEntries[j] );
      }
      data.outputRowMeta.mergeRowMeta( rowMeta.clone() );
      data.rowLengths[j] = rowMeta.size();
      data.dummy[j] = RowDataUtil.allocateRowData( rowMeta.size() );
      j++;
    }
    return true;
  }

  /**
   * Performs one merge iteration: takes the entry with the smallest key from the queue together with all queued
   * entries that compare equal to it, and either emits the combinations of the buffered rows or skips the
   * non-matching keys, depending on {@code data.optional}.
   *
   * @param smi
   *          step metadata, cast to {@link MultiMergeJoinMeta}
   * @param sdi
   *          step data, cast to {@link MultiMergeJoinData}
   * @return true when the method should be called again, false when output is done or the step was stopped
   * @throws KettleException
   *           propagated from first-row initialisation, row reading, key comparison or row output
   */
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    // One-time setup; when no usable input stream exists the step finishes immediately.
    if ( first ) {
      if ( !processFirstRow( smi, sdi ) ) {
        setOutputDone();
        return false;
      }
      first = false;
    }

    // Row-level logging: concatenates the string form of the current row of every stream.
    if ( log.isRowLevel() ) {
      String metaString =
        BaseMessages
          .getString( PKG, "MultiMergeJoin.Log.DataInfo", data.metas[0].getString( data.rows[0] ) + "" );
      for ( int i = 1; i < data.metas.length; i++ ) {
        metaString += data.metas[i].getString( data.rows[i] );
      }
      logRowlevel( metaString );
    }

    /*
     * We can stop processing if any of the following is true: a) All streams are empty b) Any stream is empty and join
     * type is INNER
     */
    int streamSize = data.metas.length;
    if ( data.optional ) {
      // Optional mode (flag set in init() from MultiMergeJoinMeta.optionals): finish only when every stream is
      // exhausted, i.e. when nothing is queued any more.
      if ( data.queue.isEmpty() ) {
        setOutputDone();
        return false;
      }
      // Take the smallest entry and every other queued entry that compares equal to it; drainIndices records
      // which streams take part in this key group.
      MultiMergeJoinData.QueueEntry minEntry = data.queue.poll();
      int drainSize = 1;
      data.rows[minEntry.index] = minEntry.row;
      data.drainIndices[0] = minEntry.index;
      MultiMergeJoinData.QueueComparator comparator = (MultiMergeJoinData.QueueComparator) data.queue.comparator();
      while ( !data.queue.isEmpty() && comparator.compare( data.queue.peek(), minEntry ) == 0 ) {
        MultiMergeJoinData.QueueEntry entry = data.queue.poll();
        data.rows[entry.index] = entry.row;
        data.drainIndices[drainSize++] = entry.index;
      }
      int index;
      Object[] row = null;
      // rows from nonempty input streams match: get all equal rows and create result set
      for ( int i = 0; i < drainSize; i++ ) {
        index = data.drainIndices[i];
        data.results.get( index ).add( data.rows[index] );
        // Keep reading this stream while the key stays the same; the loop ends on end of stream (row == null),
        // on the first row with a different key, or when the step is stopped.
        while ( !isStopped()
          && ( ( row = getRowFrom( data.rowSets[index] ) ) != null && data.metas[index].compare(
            data.rows[index], row, data.keyNrs[index] ) == 0 ) ) {
          data.results.get( index ).add( row );
        }
        if ( isStopped() ) {
          return false;
        }
        // A row with a different key was read: queue it so it takes part in a later iteration.
        if ( row != null ) {
          data.queueEntries[index].row = row;
          data.queue.add( data.queueEntries[index] );
        }
      }
      // From here on drainIndices is reused as the per-stream cursor into results. Streams without a row for
      // this key contribute their placeholder row.
      for ( int i = 0; i < streamSize; i++ ) {
        data.drainIndices[i] = 0;
        if ( data.results.get( i ).isEmpty() ) {
          data.results.get( i ).add( data.dummy[i] );
        }
      }

      int current = 0;

      // Emit every combination of the buffered rows, one row per stream, advancing the cursors like an odometer.
      while ( true ) {
        for ( int i = 0; i < streamSize; i++ ) {
          data.rows[i] = data.results.get( i ).get( data.drainIndices[i] );
        }
        row = RowDataUtil.createResizedCopy( data.rows, data.rowLengths );

        putRow( data.outputRowMeta, row );

        // Advance the lowest cursor; on wrap-around reset it and carry into the next stream.
        while ( ++data.drainIndices[current] >= data.results.get( current ).size() ) {
          data.drainIndices[current] = 0;
          if ( ++current >= streamSize ) {
            break;
          }
        }
        // Every cursor wrapped around: all combinations have been emitted.
        if ( current >= streamSize ) {
          break;
        }
        current = 0;
      }
      // Reset the buffers for the next key group.
      for ( int i = 0; i < streamSize; i++ ) {
        data.results.get( i ).clear();
      }
    } else {
      // Non-optional mode: every stream must have a row queued. Otherwise no further match is possible, so the
      // remaining input is read and discarded and the step finishes.
      if ( data.queue.size() < streamSize ) {
        data.queue.clear();
        for ( int i = 0; i < streamSize; i++ ) {
          while ( data.rows[i] != null && !isStopped() ) {
            data.rows[i] = getRowFrom( data.rowSets[i] );
          }
        }
        setOutputDone();
        return false;
      }

      // Take the smallest entry and every other queued entry that compares equal to it.
      MultiMergeJoinData.QueueEntry minEntry = data.queue.poll();
      int drainSize = 1;
      data.rows[minEntry.index] = minEntry.row;
      data.drainIndices[0] = minEntry.index;
      MultiMergeJoinData.QueueComparator comparator = (MultiMergeJoinData.QueueComparator) data.queue.comparator();
      while ( !data.queue.isEmpty() && comparator.compare( data.queue.peek(), minEntry ) == 0 ) {
        MultiMergeJoinData.QueueEntry entry = data.queue.poll();
        data.rows[entry.index] = entry.row;
        data.drainIndices[drainSize++] = entry.index;
      }
      Object[] row = null;
      if ( data.queue.isEmpty() ) {
        // rows from all input streams match: get all equal rows and create result set
        for ( int i = 0; i < streamSize; i++ ) {
          data.results.get( i ).add( data.rows[i] );
          // Buffer all following rows of this stream that carry the same key.
          while ( !isStopped()
            && ( ( row = getRowFrom( data.rowSets[i] ) ) != null && data.metas[i].compare(
              data.rows[i], row, data.keyNrs[i] ) == 0 ) ) {
            data.results.get( i ).add( row );
          }
          if ( isStopped() ) {
            return false;
          }
          // The first row with a different key is queued for a later iteration.
          if ( row != null ) {
            data.queueEntries[i].row = row;
            data.queue.add( data.queueEntries[i] );
          }
        }
        // drainIndices is reused as the per-stream cursor into results.
        for ( int i = 0; i < streamSize; i++ ) {
          data.drainIndices[i] = 0;
        }

        int current = 0;
        // Emit every combination of the buffered rows, advancing the cursors like an odometer.
        while ( true ) {
          for ( int i = 0; i < streamSize; i++ ) {
            data.rows[i] = data.results.get( i ).get( data.drainIndices[i] );
          }
          row = RowDataUtil.createResizedCopy( data.rows, data.rowLengths );

          putRow( data.outputRowMeta, row );
          while ( ++data.drainIndices[current] >= data.results.get( current ).size() ) {
            data.drainIndices[current] = 0;
            if ( ++current >= streamSize ) {
              break;
            }
          }
          if ( current >= streamSize ) {
            break;
          }
          current = 0;
        }
        // Reset the buffers for the next key group.
        for ( int i = 0; i < streamSize; i++ ) {
          data.results.get( i ).clear();
        }
      } else {
        // mismatch found and no results can be generated

        // Skip the rows carrying the smallest key on the streams that had it and queue their next differing row.
        for ( int i = 0; i < drainSize; i++ ) {
          int index = data.drainIndices[i];
          while ( ( row = getRowFrom( data.rowSets[index] ) ) != null
            && data.metas[index].compare( data.rows[index], row, data.keyNrs[index] ) == 0 ) {
            if ( isStopped() ) {
              break;
            }
          }
          // When a stream ends here nothing is re-queued for it; the size check at the top of this branch then
          // ends the step on the next call.
          if ( isStopped() || row == null ) {
            break;
          }
          data.queueEntries[index].row = row;
          data.queue.add( data.queueEntries[index] );
        }
        if ( isStopped() ) {
          return false;
        }
      }
    }
    // Periodic progress message, driven by the number of lines read.
    if ( checkFeedback( getLinesRead() ) ) {
      logBasic( BaseMessages.getString( PKG, "MultiMergeJoin.LineNumber" ) + getLinesRead() );
    }
    return true;
  }

  /**
   * Initialises the step: runs the base initialisation, verifies that every info stream refers to a step and
   * translates the configured join type into {@code data.optional}.
   *
   * @param smi
   *          step metadata, cast to {@link MultiMergeJoinMeta}
   * @param sdi
   *          step data, cast to {@link MultiMergeJoinData}
   * @return true when initialisation succeeded; false when the base initialisation failed, an info stream has no
   *         step, or the join type is not one of {@code MultiMergeJoinMeta.join_types}
   *
   * @see StepInterface#init(org.pentaho.di.trans.step.StepMetaInterface , org.pentaho.di.trans.step.StepDataInterface)
   */
  @Override
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    if ( !super.init( smi, sdi ) ) {
      // The base initialisation failed: report the failure instead of claiming success.
      return false;
    }

    List<StreamInterface> infoStreams = meta.getStepIOMeta().getInfoStreams();
    String[] inputStepNames = meta.getInputSteps();
    for ( int i = 0; i < infoStreams.size(); i++ ) {
      if ( infoStreams.get( i ).getStepMeta() == null ) {
        logError(
          BaseMessages.getString( PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepNames[i] ) );
        return false;
      }
    }

    String joinType = meta.getJoinType();
    for ( int i = 0; i < MultiMergeJoinMeta.join_types.length; ++i ) {
      // Called on the constant so that a missing (null) join type falls through to the error below.
      if ( MultiMergeJoinMeta.join_types[i].equalsIgnoreCase( joinType ) ) {
        data.optional = MultiMergeJoinMeta.optionals[i];
        return true;
      }
    }
    logError( BaseMessages.getString( PKG, "MultiMergeJoin.Log.InvalidJoinType", joinType ) );
    return false;
  }

  /**
   * Checks whether incoming rows are join compatible. This essentially means that the keys being compared should be of
   * the same datatype and both rows should have the same number of keys specified
   *
   * @param row1
   *          Reference row
   * @param row2
   *          Row to compare to
   *
   * @return true when templates are compatible.
   */
  protected boolean isInputLayoutValid( RowMetaInterface[] rows ) {
    if ( rows != null ) {
      // Compare the key types
      String[] keyFields = meta.getKeyFields();
      /*
       * int nrKeyFields = keyFields.length;
       *
       * for (int i=0;i keyList = new ArrayList();
      for ( int i = 0; i < keyFields.length; i++ ) {
        String[] keys = keyFields[i].split( "," );
        keyList.add( keys );
        int count = keys.length;
        if ( i != 0 && prevCount != count ) {
          logError( "Number of keys do not match " );
          return false;
        } else {
          prevCount = count;
        }
      }

      // check:3 compare the key types
      for ( int i = 0; i < prevCount; i++ ) {
        ValueMetaInterface preValue = null;
        for ( int j = 0; j < rows.length; j++ ) {
          ValueMetaInterface v = rows[j].searchValueMeta( keyList.get( j )[i] );
          if ( v == null ) {
            return false;
          }
          if ( j != 0 && v.getType() != preValue.getType() ) {
            logError( "key data type do not match " );
            return false;
          } else {
            preValue = v;
          }
        }
      }
    }
    // we got here, all seems to be ok.
    return true;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy