org.pentaho.di.trans.steps.mergejoin.MergeJoin (kettle-engine)
/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.mergejoin;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;
/**
* Merge rows from 2 sorted streams and output joined rows with matched key fields. Use this instead of a hash join
* if both of your input streams are too big to fit in memory. Note that both inputs must be sorted on the join key.
*
* This is a first prototype implementation that only handles two streams and inner join. It also always outputs all
* values from both streams. Ideally, we should:
* 1) Support any number of incoming streams
* 2) Allow the user to choose the join type (inner, outer) for each stream
* 3) Allow the user to choose which fields to push to the next step
* 4) Have multiple output ports: a) matched records b) unmatched records for each input port
* 5) Support incoming rows sorted in either ascending or descending order. The current implementation only
* supports ascending.
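*
* Illustrative example (not part of the original javadoc): with stream one sorted on keys
* {1, 2, 2, 4} and stream two sorted on keys {2, 3, 4}, an inner join emits the 2x1
* Cartesian product for the duplicated key 2 plus one row for key 4; keys 1 and 3
* produce no output unless the corresponding stream is marked optional (outer join).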
*
* @author Biswapesh
* @since 24-nov-2006
*/
public class MergeJoin extends BaseStep implements StepInterface {
private static Class<?> PKG = MergeJoinMeta.class; // for i18n purposes, needed by Translator2!!
private MergeJoinMeta meta;
private MergeJoinData data;
public MergeJoin( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
meta = (MergeJoinMeta) smi;
data = (MergeJoinData) sdi;
int compare;
if ( first ) {
first = false;
// Find the RowSet to read from
//
List<StreamInterface> infoStreams = meta.getStepIOMeta().getInfoStreams();
data.oneRowSet = findInputRowSet( infoStreams.get( 0 ).getStepname() );
if ( data.oneRowSet == null ) {
throw new KettleException( BaseMessages.getString(
PKG, "MergeJoin.Exception.UnableToFindSpecifiedStep", infoStreams.get( 0 ).getStepname() ) );
}
data.twoRowSet = findInputRowSet( infoStreams.get( 1 ).getStepname() );
if ( data.twoRowSet == null ) {
throw new KettleException( BaseMessages.getString(
PKG, "MergeJoin.Exception.UnableToFindSpecifiedStep", infoStreams.get( 1 ).getStepname() ) );
}
data.one = getRowFrom( data.oneRowSet );
if ( data.one != null ) {
data.oneMeta = data.oneRowSet.getRowMeta();
} else {
data.oneMeta = getTransMeta().getStepFields( infoStreams.get( 0 ).getStepname() );
}
data.two = getRowFrom( data.twoRowSet );
if ( data.two != null ) {
data.twoMeta = data.twoRowSet.getRowMeta();
} else {
data.twoMeta = getTransMeta().getStepFields( infoStreams.get( 1 ).getStepname() );
}
// Build the output row metadata once, up front: oneMeta followed by twoMeta
//
data.outputRowMeta = new RowMeta();
data.outputRowMeta.mergeRowMeta( data.oneMeta.clone() );
data.outputRowMeta.mergeRowMeta( data.twoMeta.clone() );
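// Illustrative layout (field names hypothetical): one(a, b, key) + two(key, c) produces
// output rows (a, b, key, key, c) -- stream one's fields first, then stream two's.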
if ( data.one != null ) {
// Find the key indexes:
data.keyNrs1 = new int[meta.getKeyFields1().length];
for ( int i = 0; i < data.keyNrs1.length; i++ ) {
data.keyNrs1[i] = data.oneMeta.indexOfValue( meta.getKeyFields1()[i] );
if ( data.keyNrs1[i] < 0 ) {
String message =
BaseMessages.getString( PKG, "MergeJoin.Exception.UnableToFindFieldInReferenceStream", meta
.getKeyFields1()[i] );
logError( message );
throw new KettleStepException( message );
}
}
}
if ( data.two != null ) {
// Find the key indexes:
data.keyNrs2 = new int[meta.getKeyFields2().length];
for ( int i = 0; i < data.keyNrs2.length; i++ ) {
data.keyNrs2[i] = data.twoMeta.indexOfValue( meta.getKeyFields2()[i] );
if ( data.keyNrs2[i] < 0 ) {
String message =
BaseMessages.getString( PKG, "MergeJoin.Exception.UnableToFindFieldInReferenceStream", meta
.getKeyFields2()[i] );
logError( message );
throw new KettleStepException( message );
}
}
}
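// Example (hypothetical names): with getKeyFields1() == { "id" } and oneMeta fields
// (name, id), keyNrs1 resolves to { 1 } -- the column positions used in the key
// comparisons below.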
// Allocate a dummy row the width of the full output (oneMeta + twoMeta); values default to null
data.one_dummy = RowDataUtil.allocateRowData( data.oneMeta.size() + data.twoMeta.size() );
// Allocate a dummy row the width of stream two; values default to null
data.two_dummy = new Object[data.twoMeta.size()];
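// Note (added for clarity): these all-null dummy rows pad the output further down, beyond
// this excerpt, when a row from one stream has no match and the other side of the join is
// optional (left/right/full outer).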
}
if ( log.isRowLevel() ) {
logRowlevel( BaseMessages.getString( PKG, "MergeJoin.Log.DataInfo", data.oneMeta.getString( data.one ) + "" )
+ data.twoMeta.getString( data.two ) );
}
/*
* We can stop processing if any of the following is true:
* a) Both streams are empty
* b) First stream is empty and join type is INNER or LEFT OUTER
* c) Second stream is empty and join type is INNER or RIGHT OUTER
*/
if ( ( data.one == null && data.two == null )
|| ( data.one == null && !data.one_optional ) || ( data.two == null && !data.two_optional ) ) {
// Before we stop processing, we have to make sure that all rows from both input streams are depleted!
// If we don't do this, the transformation can stall.
//
while ( data.one != null && !isStopped() ) {
data.one = getRowFrom( data.oneRowSet );
}
while ( data.two != null && !isStopped() ) {
data.two = getRowFrom( data.twoRowSet );
}
setOutputDone();
return false;
}
if ( data.one == null ) {
compare = -1;
} else {
if ( data.two == null ) {
compare = 1;
} else {
int cmp = data.oneMeta.compare( data.one, data.twoMeta, data.two, data.keyNrs1, data.keyNrs2 );
compare = cmp > 0 ? 1 : ( cmp < 0 ? -1 : 0 );
}
}
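// Sign convention, readable from the assignments above: -1 means stream one is exhausted
// or its key sorts first; 1 means stream two is exhausted or its key sorts first; 0 is a
// key match.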
switch ( compare ) {
case 0:
/*
* We've got a match. This is what we do next (to handle duplicate keys correctly):
* read the next record from both streams.
* If any of the keys match, we have duplicates. We therefore:
* - create an array of all rows that have the same keys,
* - push the Cartesian product of the two arrays to output.
* Else:
* - just push the combined row to output.
*/
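// Illustrative (hypothetical data): two rows with key K in stream one and three rows with
// key K in stream two yield 2 x 3 = 6 joined rows from the Cartesian product.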
data.one_next = getRowFrom( data.oneRowSet );
data.two_next = getRowFrom( data.twoRowSet );
int compare1 =
( data.one_next == null ) ? -1 : data.oneMeta.compare(
data.one, data.one_next, data.keyNrs1, data.keyNrs1 );
int compare2 =
( data.two_next == null ) ? -1 : data.twoMeta.compare(
data.two, data.two_next, data.keyNrs2, data.keyNrs2 );
if ( compare1 == 0 || compare2 == 0 ) { // Duplicate keys
if ( data.ones == null ) {
data.ones = new ArrayList<Object[]>();