/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.groupby;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.math.stat.descriptive.rank.Percentile;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettlePluginException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueDataUtil;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaBase;
import org.pentaho.di.core.row.value.ValueMetaFactory;
import org.pentaho.di.core.row.value.ValueMetaInteger;
import org.pentaho.di.core.row.value.ValueMetaNone;
import org.pentaho.di.core.row.value.ValueMetaNumber;
import org.pentaho.di.core.row.value.ValueMetaString;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
 * Groups information based on aggregation rules (sum, count, ...).
*
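 * <p>Note: each row is compared only against the immediately preceding row (see sameGroup), so the
 * input must be sorted on the group fields for the aggregation to be correct.
 *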
* @author Matt
* @since 2-jun-2003
*/
public class GroupBy extends BaseStep implements StepInterface {
  private static Class<?> PKG = GroupByMeta.class; // for i18n purposes, needed by Translator2!!
private GroupByMeta meta;
private GroupByData data;
private boolean allNullsAreZero = false;
private boolean minNullIsValued = false;
public GroupBy( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
meta = (GroupByMeta) getStepMeta().getStepMetaInterface();
data = (GroupByData) stepDataInterface;
}
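  /**
   * Processes one incoming row: on the first row the output layout and all field indexes are
   * resolved; consecutive rows with the same group key are folded into the running aggregate;
   * a change of group key (or the end of input) flushes the finished group downstream.
   */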
@Override
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
meta = (GroupByMeta) smi;
data = (GroupByData) sdi;
Object[] r = getRow(); // get row!
if ( first ) {
String val = getVariable( Const.KETTLE_AGGREGATION_ALL_NULLS_ARE_ZERO, "N" );
allNullsAreZero = ValueMetaBase.convertStringToBoolean( val );
val = getVariable( Const.KETTLE_AGGREGATION_MIN_NULL_IS_VALUED, "N" );
minNullIsValued = ValueMetaBase.convertStringToBoolean( val );
// What is the output looking like?
//
data.inputRowMeta = getInputRowMeta();
      // In case we have 0 input rows, we still want to send out a single row aggregate.
      // However, the problem then is that we can't learn the layout from a row received from the previous step
      // over the row set, so we need to calculate it from the step metadata...
//
if ( data.inputRowMeta == null ) {
data.inputRowMeta = getTransMeta().getPrevStepFields( getStepMeta() );
}
data.outputRowMeta = data.inputRowMeta.clone();
meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );
// Do all the work we can beforehand
      // Calculate indexes, look up fields, etc.
//
data.counts = new long[ meta.getSubjectField().length ];
data.subjectnrs = new int[ meta.getSubjectField().length ];
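      // subjectnrs[ i ] holds the input index of the field to aggregate; counts[ i ] tracks the
      // number of non-null values seen, used by the average and standard deviation aggregates.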
data.cumulativeSumSourceIndexes = new ArrayList<>();
data.cumulativeSumTargetIndexes = new ArrayList<>();
data.cumulativeAvgSourceIndexes = new ArrayList<>();
data.cumulativeAvgTargetIndexes = new ArrayList<>();
for ( int i = 0; i < meta.getSubjectField().length; i++ ) {
if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_COUNT_ANY ) {
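          // COUNT ANY counts every row regardless of content, so no subject field lookup is needed.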
data.subjectnrs[ i ] = 0;
} else {
data.subjectnrs[ i ] = data.inputRowMeta.indexOfValue( meta.getSubjectField()[i] );
}
if ( ( r != null ) && ( data.subjectnrs[ i ] < 0 ) ) {
logError( BaseMessages.getString( PKG, "GroupBy.Log.AggregateSubjectFieldCouldNotFound",
meta.getSubjectField()[ i ] ) );
setErrors( 1 );
stopAll();
return false;
}
if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_CUMULATIVE_SUM ) {
data.cumulativeSumSourceIndexes.add( data.subjectnrs[ i ] );
// The position of the target in the output row is the input row size + i
//
data.cumulativeSumTargetIndexes.add( data.inputRowMeta.size() + i );
}
if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_CUMULATIVE_AVERAGE ) {
data.cumulativeAvgSourceIndexes.add( data.subjectnrs[ i ] );
// The position of the target in the output row is the input row size + i
//
data.cumulativeAvgTargetIndexes.add( data.inputRowMeta.size() + i );
}
}
data.previousSums = new Object[ data.cumulativeSumTargetIndexes.size() ];
data.previousAvgSum = new Object[ data.cumulativeAvgTargetIndexes.size() ];
data.previousAvgCount = new long[ data.cumulativeAvgTargetIndexes.size() ];
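      // Running state for the cumulative aggregates: previousSums carries the cumulative total per
      // target field, previousAvgSum / previousAvgCount carry the numerator and the non-null count.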
data.groupnrs = new int[ meta.getGroupField().length ];
for ( int i = 0; i < meta.getGroupField().length; i++ ) {
data.groupnrs[ i ] = data.inputRowMeta.indexOfValue( meta.getGroupField()[i] );
if ( ( r != null ) && ( data.groupnrs[ i ] < 0 ) ) {
logError( BaseMessages.getString( PKG, "GroupBy.Log.GroupFieldCouldNotFound", meta.getGroupField()[ i ] ) );
setErrors( 1 );
stopAll();
return false;
}
}
      // Create metadata for the internal counter (Integer) and sum (Number) values
      //
data.valueMetaInteger = new ValueMetaInteger( "count" );
data.valueMetaNumber = new ValueMetaNumber( "sum" );
// Initialize the group metadata
//
initGroupMeta( data.inputRowMeta );
}
if ( first || data.newBatch ) {
// Create a new group aggregate (init)
//
newAggregate( r );
}
if ( first ) {
// for speed: groupMeta+aggMeta
//
data.groupAggMeta = new RowMeta();
data.groupAggMeta.addRowMeta( data.groupMeta );
data.groupAggMeta.addRowMeta( data.aggMeta );
}
if ( r == null ) { // no more input to be expected... (or none received in the first place)
handleLastOfGroup();
setOutputDone();
return false;
}
if ( first || data.newBatch ) {
first = false;
data.newBatch = false;
data.previous = data.inputRowMeta.cloneRow( r ); // copy the row to previous
} else {
calcAggregate( data.previous );
if ( meta.passAllRows() ) {
addToBuffer( data.previous );
}
}
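    // When the group key changes, flush the previous group: either the buffered rows with the
    // aggregate results appended (pass-all-rows mode) or a single group + aggregate row.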
if ( !sameGroup( data.previous, r ) ) {
if ( meta.passAllRows() ) {
// Not the same group: close output (if any)
closeOutput();
// Get all rows from the buffer!
data.groupResult = getAggregateResult();
Object[] row = getRowFromBuffer();
long lineNr = 0;
while ( row != null ) {
int size = data.inputRowMeta.size();
row = RowDataUtil.addRowData( row, size, data.groupResult );
size += data.groupResult.length;
lineNr++;
if ( meta.isAddingLineNrInGroup() && !Utils.isEmpty( meta.getLineNrInGroupField() ) ) {
            Object lineNrValue = Long.valueOf( lineNr );
row = RowDataUtil.addValueData( row, size, lineNrValue );
size++;
}
addCumulativeSums( row );
addCumulativeAverages( row );
putRow( data.outputRowMeta, row );
row = getRowFromBuffer();
}
closeInput();
} else {
Object[] result = buildResult( data.previous );
if ( result != null ) {
putRow( data.groupAggMeta, result ); // copy row to possible alternate rowset(s).
}
}
newAggregate( r ); // Create a new group aggregate (init)
}
data.previous = data.inputRowMeta.cloneRow( r );
if ( checkFeedback( getLinesRead() ) ) {
if ( log.isBasic() ) {
logBasic( BaseMessages.getString( PKG, "GroupBy.LineNumber" ) + getLinesRead() );
}
}
return true;
}
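  /**
   * Flushes the final group when the input is exhausted: in pass-all-rows mode the buffered rows are
   * drained with the aggregate results appended, otherwise a single group + aggregate row is emitted.
   */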
private void handleLastOfGroup() throws KettleException {
if ( meta.passAllRows() ) {
// ALL ROWS
if ( data.previous != null ) {
calcAggregate( data.previous );
addToBuffer( data.previous );
}
data.groupResult = getAggregateResult();
Object[] row = getRowFromBuffer();
long lineNr = 0;
while ( row != null ) {
int size = data.inputRowMeta.size();
row = RowDataUtil.addRowData( row, size, data.groupResult );
size += data.groupResult.length;
lineNr++;
if ( meta.isAddingLineNrInGroup() && !Utils.isEmpty( meta.getLineNrInGroupField() ) ) {
          Object lineNrValue = Long.valueOf( lineNr );
row = RowDataUtil.addValueData( row, size, lineNrValue );
size++;
}
addCumulativeSums( row );
addCumulativeAverages( row );
putRow( data.outputRowMeta, row );
row = getRowFromBuffer();
}
closeInput();
} else {
// JUST THE GROUP + AGGREGATE
// Don't forget the last set of rows...
if ( data.previous != null ) {
calcAggregate( data.previous );
}
Object[] result = buildResult( data.previous );
if ( result != null ) {
putRow( data.groupAggMeta, result );
}
}
}
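  /**
   * Writes the running cumulative sum into the row for every cumulative-sum aggregate, carrying the
   * previous total forward when the source value is null.
   */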
private void addCumulativeSums( Object[] row ) throws KettleValueException {
    // Adjust this row with the running cumulative sums.
//
for ( int i = 0; i < data.cumulativeSumSourceIndexes.size(); i++ ) {
int sourceIndex = data.cumulativeSumSourceIndexes.get( i );
Object previousTarget = data.previousSums[ i ];
Object sourceValue = row[ sourceIndex ];
int targetIndex = data.cumulativeSumTargetIndexes.get( i );
ValueMetaInterface sourceMeta = data.inputRowMeta.getValueMeta( sourceIndex );
ValueMetaInterface targetMeta = data.outputRowMeta.getValueMeta( targetIndex );
      // If the first values were null, or this is the first time around, just take the source value...
//
if ( targetMeta.isNull( previousTarget ) ) {
row[ targetIndex ] = sourceMeta.convertToNormalStorageType( sourceValue );
} else {
// If the source value is null, just take the previous target value
//
if ( sourceMeta.isNull( sourceValue ) ) {
row[ targetIndex ] = previousTarget;
} else {
row[ targetIndex ] = ValueDataUtil.plus( targetMeta, data.previousSums[ i ], sourceMeta, row[ sourceIndex ] );
}
}
data.previousSums[ i ] = row[ targetIndex ];
}
}
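  /**
   * Writes the running cumulative average into the row: the numerator (previousAvgSum) and the
   * non-null row count (previousAvgCount) are maintained separately and divided for each row.
   */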
private void addCumulativeAverages( Object[] row ) throws KettleValueException {
    // Adjust this row with the running cumulative averages.
//
for ( int i = 0; i < data.cumulativeAvgSourceIndexes.size(); i++ ) {
int sourceIndex = data.cumulativeAvgSourceIndexes.get( i );
Object previousTarget = data.previousAvgSum[ i ];
Object sourceValue = row[ sourceIndex ];
int targetIndex = data.cumulativeAvgTargetIndexes.get( i );
ValueMetaInterface sourceMeta = data.inputRowMeta.getValueMeta( sourceIndex );
ValueMetaInterface targetMeta = data.outputRowMeta.getValueMeta( targetIndex );
      // If the first values were null, or this is the first time around, just take the source value...
//
Object sum = null;
if ( targetMeta.isNull( previousTarget ) ) {
sum = sourceMeta.convertToNormalStorageType( sourceValue );
} else {
// If the source value is null, just take the previous target value
//
if ( sourceMeta.isNull( sourceValue ) ) {
sum = previousTarget;
} else {
if ( sourceMeta.isInteger() ) {
sum = ValueDataUtil.plus( data.valueMetaInteger, data.previousAvgSum[ i ], sourceMeta, row[ sourceIndex ] );
} else {
sum = ValueDataUtil.plus( targetMeta, data.previousAvgSum[ i ], sourceMeta, row[ sourceIndex ] );
}
}
}
data.previousAvgSum[ i ] = sum;
if ( !sourceMeta.isNull( sourceValue ) ) {
data.previousAvgCount[ i ]++;
}
      if ( sourceMeta.isInteger() ) {
        // Integer inputs still yield a fractional average, so the result becomes a Number (Double)
        //
        if ( sum == null ) {
          row[ targetIndex ] = null;
        } else {
          row[ targetIndex ] = Double.valueOf( ( (Long) sum ).doubleValue() / data.previousAvgCount[ i ] );
        }
} else {
row[ targetIndex ] = ValueDataUtil.divide( targetMeta, sum, data.valueMetaInteger, data.previousAvgCount[ i ] );
}
}
}
  /**
   * @return true when row r belongs to the same group as the previous row, comparing only the group fields
   */
boolean sameGroup( Object[] previous, Object[] r ) throws KettleValueException {
return data.inputRowMeta.compare( previous, r, data.groupnrs ) == 0;
}
  /**
   * Package-visible for the JUnit tests in GroupByAggregationNullsTest.
   *
   * @param row the input row whose subject values are folded into the running aggregates
   * @throws KettleValueException if a value cannot be converted or combined
   */
@SuppressWarnings( "unchecked" ) void calcAggregate( Object[] row ) throws KettleValueException {
for ( int i = 0; i < data.subjectnrs.length; i++ ) {
Object subj = row[ data.subjectnrs[ i ] ];
ValueMetaInterface subjMeta = data.inputRowMeta.getValueMeta( data.subjectnrs[ i ] );
Object value = data.agg[ i ];
ValueMetaInterface valueMeta = data.aggMeta.getValueMeta( i );
switch ( meta.getAggregateType()[ i ] ) {
case GroupByMeta.TYPE_GROUP_SUM:
data.agg[ i ] = ValueDataUtil.sum( valueMeta, value, subjMeta, subj );
break;
case GroupByMeta.TYPE_GROUP_AVERAGE:
if ( !subjMeta.isNull( subj ) ) {
data.agg[ i ] = ValueDataUtil.sum( valueMeta, value, subjMeta, subj );
data.counts[ i ]++;
}
break;
case GroupByMeta.TYPE_GROUP_MEDIAN:
case GroupByMeta.TYPE_GROUP_PERCENTILE:
case GroupByMeta.TYPE_GROUP_PERCENTILE_NEAREST_RANK:
if ( !subjMeta.isNull( subj ) ) {
            ( (List<Double>) data.agg[ i ] ).add( subjMeta.getNumber( subj ) );
}
break;
case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION_SAMPLE:
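          // Incremental (Welford-style) update of the running mean and the sum of squared deltas M2:
          //   mean_n = mean_{n-1} + ( x - mean_{n-1} ) / n
          //   M2_n   = M2_{n-1} + ( x - mean_{n-1} ) * ( x - mean_n )
          // The variance (M2 / n, or M2 / ( n - 1 ) for the sample variant) is derived from these
          // values when the group is closed.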
if ( !subjMeta.isNull( subj ) ) {
data.counts[ i ]++;
double n = data.counts[ i ];
double x = subjMeta.getNumber( subj );
            // For standard deviation, a null running sum counts as exactly 0
            double sum = value == null ? 0.0 : (Double) value;
double mean = data.mean[ i ];
double delta = x - mean;
mean = mean + ( delta / n );
sum = sum + delta * ( x - mean );
data.mean[ i ] = mean;
data.agg[ i ] = sum;
}
break;
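        // COUNT DISTINCT: collect each unique non-null value per subject field in a lazily
        // allocated sorted set.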
case GroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
if ( !subjMeta.isNull( subj ) ) {
if ( data.distinctObjs == null ) {
data.distinctObjs = new Set[ meta.getSubjectField().length ];
}
if ( data.distinctObjs[ i ] == null ) {
              data.distinctObjs[ i ] = new TreeSet<>();