/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.parallelgzipcsv;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
 * Read a simple gzip-compressed CSV file, possibly in parallel across step copies. Just output Strings found in the file...
*
* @author Matt
* @since 2007-07-05
*/
public class ParGzipCsvInput extends BaseStep implements StepInterface {
  private static Class<?> PKG = ParGzipCsvInputMeta.class; // for i18n purposes, needed by Translator2!!
private ParGzipCsvInputMeta meta;
private ParGzipCsvInputData data;
public ParGzipCsvInput( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
meta = (ParGzipCsvInputMeta) smi;
data = (ParGzipCsvInputData) sdi;
if ( first ) {
first = false;
data.outputRowMeta = new RowMeta();
meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );
if ( data.filenames == null ) {
// We're expecting the list of filenames from the previous step(s)...
//
getFilenamesFromPreviousSteps();
}
// We only run in parallel if we have at least one file to process
// AND if we have more than one step copy running...
//
data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;
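      // In parallel mode, each step copy reads every totalNumberOfSteps-th block of the
      // decompressed data; e.g. with 4 copies, copy 0 reads blocks 0, 4, 8, and so on.
      //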
      // The conversion logic for when lazy conversion is turned off is simple:
// Pretend it's a lazy conversion object anyway and get the native type during conversion.
//
data.convertRowMeta = data.outputRowMeta.clone();
for ( ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList() ) {
valueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_BINARY_STRING );
}
// Calculate the indexes for the filename and row number fields
//
data.filenameFieldIndex = -1;
if ( !Utils.isEmpty( meta.getFilenameField() ) && meta.isIncludingFilename() ) {
data.filenameFieldIndex = meta.getInputFields().length;
}
data.rownumFieldIndex = -1;
if ( !Utils.isEmpty( meta.getRowNumField() ) ) {
data.rownumFieldIndex = meta.getInputFields().length;
if ( data.filenameFieldIndex >= 0 ) {
data.rownumFieldIndex++;
}
}
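      // Example, assuming 3 input fields with both options enabled:
      // indexes 0..2 hold the parsed fields, filenameFieldIndex = 3, rownumFieldIndex = 4.
      //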
// Open the next file...
//
boolean opened = false;
while ( data.filenr < data.filenames.length ) {
if ( openNextFile() ) {
opened = true;
break;
}
}
if ( !opened ) {
setOutputDone(); // last file, end here
return false;
}
}
Object[] outputRowData = readOneRow( true ); // get row, set busy!
if ( outputRowData == null ) { // no more input to be expected...
if ( skipToNextBlock() ) {
        // If we need to open a new file, don't stop at the first false returned by openNextFile():
        // it can also mean that the file is smaller than the block size.
        // In that case, keep checking the file number and retry until we get a valid file position to work with.
//
boolean opened = false;
while ( data.filenr < data.filenames.length ) {
if ( openNextFile() ) {
opened = true;
break;
}
}
if ( opened ) {
return true; // try again on the next loop in the next file...
} else {
incrementLinesUpdated();
setOutputDone(); // last file, end here
return false;
}
} else {
return true; // try again on the next loop in the next block...
}
} else {
putRow( data.outputRowMeta, outputRowData ); // copy row to possible alternate rowset(s).
if ( checkFeedback( getLinesInput() ) ) {
if ( log.isBasic() ) {
logBasic( BaseMessages
.getString( PKG, "ParGzipCsvInput.Log.LineNumber", Long.toString( getLinesInput() ) ) );
}
}
}
return true;
}
private boolean skipToNextBlock() throws KettleException {
if ( data.eofReached ) {
return true; // next file please!
}
// Reset the bytes read in the current block of data
//
data.totalBytesRead = 0L;
data.blockNr++;
if ( data.parallel ) {
      // So our first act is to skip to the correct position in the uncompressed stream...
      // From one block to the next, that position advances by nrOfSteps*blockSize bytes.
//
long positionToReach =
( data.blockNr * data.blockSize * data.totalNumberOfSteps ) + data.stepNumber * data.blockSize;
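      // Worked example, assuming 3 step copies, a block size of 1000 and step copy 1:
      // for block 2, positionToReach = 2 * 1000 * 3 + 1 * 1000 = 7000 (decompressed bytes).
      //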
// How many bytes do we need to skip to get where we need to be?
//
long bytesToSkip = positionToReach - data.fileReadPosition;
logBasic( "Skipping "
+ bytesToSkip + " bytes to go to position " + positionToReach + " for step copy " + data.stepNumber );
// Get into position...
//
try {
long bytesSkipped = 0;
while ( bytesSkipped < bytesToSkip ) {
long n = data.gzis.skip( bytesToSkip - bytesSkipped );
if ( n <= 0 ) {
// EOF reached...
//
data.eofReached = true;
data.fileReadPosition += bytesSkipped;
return true; // nothing more to be found in the file, stop right here.
}
bytesSkipped += n;
}
data.fileReadPosition += bytesSkipped;
// Now we need to clear the buffer, reset everything...
//
clearBuffer();
        // Now read until the next CR:
//
readOneRow( false );
return false;
} catch ( IOException e ) {
throw new KettleException( "Error skipping " + bytesToSkip + " bytes to the next block of data", e );
}
} else {
// this situation should never happen.
//
return true; // stop processing the file
}
}
private void getFilenamesFromPreviousSteps() throws KettleException {
    List<String> filenames = new ArrayList<String>();
boolean firstRow = true;
int index = -1;
Object[] row = getRow();
while ( row != null ) {
if ( firstRow ) {
firstRow = false;
// Get the filename field index...
//
String filenameField = environmentSubstitute( meta.getFilenameField() );
index = getInputRowMeta().indexOfValue( filenameField );
if ( index < 0 ) {
throw new KettleException( BaseMessages.getString(
PKG, "ParGzipCsvInput.Exception.FilenameFieldNotFound", filenameField ) );
}
}
String filename = getInputRowMeta().getString( row, index );
filenames.add( filename ); // add it to the list...
row = getRow(); // Grab another row...
}
data.filenames = filenames.toArray( new String[filenames.size()] );
logBasic( BaseMessages.getString( PKG, "ParGzipCsvInput.Log.ReadingFromNrFiles", Integer
.toString( data.filenames.length ) ) );
}
public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
try {
closeFile(); // close the final file
} catch ( Exception ignored ) {
// Exceptions on stream / file closing should be ignored.
}
super.dispose( smi, sdi );
}
private boolean openNextFile() throws KettleException {
try {
// Close the previous file...
//
closeFile();
if ( data.filenr >= data.filenames.length ) {
return false;
}
// Open the next one...
//
logBasic( "Opening file #" + data.filenr + " : " + data.filenames[data.filenr] );
FileObject fileObject = KettleVFS.getFileObject( data.filenames[data.filenr], getTransMeta() );
data.fis = KettleVFS.getInputStream( fileObject );
if ( meta.isLazyConversionActive() ) {
data.binaryFilename = data.filenames[data.filenr].getBytes();
}
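      // Note: the second GZIPInputStream argument is the size of the inflater's internal
      // input buffer; it does not limit how many bytes can be read or skipped.
      //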
data.gzis = new GZIPInputStream( data.fis, data.bufferSize );
clearBuffer();
data.fileReadPosition = 0L;
data.blockNr = 0;
data.eofReached = false;
// Skip to the next file...
//
data.filenr++;
// If we are running in parallel and we need to skip bytes in the first file, let's do so here.
//
if ( data.parallel ) {
// Calculate the first block of data to read from the file
// If the buffer size is 500, we read 0-499 for the first file,
// 500-999 for the second, 1000-1499 for the third, etc.
//
// After that we need to get 1500-1999 for the first step again,
// 2000-2499 for the second, 2500-2999 for the third, etc.
//
        // This is equivalent to:
//
// FROM : stepNumber * bufferSize + blockNr*bufferSize*nrOfSteps
// TO : FROM + bufferSize - 1
//
// Example : step 0, block 0, size 500:
// From: 0*500+0*500*3=0 To: 0+500-1=499
//
// Example : step 0, block 1, size 500:
// From: 0*500+1*500*3=1500 To: 1500+500-1=1999
//
        // So our first act is to skip to the correct position in the uncompressed stream...
//
data.blockSize = 2 * data.bufferSize; // for now.
long bytesToSkip = data.stepNumber * data.blockSize;
if ( bytesToSkip > 0 ) {
// Get into position for block 0
//
logBasic( "Skipping "
+ bytesToSkip + " bytes to go to position " + bytesToSkip + " for step copy " + data.stepNumber );
long bytesSkipped = 0L;
while ( bytesSkipped < bytesToSkip ) {
long n = data.gzis.skip( bytesToSkip - bytesSkipped );
if ( n <= 0 ) {
// EOF in this file, can't read a block in this step copy
data.eofReached = true;
return false;
}
bytesSkipped += n;
}
// Keep track of the file pointer!
//
data.fileReadPosition += bytesSkipped;
// Reset the bytes read in the current block of data
//
data.totalBytesRead = 0L;
// Skip the first row until the next CR
//
readOneRow( false );
} else {
// Reset the bytes read in the current block of data
//
data.totalBytesRead = 0L;
// See if we need to skip a header row...
//
if ( meta.isHeaderPresent() ) {
readOneRow( false );
}
}
} else {
// Just one block: read it all until we hit an EOF.
//
data.blockSize = Long.MAX_VALUE; // 9,223,372,036 GB
// Also see here if we need to skip a header row...
//
if ( meta.isHeaderPresent() ) {
readOneRow( false );
}
}
// Add filename to result filenames ?
if ( meta.isAddResultFile() ) {
ResultFile resultFile =
new ResultFile( ResultFile.FILE_TYPE_GENERAL, fileObject, getTransMeta().getName(), toString() );
resultFile.setComment( "File was read by a Csv input step" );
addResultFile( resultFile );
}
// Reset the row number pointer...
//
data.rowNumber = 1L;
return true;
} catch ( Exception e ) {
throw new KettleException( e );
}
}
private void clearBuffer() {
data.startBuffer = 0;
data.endBuffer = 0;
data.maxBuffer = 0;
}
  /**
   * Check whether the read pointer has moved past the end of the buffered data and, if so,
   * try to read the next block of data from the stream.
   *
   * @return false if everything is OK, true if the end of the data was reached and we should stop.
   * @throws KettleException
   *           in case there is an I/O problem (read error)
   */
private boolean checkBufferSize() throws KettleException {
if ( data.endBuffer >= data.maxBuffer ) {
// Oops, we need to read more data...
// Better resize this before we read other things in it...
//
if ( data.eofReached || data.getMoreData() ) {
// If we didn't manage to read anything, we return true to indicate we're done
//
return true;
}
}
return false;
}
  /**
   * Read a single row of data from the file...
   *
   * @param doConversions
   *          whether to convert the fields to their native data types; set to false for the header row.
   * @return a row of data, or null if the current block (or file) is exhausted.
   * @throws KettleException
   *           in case reading a line of data fails
   */
private Object[] readOneRow( boolean doConversions ) throws KettleException {
// First see if we haven't gone past our block boundary!
// Not >= because a block can start smack at the beginning of a line.
// Since we always skip the first row after skipping a block that would mean we drop rows here and there.
// So keep this > (larger than)
//
if ( data.totalBytesRead > data.blockSize ) {
// skip to the next block or file by returning null
//
return null;
}
try {
Object[] outputRowData = RowDataUtil.allocateRowData( data.outputRowMeta.size() );
int outputIndex = 0;
boolean newLineFound = false;
int newLines = 0;
// The strategy is as follows...
// We read a block of byte[] from the file.
// We scan for the separators in the file (NOT for line feeds etc)
// Then we scan that block of data.
// We keep a byte[] that we extend if needed..
// At the end of the block we read another, etc.
//
// Let's start by looking where we left off reading.
//
while ( !newLineFound && outputIndex < meta.getInputFields().length ) {
if ( checkBufferSize() ) {
          // Without this check, the last row would be discarded when the last
          // field is null and there is no end-of-line delimiter.
if ( outputRowData != null ) {
// Make certain that at least one record exists before
// filling the rest of them with null
if ( outputIndex > 0 ) {
return ( outputRowData );
}
}
return null; // nothing more to read, call it a day.
}
// OK, at this point we should have data in the byteBuffer and we should be able to scan for the next
// delimiter (;)
// So let's look for a delimiter.
        // Also skip over the enclosures ("); escaped enclosures are NOT taken into account here.
// Later we can add an option for having escaped or double enclosures in the file.
//
boolean delimiterFound = false;
boolean enclosureFound = false;
int escapedEnclosureFound = 0;
while ( !delimiterFound ) {
// If we find the first char, we might find others as well ;-)
// Single byte delimiters only for now.
//
if ( data.byteBuffer[data.endBuffer] == data.delimiter[0] ) {
delimiterFound = true;
} else if ( data.byteBuffer[data.endBuffer] == '\n' || data.byteBuffer[data.endBuffer] == '\r' ) {
// Perhaps we found a new line?
// "\n\r".getBytes()
//
data.endBuffer++;
data.totalBytesRead++;
newLines = 1;
if ( !checkBufferSize() ) {
// re-check for double delimiters...
if ( data.byteBuffer[data.endBuffer] == '\n' || data.byteBuffer[data.endBuffer] == '\r' ) {
data.endBuffer++;
data.totalBytesRead++;
newLines = 2;
checkBufferSize();
}
}
newLineFound = true;
delimiterFound = true;
} else if ( data.enclosure != null && data.byteBuffer[data.endBuffer] == data.enclosure[0] ) {
// Perhaps we need to skip over an enclosed part?
// We always expect exactly one enclosure character
// If we find the enclosure doubled, we consider it escaped.
// --> "" is converted to " later on.
//
enclosureFound = true;
boolean keepGoing;
do {
data.endBuffer++;
if ( checkBufferSize() ) {
enclosureFound = false;
break;
}
keepGoing = data.byteBuffer[data.endBuffer] != data.enclosure[0];
if ( !keepGoing ) {
// We found an enclosure character.
// Read another byte...
//
data.endBuffer++;
if ( checkBufferSize() ) {
enclosureFound = false;
break;
}
// If this character is also an enclosure, we can consider the enclosure "escaped".
// As such, if this is an enclosure, we keep going...
//
keepGoing = data.byteBuffer[data.endBuffer] == data.enclosure[0];
if ( keepGoing ) {
escapedEnclosureFound++;
}
}
} while ( keepGoing );
// Did we reach the end of the buffer?
//
if ( data.endBuffer >= data.bufferSize ) {
newLineFound = true; // consider it a newline to break out of the upper while loop
newLines += 2; // to remove the enclosures in case of missing newline on last line.
break;
}
} else {
data.endBuffer++;
data.totalBytesRead++;
if ( checkBufferSize() ) {
if ( data.endBuffer >= data.bufferSize ) {
newLineFound = true;
break;
}
}
}
}
// If we're still here, we found a delimiter..
        // Since the starting point never really changed, we can just grab the range:
//
// [startBuffer-endBuffer[
//
// This is the part we want.
//
int length = data.endBuffer - data.startBuffer;
if ( newLineFound ) {
length -= newLines;
if ( length <= 0 ) {
length = 0;
}
}
if ( enclosureFound ) {
data.startBuffer++;
length -= 2;
if ( length <= 0 ) {
length = 0;
}
}
if ( length <= 0 ) {
length = 0;
}
byte[] field = new byte[length];
System.arraycopy( data.byteBuffer, data.startBuffer, field, 0, length );
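        // Worked example: with buffer contents "foo;bar\n" and startBuffer = 4, the scan
        // above ends with endBuffer = 8 and newLines = 1, so length = 8 - 4 - 1 = 3 and
        // the extracted field is "bar".
        //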
// Did we have any escaped characters in there?
//
if ( escapedEnclosureFound > 0 ) {
if ( log.isRowLevel() ) {
logRowlevel( "Escaped enclosures found in " + new String( field ) );
}
field = data.removeEscapedEnclosures( field, escapedEnclosureFound );
}
if ( doConversions ) {
if ( meta.isLazyConversionActive() ) {
outputRowData[outputIndex++] = field;
} else {
// We're not lazy so we convert the data right here and now.
// The convert object uses binary storage as such we just have to ask the native type from it.
// That will do the actual conversion.
//
ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta( outputIndex );
outputRowData[outputIndex++] = sourceValueMeta.convertBinaryStringToNativeType( field );
}
} else {
outputRowData[outputIndex++] = null; // nothing for the header, no conversions here.
}
// OK, move on to the next field...
if ( !newLineFound ) {
data.endBuffer++;
data.totalBytesRead++;
}
data.startBuffer = data.endBuffer;
}
// See if we reached the end of the line.
// If not, we need to skip the remaining items on the line until the next newline...
//
if ( !newLineFound && !checkBufferSize() ) {
do {
data.endBuffer++;
data.totalBytesRead++;
if ( checkBufferSize() ) {
break; // nothing more to read.
}
// TODO: if we're using quoting we might be dealing with a very dirty file with quoted newlines in trailing
// fields. (imagine that)
// In that particular case we want to use the same logic we use above (refactored a bit) to skip these fields.
} while ( data.byteBuffer[data.endBuffer] != '\n' && data.byteBuffer[data.endBuffer] != '\r' );
if ( !checkBufferSize() ) {
while ( data.byteBuffer[data.endBuffer] == '\n' || data.byteBuffer[data.endBuffer] == '\r' ) {
data.endBuffer++;
data.totalBytesRead++;
if ( checkBufferSize() ) {
break; // nothing more to read.
}
}
}
// Make sure we start at the right position the next time around.
data.startBuffer = data.endBuffer;
}
// Optionally add the current filename to the mix as well...
//
if ( meta.isIncludingFilename() && !Utils.isEmpty( meta.getFilenameField() ) ) {
if ( meta.isLazyConversionActive() ) {
outputRowData[data.filenameFieldIndex] = data.binaryFilename;
} else {
outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
}
}
if ( data.isAddingRowNumber ) {
        outputRowData[data.rownumFieldIndex] = Long.valueOf( data.rowNumber++ );
}
incrementLinesInput();
return outputRowData;
} catch ( Exception e ) {
throw new KettleFileException( "Exception reading line of data", e );
}
}
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
meta = (ParGzipCsvInputMeta) smi;
data = (ParGzipCsvInputData) sdi;
if ( super.init( smi, sdi ) ) {
data.bufferSize = Integer.parseInt( environmentSubstitute( meta.getBufferSize() ) );
data.byteBuffer = new byte[] {}; // empty
// If the step doesn't have any previous steps, we just get the filename.
// Otherwise, we'll grab the list of filenames later...
//
if ( getTransMeta().findNrPrevSteps( getStepMeta() ) == 0 ) {
String filename = environmentSubstitute( meta.getFilename() );
if ( Utils.isEmpty( filename ) ) {
logError( BaseMessages.getString( PKG, "ParGzipCsvInput.MissingFilename.Message" ) );
return false;
}
data.filenames = new String[] { filename, };
} else {
data.filenames = null;
data.filenr = 0;
}
data.delimiter = environmentSubstitute( meta.getDelimiter() ).getBytes();
if ( Utils.isEmpty( meta.getEnclosure() ) ) {
data.enclosure = null;
} else {
data.enclosure = environmentSubstitute( meta.getEnclosure() ).getBytes();
}
data.isAddingRowNumber = !Utils.isEmpty( meta.getRowNumField() );
// Handle parallel reading capabilities...
//
if ( meta.isRunningInParallel() ) {
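        // These numbers are unique across all slave servers, so the block interleaving
        // also works when the transformation runs clustered.
        //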
data.stepNumber = getUniqueStepNrAcrossSlaves();
data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();
}
return true;
}
return false;
}
public void closeFile() throws KettleException {
try {
if ( data.gzis != null ) {
data.gzis.close();
}
if ( data.fis != null ) {
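        // Note: the lines-updated metric is reused here as a counter of closed input files.
        //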
incrementLinesUpdated();
data.fis.close();
}
} catch ( IOException e ) {
throw new KettleException( "Unable to close file '" + data.filenames[data.filenr - 1], e );
}
}
}
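
/**
 * Minimal standalone sketch (not part of the original step): it demonstrates the
 * positioning technique used by skipToNextBlock() and openNextFile() above, namely
 * that GZIPInputStream.skip() advances through the *decompressed* byte stream and
 * returns a value <= 0 once the end of the stream is reached. The class name, the
 * target position and the argument handling are illustrative assumptions only.
 */
class GzipSkipDemo {
  public static void main( String[] args ) throws Exception {
    // e.g. block 2 for step copy 1 of 3 with a block size of 1000 bytes (see above)
    long target = 2 * 1000 * 3 + 1 * 1000;
    GZIPInputStream in = new GZIPInputStream( new java.io.FileInputStream( args[0] ) );
    long skipped = 0;
    while ( skipped < target ) {
      long n = in.skip( target - skipped );
      if ( n <= 0 ) {
        break; // EOF before the target position: this copy has no block in this file
      }
      skipped += n;
    }
    System.out.println( "Positioned at decompressed byte " + skipped );
    in.close();
  }
}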