/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.fixedinput;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import org.apache.commons.io.FileUtils;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
 * Reads a simple fixed-width file and outputs the fields found in the file.
*
* @author Matt
* @since 2007-07-06
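 *
 * Example of a layout this step can read (field definitions are illustrative:
 * "name" with width 10, "age" with width 3, line feed present):
 *
 *   Alice     042
 *   Bob       037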
*/
public class FixedInput extends BaseStep implements StepInterface {
  private static Class<?> PKG = FixedInputMeta.class; // for i18n purposes, needed by Translator2!!
private FixedInputMeta meta;
private FixedInputData data;
public FixedInput( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
meta = (FixedInputMeta) smi;
data = (FixedInputData) sdi;
if ( first ) {
first = false;
data.outputRowMeta = new RowMeta();
meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );
      // The conversion logic for when lazy conversion is turned off is simple:
// Pretend it's a lazy conversion object anyway and get the native type during conversion.
//
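      // Illustrative example: an Integer field that arrives as the bytes "42 "
      // is passed along as raw bytes when lazy conversion is on; otherwise
      // readOneRow() turns it into a Long via convertBinaryStringToNativeType().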
data.convertRowMeta = data.outputRowMeta.clone();
for ( ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList() ) {
valueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_BINARY_STRING );
}
if ( meta.isHeaderPresent() ) {
readOneRow( false ); // skip this row.
}
}
Object[] outputRowData = readOneRow( true );
if ( outputRowData == null ) { // no more input to be expected...
setOutputDone();
return false;
}
putRow( data.outputRowMeta, outputRowData ); // copy row to possible alternate rowset(s).
if ( checkFeedback( getLinesInput() ) ) {
logBasic( BaseMessages.getString( PKG, "FixedInput.Log.LineNumber", Long.toString( getLinesInput() ) ) );
}
return true;
}
/**
* Read a single row of data from the file...
*
   * @param doConversions
   *          whether to convert the field data; set to false for the header row.
* @return a row of data...
* @throws KettleException
*/
private Object[] readOneRow( boolean doConversions ) throws KettleException {
try {
// See if we need to call it a day...
//
if ( meta.isRunningInParallel() ) {
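        // data.rowsToRead is this step copy's share of the file, computed in init().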
if ( getLinesInput() >= data.rowsToRead ) {
return null; // We're done. The rest is for the other steps in the cluster
}
}
Object[] outputRowData = RowDataUtil.allocateRowData( data.convertRowMeta.size() );
int outputIndex = 0;
// The strategy is as follows...
// We read a block of byte[] from the file.
//
// Then we scan that block of data.
      // We keep a byte[] that we extend if needed.
// At the end of the block we read another, etc.
//
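      // Illustrative example: with a 500-byte buffer and a 20-byte field
      // starting at startBuffer = 490, endBuffer = 510 would run past the
      // buffer, so the buffer is resized and the next chunk is read first.
      //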
// Let's start by looking where we left off reading.
//
if ( data.stopReading ) {
return null;
}
FixedFileInputField[] fieldDefinitions = meta.getFieldDefinition();
for ( int i = 0; i < fieldDefinitions.length; i++ ) {
int fieldWidth = fieldDefinitions[i].getWidth();
data.endBuffer = data.startBuffer + fieldWidth;
if ( data.endBuffer > data.bufferSize ) {
// Oops, we need to read more data...
// Better resize this before we read other things in it...
//
data.resizeByteBuffer();
// Also read another chunk of data, now that we have the space for it...
// Ignore EOF, there might be other stuff in the buffer.
//
data.readBufferFromFile();
}
// re-verify the buffer after we tried to read extra data from file...
//
if ( data.endBuffer > data.bufferSize ) {
// still a problem?
// We hit an EOF and are trying to read beyond the EOF...
// If we are on the first field and there
// is nothing left in the buffer, don't return
// a row because we're done.
if ( ( 0 == i ) && data.bufferSize <= 0 ) {
return null;
}
// This is the last record of data in the file.
data.stopReading = true;
// Just take what's left for the current field.
fieldWidth = data.bufferSize;
}
byte[] field = new byte[fieldWidth];
System.arraycopy( data.byteBuffer, data.startBuffer, field, 0, fieldWidth );
if ( doConversions ) {
if ( meta.isLazyConversionActive() ) {
outputRowData[outputIndex++] = field;
} else {
// We're not lazy so we convert the data right here and now.
// The convert object uses binary storage as such we just have to ask the native type from it.
// That will do the actual conversion.
//
ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta( outputIndex );
outputRowData[outputIndex++] = sourceValueMeta.convertBinaryStringToNativeType( field );
}
} else {
outputRowData[outputIndex++] = null; // nothing for the header, no conversions here.
}
// OK, onto the next field...
//
data.startBuffer = data.endBuffer;
}
// Now that we have all the data, see if there are any linefeed characters to remove from the buffer...
//
if ( meta.isLineFeedPresent() ) {
data.endBuffer += 2;
if ( data.endBuffer >= data.bufferSize ) {
// Oops, we need to read more data...
// Better resize this before we read other things in it...
//
data.resizeByteBuffer();
// Also read another chunk of data, now that we have the space for it...
data.readBufferFromFile();
}
// CR + Line feed in the worst case.
//
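      // The two checks below accept "\n", "\r", "\r\n" and "\n\r" line endings,
      // advancing startBuffer past at most two separator bytes.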
if ( data.byteBuffer[data.startBuffer] == '\n' || data.byteBuffer[data.startBuffer] == '\r' ) {
data.startBuffer++;
if ( data.byteBuffer[data.startBuffer] == '\n' || data.byteBuffer[data.startBuffer] == '\r' ) {
data.startBuffer++;
}
}
data.endBuffer = data.startBuffer;
}
incrementLinesInput();
return outputRowData;
} catch ( Exception e ) {
throw new KettleFileException( "Exception reading line using NIO: " + e.toString(), e );
}
}
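  /**
   * Note: FileUtils.toFile() only resolves file: URLs, so this helper assumes a
   * local file; a non-local VFS URL would return null here and fail on open.
   */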
private FileInputStream getFileInputStream( URL url ) throws FileNotFoundException {
return new FileInputStream( FileUtils.toFile( url ) );
}
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
meta = (FixedInputMeta) smi;
data = (FixedInputData) sdi;
if ( super.init( smi, sdi ) ) {
try {
data.preferredBufferSize = Integer.parseInt( environmentSubstitute( meta.getBufferSize() ) );
data.lineWidth = Integer.parseInt( environmentSubstitute( meta.getLineWidth() ) );
data.filename = environmentSubstitute( meta.getFilename() );
if ( Utils.isEmpty( data.filename ) ) {
logError( BaseMessages.getString( PKG, "FixedInput.MissingFilename.Message" ) );
return false;
}
FileObject fileObject = KettleVFS.getFileObject( data.filename, getTransMeta() );
try {
data.fis = getFileInputStream( fileObject.getURL() );
data.fc = data.fis.getChannel();
data.bb = ByteBuffer.allocateDirect( data.preferredBufferSize );
} catch ( IOException e ) {
logError( e.toString() );
return false;
}
        // Add filename to the result filenames?
if ( meta.isAddResultFile() ) {
ResultFile resultFile =
new ResultFile( ResultFile.FILE_TYPE_GENERAL, fileObject, getTransMeta().getName(), toString() );
resultFile.setComment( "File was read by a Fixed input step" );
addResultFile( resultFile );
}
logBasic( "Opened file with name [" + data.filename + "]" );
data.stopReading = false;
if ( meta.isRunningInParallel() ) {
data.stepNumber = getUniqueStepNrAcrossSlaves();
data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();
data.fileSize = fileObject.getContent().getSize();
}
// OK, now we need to skip a number of bytes in case we're doing a parallel read.
//
if ( meta.isRunningInParallel() ) {
int totalLineWidth = data.lineWidth + meta.getLineSeparatorLength(); // including line separator bytes
          long nrRows = data.fileSize / totalLineWidth; // e.g. 100,000 / 100 = 1,000 rows
          long rowsToSkip = Math.round( data.stepNumber * nrRows / (double) data.totalNumberOfSteps ); // 0, 333, 667
          long nextRowsToSkip = Math.round( ( data.stepNumber + 1 ) * nrRows / (double) data.totalNumberOfSteps ); // 333, 667, 1000
data.rowsToRead = nextRowsToSkip - rowsToSkip;
long bytesToSkip = rowsToSkip * totalLineWidth;
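          // Worked example (illustrative numbers): a 100,000 byte file with
          // totalLineWidth 100 gives nrRows = 1000. With 3 step copies, copy 0
          // skips 0 rows and reads 333, copy 1 skips 333 and reads 334, and
          // copy 2 skips 667 and reads 333.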
logBasic( "Step #"
+ data.stepNumber + " is skipping " + bytesToSkip + " to position in file, then it's reading "
+ data.rowsToRead + " rows." );
data.fc.position( bytesToSkip );
}
return true;
} catch ( Exception e ) {
logError( "Error opening file '" + meta.getFilename() + "'", e );
}
}
return false;
}
@Override
public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
try {
if ( data.fc != null ) {
data.fc.close();
}
if ( data.fis != null ) {
data.fis.close();
}
} catch ( IOException e ) {
logError( "Unable to close file channel for file '" + meta.getFilename() + "' : " + e.toString() );
logError( Const.getStackTracker( e ) );
}
super.dispose( smi, sdi );
}
}