/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.fileinput.text;
import java.io.BufferedInputStream;
import java.io.InputStreamReader;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.compress.CompressionInputStream;
import org.pentaho.di.core.compress.CompressionProvider;
import org.pentaho.di.core.compress.CompressionProviderFactory;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.logging.LogChannelInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.errorhandling.AbstractFileErrorHandler;
import org.pentaho.di.trans.steps.file.IBaseFileInputReader;
import org.pentaho.di.trans.steps.file.IBaseFileInputStepControl;
/**
* Reader for one text file. Handles decompression, BOM and encoding detection, header/footer and paged
* layouts, wrapped lines, and row filters.
*
* @author Alexander Buloichik
*/
public class TextFileInputReader implements IBaseFileInputReader {
private static final int BUFFER_SIZE_INPUT_STREAM = 8192;
private final IBaseFileInputStepControl step;
private final TextFileInputMeta meta;
private final TextFileInputData data;
private final LogChannelInterface log;
private final CompressionInputStream in;
private final InputStreamReader isr;
/** Count of data lines read from the current file; used for per-file row numbering. */
protected long lineInFile;
private boolean first; // declared but not referenced in this class
/** Physical line number within the current file, including header, footer and skipped lines. */
protected long lineNumberInFile;
public TextFileInputReader( IBaseFileInputStepControl step, TextFileInputMeta meta, TextFileInputData data,
FileObject file, LogChannelInterface log ) throws Exception {
this.step = step;
this.meta = meta;
this.data = data;
this.log = log;
CompressionProvider provider =
CompressionProviderFactory.getInstance().getCompressionProviderByName( meta.content.fileCompression );
if ( log.isDetailed() ) {
log.logDetailed( "This is a compressed file being handled by the " + provider.getName() + " provider" );
}
in = provider.createInputStream( KettleVFS.getInputStream( file ) );
in.nextEntry(); // position on the first entry inside the compressed archive, if any
BufferedInputStream inStream = new BufferedInputStream( in, BUFFER_SIZE_INPUT_STREAM );
BOMDetector bom = new BOMDetector( inStream );
if ( bom.bomExist() ) {
// If a BOM exists, its charset takes precedence over the configured encoding (e.g. bytes EF BB BF select UTF-8).
isr = new InputStreamReader( inStream, bom.getCharset() );
} else if ( meta.getEncoding() != null && meta.getEncoding().length() > 0 ) {
isr = new InputStreamReader( inStream, meta.getEncoding() );
} else {
isr = new InputStreamReader( inStream );
}
String encoding = isr.getEncoding();
data.encodingType = EncodingType.guessEncodingType( encoding );
readInitial();
}
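// Typical lifecycle, as driven by the owning step (a sketch; the exact loop shape is an assumption):
//
// TextFileInputReader reader = new TextFileInputReader( step, meta, data, file, log );
// while ( reader.readRow() ) {
// // rows are pushed to the next step via step.putRow() inside readRow()
// }
// reader.close();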
protected void readInitial() throws Exception {
data.doneWithHeader = !meta.content.header;
// /////////////////////////////////////////////////////////////////////////////
// Read the first lines...
/*
* Keep track of the status of the file: are there any lines left to read?
*/
data.doneReading = false;
/*
* Read a number of lines into the buffer:
* - the header rows
* - the rows of one page (optional, for paged layouts)
* - the footer rows
*/
int bufferSize = 1;
bufferSize += meta.content.header ? meta.content.nrHeaderLines : 0;
bufferSize += meta.content.layoutPaged
? meta.content.nrLinesPerPage * ( Math.max( 0, meta.content.nrWraps ) + 1 )
: Math.max( 0, meta.content.nrWraps ); // handles wrapped input without a header
bufferSize += meta.content.footer ? meta.content.nrFooterLines : 0;
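// Worked example (illustrative values): with a 2-line header, a paged layout of 10 lines per page
// with 1 wrap, and a 1-line footer, the initial buffer is 1 + 2 + 10 * (1 + 1) + 1 = 24 lines.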
// See if we need to skip the document header lines...
if ( meta.content.layoutPaged ) {
for ( int i = 0; i < meta.content.nrLinesDocHeader; i++ ) {
// Just skip these document header lines; header and footer lines are not wrapped.
TextFileInputUtils.getLine( log, isr, data.encodingType, data.fileFormatType, data.lineStringBuilder );
lineNumberInFile++;
}
}
for ( int i = 0; i < bufferSize && !data.doneReading; i++ ) {
boolean wasNotFiltered = tryToReadLine( !meta.content.header || i >= meta.content.nrHeaderLines );
if ( !wasNotFiltered ) {
// grab another line, this one got filtered
bufferSize++;
}
}
// Reset counters etc.
data.headerLinesRead = 0;
data.footerLinesRead = 0;
data.pageLinesRead = 0;
}
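// After readInitial(), the buffer holds the initial header/page/footer lines;
// readRow() consumes them from the front of the buffer one at a time.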
@Override
public boolean readRow() throws KettleException {
Object[] r = null;
boolean retval = true;
boolean putrow = false;
if ( !data.doneReading ) {
int repeats = 1;
if ( meta.content.lineWrapped ) {
repeats = meta.content.nrWraps > 0 ? meta.content.nrWraps : repeats;
}
if ( !data.doneWithHeader && data.headerLinesRead == 0 ) {
// We are just starting to read header lines, read them all
repeats += meta.content.nrHeaderLines + 1;
}
// Read a number of lines...
for ( int i = 0; i < repeats && !data.doneReading; i++ ) {
if ( !tryToReadLine( true ) ) {
repeats++;
}
}
}
if ( data.lineBuffer.isEmpty() ) {
return false;
}
/*
* Take the first line available in the buffer & remove the line from the buffer
*/
TextFileLine textLine = data.lineBuffer.get( 0 );
step.incrementLinesInput();
data.lineBuffer.remove( 0 );
if ( meta.content.layoutPaged ) {
/*
* Different rules apply for a paged layout: each page consists of a header, a number of data lines and a
* footer.
*/
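// Illustrative page shape: with nrHeaderLines = 1, nrLinesPerPage = 3 and nrFooterLines = 1,
// each page is consumed as header, data, data, data, footer before the counters reset.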
if ( !data.doneWithHeader && data.pageLinesRead == 0 ) { // We are reading header lines
if ( log.isRowLevel() ) {
log.logRowlevel( "P-HEADER (" + data.headerLinesRead + ") : " + textLine.line );
}
data.headerLinesRead++;
if ( data.headerLinesRead >= meta.content.nrHeaderLines ) {
data.doneWithHeader = true;
}
} else {
// data lines or footer on a page
if ( data.pageLinesRead < meta.content.nrLinesPerPage ) {
// See if we are dealing with wrapped lines:
if ( meta.content.lineWrapped ) {
for ( int i = 0; i < meta.content.nrWraps; i++ ) {
String extra = "";
if ( data.lineBuffer.size() > 0 ) {
extra = data.lineBuffer.get( 0 ).line;
data.lineBuffer.remove( 0 );
}
textLine.line += extra;
}
}
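// Illustrative: with nrWraps = 1, the physical lines "foo;1" and ";bar" are joined
// into the single logical line "foo;1;bar" before conversion to a row.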
if ( log.isRowLevel() ) {
log.logRowlevel( "P-DATA: " + textLine.line );
}
// Read a normal line on a page of data.
data.pageLinesRead++;
lineInFile++;
// Row number: per-file when rowNumberByFile is set, otherwise the global lines-written counter.
long useNumber = meta.content.rowNumberByFile ? lineInFile : step.getLinesWritten() + 1;
r =
TextFileInputUtils.convertLineToRow( log, textLine, meta, data.currentPassThruFieldsRow,
data.nrPassThruFields, data.outputRowMeta, data.convertRowMeta, data.filename, useNumber,
data.separator, data.enclosure, data.escapeCharacter, data.dataErrorLineHandler,
meta.additionalOutputFields, data.shortFilename, data.path, data.hidden,
data.lastModificationDateTime, data.uriName, data.rootUriName, data.extension, data.size );
if ( r != null ) {
putrow = true;
}
// Possible fix for bug PDI-1121 - paged layout header and line count off by 1
// We need to reset these BEFORE the next header line is read, so that it
// is treated as a header ... obviously, only if there is no footer, and we are
// done reading data.
if ( !meta.content.footer && ( data.pageLinesRead == meta.content.nrLinesPerPage ) ) {
/*
* There is no footer and the page is full: start again on the 'next page' with the header.
*/
data.doneWithHeader = false;
data.headerLinesRead = 0;
data.pageLinesRead = 0;
data.footerLinesRead = 0;
if ( log.isRowLevel() ) {
log.logRowlevel( "RESTART PAGE" );
}
}
} else {
// done reading the data lines, skip the footer lines
if ( meta.content.footer && data.footerLinesRead < meta.content.nrFooterLines ) {
if ( log.isRowLevel() ) {
log.logRowlevel( "P-FOOTER: " + textLine.line );
}
data.footerLinesRead++;
}
if ( !meta.content.footer || data.footerLinesRead >= meta.content.nrFooterLines ) {
/*
* OK, we are done reading the footer lines, start again on 'next page' with the header
*/
data.doneWithHeader = false;
data.headerLinesRead = 0;
data.pageLinesRead = 0;
data.footerLinesRead = 0;
if ( log.isRowLevel() ) {
log.logRowlevel( "RESTART PAGE" );
}
}
}
}
} else {
// A normal data line, can also be a header or a footer line
if ( !data.doneWithHeader ) { // We are reading header lines
data.headerLinesRead++;
if ( data.headerLinesRead >= meta.content.nrHeaderLines ) {
data.doneWithHeader = true;
}
} else {
/*
* If we are done reading, there is a footer, and fewer lines remain in the buffer than the number of
* footer lines, then the remaining rows in the buffer are all footer rows and can be removed.
*/
if ( data.doneReading && meta.content.footer && data.lineBuffer.size() < meta.content.nrFooterLines ) {
data.lineBuffer.clear();
} else {
// Not yet a footer line: it's a normal data line.
// See if we are dealing with wrapped lines:
if ( meta.content.lineWrapped ) {
for ( int i = 0; i < meta.content.nrWraps; i++ ) {
String extra = "";
if ( data.lineBuffer.size() > 0 ) {
extra = data.lineBuffer.get( 0 ).line;
data.lineBuffer.remove( 0 );
} else {
tryToReadLine( true );
if ( !data.lineBuffer.isEmpty() ) {
extra = data.lineBuffer.remove( 0 ).line;
}
}
textLine.line += extra;
}
}
// Only process this line if the file play list says it needs processing.
if ( data.filePlayList.isProcessingNeeded( textLine.file, textLine.lineNumber,
AbstractFileErrorHandler.NO_PARTS ) ) {
lineInFile++;
long useNumber = meta.content.rowNumberByFile ? lineInFile : step.getLinesWritten() + 1;
r =
TextFileInputUtils.convertLineToRow( log, textLine, meta, data.currentPassThruFieldsRow,
data.nrPassThruFields, data.outputRowMeta, data.convertRowMeta, data.filename, useNumber,
data.separator, data.enclosure, data.escapeCharacter, data.dataErrorLineHandler,
meta.additionalOutputFields, data.shortFilename, data.path, data.hidden,
data.lastModificationDateTime, data.uriName, data.rootUriName, data.extension, data.size );
if ( r != null ) {
if ( log.isRowLevel() ) {
log.logRowlevel( "Found data row: " + data.outputRowMeta.getString( r ) );
}
putrow = true;
}
} else {
putrow = false;
}
}
}
}
if ( putrow && r != null ) {
// See if the previous values need to be repeated!
if ( data.nr_repeats > 0 ) {
if ( data.previous_row == null ) { // First invocation...
data.previous_row = data.outputRowMeta.cloneRow( r );
} else {
for ( int i = 0; i < meta.inputFields.length; i++ ) {
if ( meta.inputFields[i].isRepeated() ) {
if ( r[i] == null ) {
// if it is empty: take the previous value!
r[i] = data.previous_row[i];
} else {
// not empty: change the previous_row entry!
data.previous_row[i] = r[i];
}
}
}
}
}
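// Illustrative: with the field in column 0 marked as repeated, incoming rows
// [ "A", 1 ], [ null, 2 ], [ null, 3 ] are emitted as [ "A", 1 ], [ "A", 2 ], [ "A", 3 ].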
if ( log.isRowLevel() ) {
log.logRowlevel( "Putting row: " + data.outputRowMeta.getString( r ) );
}
step.putRow( data.outputRowMeta, r );
// Stop when the configured row limit has been reached.
if ( step.getLinesInput() >= meta.content.rowLimit && meta.content.rowLimit > 0 ) {
close();
return false;
}
}
if ( step.checkFeedback( step.getLinesInput() ) ) {
if ( log.isBasic() ) {
log.logBasic( "linenr " + step.getLinesInput() );
}
}
return retval;
}
@Override
public void close() {
try {
// Close previous file!
if ( data.filename != null ) {
// Clear any remaining rows that have already been read
data.lineBuffer.clear();
// Increment the lines updated to reflect another file has been finished.
// This allows us to give a state of progress in the run time metrics
step.incrementLinesUpdated();
if ( in != null ) {
BaseStep.closeQuietly( in );
}
isr.close();
data.filename = null; // send it down the next time.
if ( data.file != null ) {
try {
data.file.close();
} catch ( Exception e ) {
log.logError( "Error closing file", e );
}
data.file = null;
}
}
data.dataErrorLineHandler.close();
} catch ( Exception e ) {
// Guard against an NPE: data.file may already have been cleared above.
String errorMsg = "Couldn't close file : " + ( data.file != null ? data.file.getName().getFriendlyURI() : "(unknown)" ) + " --> " + e.toString();
log.logError( errorMsg );
if ( step.failAfterBadFile( errorMsg ) ) { // ( !meta.isSkipBadFiles() || data.isLastFile ){
step.stopAll();
}
step.setErrors( step.getErrors() + 1 );
} // finally {
// This is for bug #5797: it tried to ensure that the file handle
// is actually freed/garbage collected.
// XXX deinspanjer 2009-07-07: I'm stubbing this out. The bug was ancient and it is worth reevaluating
// to avoid the performance hit of a System GC on every file close
// System.gc();
// }
}
protected boolean tryToReadLine( boolean applyFilter ) throws KettleFileException {
String line;
line = TextFileInputUtils.getLine( log, isr, data.encodingType, data.fileFormatType, data.lineStringBuilder );
if ( line != null ) {
// Apply the filters unless this is still a header line.
if ( applyFilter ) {
// Filter row?
boolean isFilterLastLine = false;
boolean filterOK = checkFilterRow( line, isFilterLastLine );
if ( filterOK ) {
data.lineBuffer.add( new TextFileLine( line, lineNumberInFile++, data.file ) ); // Store it in the line buffer...
} else {
return false;
}
} else { // don't checkFilterRow
if ( !meta.content.noEmptyLines || line.length() != 0 ) {
data.lineBuffer.add( new TextFileLine( line, lineNumberInFile++, data.file ) ); // Store it in the line buffer...
}
}
} else {
data.doneReading = true;
}
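// Note: a false return value means the line was filtered out, not end-of-file;
// end-of-file is signalled via data.doneReading, in which case we still return true.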
return true;
}
/**
* Check whether the line should be taken.
*
* @param line
*          the line to check against the filters
* @param isFilterLastLine
*          effectively unused: booleans are passed by value in Java, so this flag is never visible to the
*          caller
* @return true when the line should be taken
*/
private boolean checkFilterRow( String line, boolean isFilterLastLine ) {
boolean filterOK = true;
// check for noEmptyLines
if ( meta.content.noEmptyLines && line.length() == 0 ) {
filterOK = false;
} else {
// check the filters
filterOK = data.filterProcessor.doFilters( line );
if ( !filterOK ) {
if ( data.filterProcessor.isStopProcessing() ) {
data.doneReading = true;
}
}
}
return filterOK;
}
}