All downloads are free. The search and download functionality uses the official Maven repository.

org.pentaho.di.trans.steps.file.BaseFileField Maven / Gradle / Ivy

The newest version!
/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.file;

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.pentaho.di.core.Const;
import org.pentaho.di.core.gui.TextFileInputFieldInterface;
import org.pentaho.di.core.injection.Injection;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaFactory;
import org.pentaho.di.core.row.value.ValueMetaString;

/**
 * Describes a single field in a text file: its name, position, length, data type, format mask, trim type and the
 * symbols used for numbers. From a set of sample values ({@link #setSamples(String[])}) the class can also guess
 * the trim type, the data type (date, number or string) and whether the field should be ignored.
 *
 * @author Matt
 * @since 19-04-2004
 */
public class BaseFileField implements Cloneable, TextFileInputFieldInterface {
  @Injection( name = "FIELD_NAME", group = "FIELDS" )
  private String name;

  @Injection( name = "FIELD_POSITION", group = "FIELDS" )
  private int position = -1;

  @Injection( name = "FIELD_LENGTH", group = "FIELDS" )
  private int length = -1;

  private int type;

  @Injection( name = "FIELD_IGNORE", group = "FIELDS" )
  private boolean ignore;

  @Injection( name = "FIELD_FORMAT", group = "FIELDS" )
  private String format;

  private int trimtype;

  @Injection( name = "FIELD_PRECISION", group = "FIELDS" )
  private int precision = -1;

  @Injection( name = "FIELD_CURRENCY", group = "FIELDS" )
  private String currencySymbol;

  @Injection( name = "FIELD_DECIMAL", group = "FIELDS" )
  private String decimalSymbol;

  @Injection( name = "FIELD_GROUP", group = "FIELDS" )
  private String groupSymbol;

  @Injection( name = "FIELD_REPEAT", group = "FIELDS" )
  private boolean repeat;

  @Injection( name = "FIELD_NULL_STRING", group = "FIELDS" )
  private String nullString;

  @Injection( name = "FIELD_IF_NULL", group = "FIELDS" )
  private String ifNullValue;

  /** Sample values used by the guess methods; stored by reference and trimmed in place by guessTrimType(). */
  private String[] samples;

  // Scratch formatters, re-created on every guessType() call.
  private NumberFormat nf;
  private DecimalFormat df;
  private DecimalFormatSymbols dfs;
  private SimpleDateFormat daf;

  /** Date masks tried by guessType(), in order of preference: the first mask that fits every sample wins. */
  private static final String[] DATE_FORMATS =
      new String[] { "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss", "dd/MM/yyyy", "dd-MM-yyyy", "yyyy/MM/dd",
        "yyyy-MM-dd", "yyyyMMdd", "ddMMyyyy", "d-M-yyyy", "d/M/yyyy", "d-M-yy", "d/M/yy", };

  /** Number masks tried by guessType(), in order of preference: the first mask that fits every sample wins. */
  private static final String[] NUMBER_FORMATS =
      new String[] { "", "#", Const.DEFAULT_NUMBER_FORMAT, "0.00", "0000000000000", "###,###,###.#######",
        "###############.###############", "#####.###############%", };

  /**
   * Creates a string field that is not ignored, not repeated, not trimmed, has unknown precision (-1) and empty
   * format, symbols, null string and if-null value.
   *
   * @param fieldname the name of the field
   * @param position the position of the field
   * @param length the length of the field
   */
  public BaseFileField( String fieldname, int position, int length ) {
    this.name = fieldname;
    this.position = position;
    this.length = length;
    this.type = ValueMetaInterface.TYPE_STRING;
    this.ignore = false;
    this.format = "";
    this.trimtype = ValueMetaInterface.TRIM_TYPE_NONE;
    this.groupSymbol = "";
    this.decimalSymbol = "";
    this.currencySymbol = "";
    this.precision = -1;
    this.repeat = false;
    this.nullString = "";
    this.ifNullValue = "";
  }

  /** Creates a field without a name, with position and length both set to -1. */
  public BaseFileField() {
    this( null, -1, -1 );
  }

  /**
   * Orders this field against another one by position.
   *
   * @param obj the field to compare with; must be a {@code BaseFileField}
   * @return a negative value, zero or a positive value when this position is lower than, equal to or higher than
   *         the other position
   * @throws ClassCastException if {@code obj} is not a {@code BaseFileField}
   */
  public int compare( Object obj ) {
    BaseFileField field = (BaseFileField) obj;

    // Integer.compare instead of subtraction: the difference of two ints can overflow and flip the sign.
    return Integer.compare( position, field.getPosition() );
  }

  /**
   * Orders this field against another one by position.
   *
   * @param field the field to compare with
   * @return a negative value, zero or a positive value when this position is lower than, equal to or higher than
   *         the other position
   */
  @Override
  public int compareTo( TextFileInputFieldInterface field ) {
    // Integer.compare instead of subtraction: the difference of two ints can overflow and flip the sign.
    return Integer.compare( position, field.getPosition() );
  }

  /**
   * Tells whether another field sits at the same position as this one. No other attribute is compared.
   *
   * @param obj the field to compare with; must be a {@code BaseFileField}
   * @return true when both positions are equal
   * @throws ClassCastException if {@code obj} is not a {@code BaseFileField}
   */
  public boolean equal( Object obj ) {
    BaseFileField field = (BaseFileField) obj;

    return position == field.getPosition();
  }

  /**
   * Returns a shallow copy of this field as produced by {@code Object.clone()}; the sample array is shared with
   * the original.
   *
   * @throws IllegalArgumentException if cloning is not supported (carries the original exception as cause)
   */
  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch ( CloneNotSupportedException ex ) {
      throw new IllegalArgumentException( "Clone not supported for " + this.getClass().getName(), ex );
    }
  }

  /** @return the position of the field */
  @Override
  public int getPosition() {
    return position;
  }

  /** @param position the position of the field */
  public void setPosition( int position ) {
    this.position = position;
  }

  /** @return the length of the field */
  @Override
  public int getLength() {
    return length;
  }

  /** @param length the length of the field */
  @Override
  public void setLength( int length ) {
    this.length = length;
  }

  /** @return the name of the field */
  @Override
  public String getName() {
    return name;
  }

  /** @param fieldname the name of the field */
  public void setName( String fieldname ) {
    this.name = fieldname;
  }

  /** @return the data type id of the field (a {@code ValueMetaInterface.TYPE_*} value) */
  public int getType() {
    return type;
  }

  /** @return the name that {@code ValueMetaFactory} reports for the current data type id */
  public String getTypeDesc() {
    return ValueMetaFactory.getValueMetaName( type );
  }

  /** @param type the data type id of the field */
  public void setType( int type ) {
    this.type = type;
  }

  /**
   * Sets the data type from its name, resolved through {@code ValueMetaFactory.getIdForValueMeta}.
   *
   * @param value the data type name
   */
  @Injection( name = "FIELD_TYPE", group = "FIELDS" )
  public void setType( String value ) {
    this.type = ValueMetaFactory.getIdForValueMeta( value );
  }

  /** @return true when the field is flagged as ignored */
  public boolean isIgnored() {
    return ignore;
  }

  /** @param ignore true to flag the field as ignored */
  public void setIgnored( boolean ignore ) {
    this.ignore = ignore;
  }

  /** Inverts the ignored flag. */
  public void flipIgnored() {
    ignore = !ignore;
  }

  /** @return the format mask of the field */
  public String getFormat() {
    return format;
  }

  /** @param format the format mask of the field */
  public void setFormat( String format ) {
    this.format = format;
  }

  /**
   * Sets the sample values the guess methods work on. The array is stored by reference, and
   * {@link #guessTrimType()} replaces its elements with their trimmed form.
   *
   * @param samples the sample values, may be null
   */
  public void setSamples( String[] samples ) {
    this.samples = samples;
  }

  /** @return the trim type id of the field (a {@code ValueMetaInterface.TRIM_TYPE_*} value) */
  public int getTrimType() {
    return trimtype;
  }

  /** @return the code that {@code ValueMetaString} reports for the current trim type */
  public String getTrimTypeCode() {
    return ValueMetaString.getTrimTypeCode( trimtype );
  }

  /** @return the description that {@code ValueMetaString} reports for the current trim type */
  public String getTrimTypeDesc() {
    return ValueMetaString.getTrimTypeDesc( trimtype );
  }

  /** @param trimtype the trim type id of the field */
  public void setTrimType( int trimtype ) {
    this.trimtype = trimtype;
  }

  /**
   * Sets the trim type from its code, resolved through {@code ValueMetaString.getTrimTypeByCode}.
   *
   * @param value the trim type code
   */
  @Injection( name = "FIELD_TRIM_TYPE", group = "FIELDS" )
  public void setTrimType( String value ) {
    this.trimtype = ValueMetaString.getTrimTypeByCode( value );
  }

  /** @return the grouping symbol */
  public String getGroupSymbol() {
    return groupSymbol;
  }

  /** @param group_symbol the grouping symbol */
  public void setGroupSymbol( String group_symbol ) {
    this.groupSymbol = group_symbol;
  }

  /** @return the decimal symbol */
  public String getDecimalSymbol() {
    return decimalSymbol;
  }

  /** @param decimal_symbol the decimal symbol */
  public void setDecimalSymbol( String decimal_symbol ) {
    this.decimalSymbol = decimal_symbol;
  }

  /** @return the currency symbol */
  public String getCurrencySymbol() {
    return currencySymbol;
  }

  /** @param currency_symbol the currency symbol */
  public void setCurrencySymbol( String currency_symbol ) {
    this.currencySymbol = currency_symbol;
  }

  /** @return the precision of the field, -1 when unknown */
  public int getPrecision() {
    return precision;
  }

  /** @param precision the precision of the field */
  public void setPrecision( int precision ) {
    this.precision = precision;
  }

  /** @return true when the field is flagged as repeated */
  public boolean isRepeated() {
    return repeat;
  }

  /** @param repeat true to flag the field as repeated */
  public void setRepeated( boolean repeat ) {
    this.repeat = repeat;
  }

  /** Inverts the repeated flag. */
  public void flipRepeated() {
    repeat = !repeat;
  }

  /** @return the string that represents a null value in the samples */
  public String getNullString() {
    return nullString;
  }

  /** @param null_string the string that represents a null value in the samples */
  public void setNullString( String null_string ) {
    this.nullString = null_string;
  }

  /** @return the if-null value of the field */
  public String getIfNullValue() {
    return ifNullValue;
  }

  /** @param ifNullValue the if-null value of the field */
  public void setIfNullValue( String ifNullValue ) {
    this.ifNullValue = ifNullValue;
  }

  /** @return the field rendered as {@code name@position:length} */
  @Override
  public String toString() {
    return name + "@" + position + ":" + length;
  }

  /**
   * Runs all guesses on the samples, in this order: trim type, data type, ignore flag. The order matters because
   * the trim type guess trims the samples that the later guesses look at.
   */
  public void guess() {
    guessTrimType();
    guessType();
    guessIgnore();
  }

  /**
   * Derives the trim type from the samples: left trimming when any sample has leading spaces, right trimming when
   * any sample has trailing spaces, both flags combined when both occur. Every non-null sample is replaced in the
   * sample array by its trimmed form.
   * <p>
   * Without a sample array there is nothing to guess from and the trim type is left untouched.
   */
  public void guessTrimType() {
    if ( samples == null ) {
      return;
    }

    boolean spacesBefore = false;
    boolean spacesAfter = false;

    for ( int i = 0; i < samples.length; i++ ) {
      if ( samples[i] == null ) {
        continue; // nothing to measure or trim
      }
      spacesBefore |= Const.nrSpacesBefore( samples[i] ) > 0;
      spacesAfter |= Const.nrSpacesAfter( samples[i] ) > 0;
      samples[i] = Const.trim( samples[i] );
    }

    trimtype = ValueMetaInterface.TRIM_TYPE_NONE;

    if ( spacesBefore ) {
      trimtype |= ValueMetaInterface.TRIM_TYPE_LEFT;
    }
    if ( spacesAfter ) {
      trimtype |= ValueMetaInterface.TRIM_TYPE_RIGHT;
    }
  }

  /**
   * Derives the data type from the samples. Dates are tried first, then numbers; when neither fits the field
   * becomes a string.
   * <ul>
   * <li>Date: type and format are set, the other attributes are left as they were.</li>
   * <li>Number: type, format, precision, decimal symbol and grouping symbol are set. Whole numbers are reported
   * as numbers with precision 0, not as integers.</li>
   * <li>String: format and the decimal, grouping and currency symbols are emptied, precision becomes -1.</li>
   * </ul>
   * Samples equal to the null string (ignoring case) are skipped; when every sample is such a null value the field
   * stays a string. Without a sample array the type becomes string and nothing else changes.
   */
  public void guessType() {
    nf = NumberFormat.getInstance();
    // NumberFormat.getInstance() is not guaranteed to hand out a DecimalFormat, so do not cast blindly.
    df = nf instanceof DecimalFormat ? (DecimalFormat) nf : new DecimalFormat();
    dfs = new DecimalFormatSymbols();
    daf = new SimpleDateFormat();

    daf.setLenient( false );

    // Start with a string...
    type = ValueMetaInterface.TYPE_STRING;

    // If we have no samples, we assume a String...
    if ( samples == null ) {
      return;
    }

    if ( guessDateType() ) {
      return;
    }

    if ( guessNumberType() ) {
      return;
    }

    //
    // Assume it's a string...
    //
    type = ValueMetaInterface.TYPE_STRING;
    format = "";
    precision = -1;
    decimalSymbol = "";
    groupSymbol = "";
    currencySymbol = "";
  }

  /**
   * Checks whether every sample that is not a null value parses with at least one of the known date masks and
   * lands in the years 1800 to 2200. On success sets type and format and returns true.
   */
  private boolean guessDateType() {
    boolean[] candidate = new boolean[DATE_FORMATS.length];
    for ( int x = 0; x < candidate.length; x++ ) {
      candidate[x] = true;
    }
    int remaining = DATE_FORMATS.length;
    int nullCount = 0;
    Calendar cal = Calendar.getInstance();

    for ( int i = 0; i < samples.length; i++ ) {
      String sample = samples[i];

      if ( sample != null && sample.length() > 0 && sample.equalsIgnoreCase( nullString ) ) {
        nullCount++;
        continue;
      }

      // A missing or blank sample can not be a date, so no mask survives it.
      if ( sample == null || Const.onlySpaces( sample ) || sample.length() == 0 ) {
        return false;
      }

      for ( int x = 0; x < DATE_FORMATS.length; x++ ) {
        if ( !candidate[x] ) {
          continue;
        }

        boolean fits;
        try {
          daf.applyPattern( DATE_FORMATS[x] );
          Date date = daf.parse( sample );

          cal.setTime( date );
          int year = cal.get( Calendar.YEAR );
          fits = year >= 1800 && year <= 2200;
        } catch ( Exception e ) {
          fits = false;
        }

        if ( !fits ) {
          candidate[x] = false; // Don't try it again in the future.
          remaining--; // One less that works..
        }
      }

      if ( remaining == 0 ) {
        return false;
      }
    }

    // If all samples were NULL values, we can't really decide what the type is.
    // So we're certainly not going to take a date in that case.
    if ( remaining <= 0 || nullCount == samples.length ) {
      return false;
    }

    type = ValueMetaInterface.TYPE_DATE;
    format = DATE_FORMATS[firstCandidate( candidate )];
    return true;
  }

  /**
   * Checks whether every sample that is not a null value looks like a number and parses with at least one of the
   * known number masks. Sets the decimal and grouping symbols along the way; on success also sets type, format and
   * precision and returns true.
   */
  private boolean guessNumberType() {
    // Start from the default symbols; the samples may override them below.
    decimalSymbol = "" + dfs.getDecimalSeparator();
    groupSymbol = "" + dfs.getGroupingSeparator();

    boolean[] candidate = new boolean[NUMBER_FORMATS.length];
    int[] maxPrecision = new int[NUMBER_FORMATS.length];
    for ( int x = 0; x < candidate.length; x++ ) {
      candidate[x] = true;
      maxPrecision[x] = -1;
    }
    int remaining = NUMBER_FORMATS.length;
    int nullCount = 0;

    for ( int i = 0; i < samples.length; i++ ) {
      // A missing sample is handled like an empty one: it fails every mask below.
      String field = samples[i] == null ? "" : samples[i];

      if ( field.length() > 0 && field.equalsIgnoreCase( nullString ) ) {
        nullCount++;
        continue;
      }

      boolean containsDot = false;
      boolean containsComma = false;

      for ( int c = 0; c < field.length(); c++ ) {
        char ch = field.charAt( c );
        boolean leadingMinus = ch == '-' && c == 0;
        // Allowed: digits, dot, comma, a minus sign in front and E/e for exponential notation.
        if ( !Character.isDigit( ch ) && ch != '.' && ch != ',' && !leadingMinus && ch != 'E' && ch != 'e' ) {
          return false;
        }
        containsDot |= ch == '.';
        containsComma |= ch == ',';
      }

      if ( containsDot && !containsComma ) { // American style 174.5
        useSeparators( '.', ',' );
      } else if ( !containsDot && containsComma ) { // European style 174,5
        useSeparators( ',', '.' );
      } else if ( containsDot && containsComma ) { // Both appear!
        // The symbol whose first occurrence comes later is taken as the decimal point.
        if ( field.indexOf( '.' ) > field.indexOf( ',' ) ) {
          useSeparators( '.', ',' );
        } else {
          useSeparators( ',', '.' );
        }
      }

      // Try long integers first; this does not depend on the mask so it is done once per sample.
      // Samples with a dot or comma are never long integers and always go through the masks.
      boolean isLong = false;
      if ( !containsDot && !containsComma ) {
        try {
          Long.parseLong( field );
          isLong = true;
        } catch ( NumberFormatException e ) {
          isLong = false;
        }
      }

      // Try the remaining possible number formats!
      for ( int x = 0; x < NUMBER_FORMATS.length; x++ ) {
        if ( !candidate[x] ) {
          continue;
        }

        try {
          int prec;
          if ( isLong ) {
            prec = 0;
          } else { // Try the double
            df.setDecimalFormatSymbols( dfs );
            df.applyPattern( NUMBER_FORMATS[x] );

            double d = df.parse( field ).doubleValue();
            prec = guessPrecision( d );
          }
          if ( prec > maxPrecision[x] ) {
            maxPrecision[x] = prec;
          }
        } catch ( Exception e ) {
          candidate[x] = false; // Don't try it again in the future.
          remaining--; // One less that works..
        }
      }

      if ( remaining == 0 ) {
        return false;
      }
    }

    // If all sample strings represent NULL values we can't take a number as type.
    if ( remaining <= 0 || nullCount == samples.length ) {
      return false;
    }

    int first = firstCandidate( candidate );
    type = ValueMetaInterface.TYPE_NUMBER;
    format = NUMBER_FORMATS[first];
    precision = maxPrecision[first];
    return true;
  }

  /** Applies the decimal and grouping symbols to both the scratch format symbols and the field attributes. */
  private void useSeparators( char decimal, char grouping ) {
    dfs.setDecimalSeparator( decimal );
    decimalSymbol = String.valueOf( decimal );
    dfs.setGroupingSeparator( grouping );
    groupSymbol = String.valueOf( grouping );
  }

  /** Returns the index of the first true flag, or -1 when there is none. */
  private static int firstCandidate( boolean[] candidate ) {
    for ( int i = 0; i < candidate.length; i++ ) {
      if ( candidate[i] ) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Guesses the number of digits behind the decimal point of a value. A value counts as having {@code n} decimals
   * when, multiplied by 10^n, it lies within 0.00005 of a whole number. Rounding to the nearest whole number makes
   * values like 7.99995 (or 28.999999999999996, the floating point result of 0.29 * 100) count as whole.
   *
   * @param d the value to examine
   * @return 0 when the value is a whole number, 1 to 3 for that many decimals, -1 when the precision is unknown
   *         (more than 3 decimals, or not a finite number)
   */
  public static final int guessPrecision( double d ) {
    int maxprec = 4;
    double maxdiff = 0.00005;

    // Make sure that 7.99995 == 8.00000
    // This is usually a rounding error!
    if ( Math.abs( Math.rint( d ) - d ) < maxdiff ) {
      return 0; // nothing behind decimal point...
    }

    // Shift the decimal point one digit at a time: 12.345 --> 123.45 --> 1234.5 --> 12345
    // and stop as soon as nothing is left behind it. Precision is capped below maxprec.
    for ( int i = 1; i < maxprec; i++ ) {
      double scaled = d * Math.pow( 10.0, i );
      if ( Math.abs( Math.rint( scaled ) - scaled ) < maxdiff ) {
        return i;
      }
    }

    // Unknown length!
    return -1;
  }

  /**
   * Flags the field as ignored when the samples carry no information: all of them consist of spaces only, all of
   * them are empty, or all of them equal the null string (ignoring case). An empty sample array also leads to the
   * field being ignored. The flag is only ever set here, never cleared.
   * <p>
   * Without a sample array there is nothing to guess from and the flag is left untouched. A null sample is
   * handled like an empty string.
   */
  public void guessIgnore() {
    if ( samples == null ) {
      return;
    }

    boolean allSpaces = true;
    boolean allEmpty = true;
    boolean allNull = true;

    for ( int i = 0; i < samples.length; i++ ) {
      String sample = samples[i] == null ? "" : samples[i];

      if ( !Const.onlySpaces( sample ) ) {
        allSpaces = false;
      }
      if ( sample.length() > 0 ) {
        allEmpty = false;
      }
      if ( !sample.equalsIgnoreCase( nullString ) ) {
        allNull = false;
      }
    }

    if ( allSpaces || allEmpty || allNull ) {
      ignore = true;
    }
  }

  /**
   * Creates a new field through the three-argument constructor, so all other attributes start at their defaults.
   *
   * @param newFieldname the name of the new field
   * @param x the position of the new field
   * @param newlength the length of the new field
   * @return the new field
   */
  @Override
  public TextFileInputFieldInterface createNewInstance( String newFieldname, int x, int newlength ) {
    return new BaseFileField( newFieldname, x, newlength );
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy