All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pentaho.di.trans.steps.fuzzymatch.FuzzyMatch Maven / Gradle / Ivy

The newest version!
/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.fuzzymatch;

import java.util.Iterator;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.RowSet;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

import com.wcohen.ss.Jaro;
import com.wcohen.ss.JaroWinkler;
import com.wcohen.ss.NeedlemanWunsch;

/**
 * Performs a fuzzy match for each row of the main stream: an approximate match is searched for in a lookup stream.
 *
 * @author Samatar
 * @since 03-mars-2008
 */
public class FuzzyMatch extends BaseStep implements StepInterface {
  // Anchor class handed to BaseMessages.getString( PKG, ... ) for every message lookup in this step.
  private static Class<?> PKG = FuzzyMatchMeta.class; // for i18n purposes, needed by Translator2!!

  // Step settings; reassigned from the smi argument in init(), processRow() and dispose().
  private FuzzyMatchMeta meta;
  // Runtime state of the step (lookup cache, field indexes, thresholds); reassigned from the sdi argument.
  private FuzzyMatchData data;

  /**
   * Creates the step instance; all arguments are passed unchanged to the {@link BaseStep} constructor.
   */
  public FuzzyMatch( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr,
    TransMeta transMeta, Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  /**
   * Reads every row of the info (lookup) stream and stores one cache entry per row.
   * <p>
   * Each cache entry holds the lookup key in slot 0 (a null key is stored as an empty string) followed by the
   * configured additional fields. Values kept in binary-string storage are converted to normal storage first.
   *
   * @return {@code false} when the info stream has no step attached, {@code true} once the stream is exhausted
   * @throws KettleException if the lookup field or an additional field is missing from the lookup row layout, or
   *           if a row cannot be read, converted or cached
   */
  private boolean readLookupValues() throws KettleException {
    data.infoStream = meta.getStepIOMeta().getInfoStreams().get( 0 );
    if ( data.infoStream.getStepMeta() == null ) {
      logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.NoLookupStepSpecified" ) );
      return false;
    }

    final String lookupStepName = data.infoStream.getStepname();
    if ( isDetailed() ) {
      logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadingFromStream" ) + lookupStepName + "]" );
    }

    // All lookup rows come from the row set fed by the info step.
    final RowSet lookupRowSet = findInputRowSet( lookupStepName );
    boolean layoutResolved = false;

    for ( Object[] lookupRow = getRowFrom( lookupRowSet ); lookupRow != null; lookupRow =
      getRowFrom( lookupRowSet ) ) {

      if ( !layoutResolved ) {
        // The row layout is only known once the first row has arrived.
        prepareLookupCacheMeta( lookupRowSet );
        layoutResolved = true;
      }

      final RowMetaInterface lookupRowMeta = lookupRowSet.getRowMeta();
      if ( log.isRowLevel() ) {
        logRowlevel( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadLookupRow" )
          + lookupRowMeta.getString( lookupRow ) );
      }

      Object[] cacheEntry = new Object[data.nrCachedFields];

      // Slot 0: the key. A missing key is cached as an empty string.
      final int keyIndex = data.indexOfCachedFields[0];
      if ( lookupRow[keyIndex] != null ) {
        ValueMetaInterface keySourceMeta = lookupRowMeta.getValueMeta( keyIndex );
        cacheEntry[0] = keySourceMeta.isStorageBinaryString()
          ? keySourceMeta.convertToNormalStorageType( lookupRow[keyIndex] )
          : lookupRow[keyIndex];
      } else {
        cacheEntry[0] = "";
      }

      // Remaining slots: the additional return fields (none when nrCachedFields is 1).
      for ( int slot = 1; slot < data.nrCachedFields; slot++ ) {
        final int sourceIndex = data.indexOfCachedFields[slot];
        ValueMetaInterface sourceMeta = lookupRowMeta.getValueMeta( sourceIndex );
        cacheEntry[slot] = sourceMeta.isStorageBinaryString()
          ? sourceMeta.convertToNormalStorageType( lookupRow[sourceIndex] )
          : lookupRow[sourceIndex];
      }

      if ( isDebug() ) {
        logDebug( BaseMessages.getString( PKG, "FuzzyMatch.Log.AddingValueToCache", data.infoCache
          .getString( cacheEntry ) ) );
      }

      addToCache( cacheEntry );
    }

    return true;
  }

  /**
   * Resolves the lookup row layout: clones the row metadata of the lookup row set, locates the key field and the
   * additional fields, and builds the metadata describing a cache entry.
   *
   * @param lookupRowSet the row set the lookup rows are read from
   * @throws KettleException if the key field or one of the additional fields cannot be found
   */
  private void prepareLookupCacheMeta( RowSet lookupRowSet ) throws KettleException {
    data.infoMeta = lookupRowSet.getRowMeta().clone();

    final int keyIndex = data.infoMeta.indexOfValue( environmentSubstitute( meta.getLookupField() ) );
    if ( keyIndex < 0 ) {
      throw new KettleException( BaseMessages.getString(
        PKG, "FuzzyMatch.Exception.CouldnotFindLookField", meta.getLookupField() ) );
    }

    data.infoCache = new RowMeta();
    ValueMetaInterface keyMeta = data.infoMeta.getValueMeta( keyIndex );
    keyMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_NORMAL );
    data.infoCache.addValueMeta( keyMeta );
    data.indexOfCachedFields[0] = keyIndex;

    if ( !data.addAdditionalFields ) {
      return;
    }

    final String[] extraFields = meta.getValue();
    for ( int i = 0; i < extraFields.length; i++ ) {
      final int slot = i + 1;
      data.indexOfCachedFields[slot] = data.infoMeta.indexOfValue( extraFields[i] );
      if ( data.indexOfCachedFields[slot] < 0 ) {
        throw new KettleException( BaseMessages.getString(
          PKG, "FuzzyMatch.Exception.CouldnotFindLookField", extraFields[i] ) );
      }
      ValueMetaInterface extraMeta = data.infoMeta.getValueMeta( data.indexOfCachedFields[slot] );
      extraMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_NORMAL );
      data.infoCache.addValueMeta( extraMeta );
    }
    data.nrCachedFields += extraFields.length;
  }

  /**
   * Builds the output row for one main stream row by appending the fuzzy match result.
   * <p>
   * On the first call the output row layout is derived from the input layout plus the fields contributed by the
   * step metadata, and the index of the main stream field is resolved.
   *
   * @param rowMeta layout of {@code row}; its size marks where the match fields are appended
   * @param row the main stream row
   * @return {@code row} extended with the match fields (an empty block when the main field is null)
   * @throws KettleException if the main stream field is not part of the input row, or if the cache lookup fails
   *           (any exception from the lookup is wrapped in a {@link KettleStepException})
   */
  private Object[] lookupValues( RowMetaInterface rowMeta, Object[] row ) throws KettleException {
    if ( first ) {
      first = false;

      data.outputRowMeta = getInputRowMeta().clone();
      meta.getFields(
        data.outputRowMeta, getStepname(), new RowMetaInterface[] { data.infoMeta }, null, this, repository,
        metaStore );

      // Resolve the position of the field to match on.
      String mainFieldName = environmentSubstitute( meta.getMainStreamField() );
      data.indexOfMainField = getInputRowMeta().indexOfValue( mainFieldName );
      if ( data.indexOfMainField < 0 ) {
        throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Exception.CouldnotFindMainField", meta
          .getMainStreamField() ) );
      }
    }

    Object[] matchFields;
    if ( row[ data.indexOfMainField ] != null ) {
      try {
        matchFields = getFromCache( row );
      } catch ( Exception e ) {
        throw new KettleStepException( e );
      }
    } else {
      // Nothing to match on: append empty match fields.
      matchFields = buildEmptyRow();
    }

    return RowDataUtil.addRowData( row, rowMeta.size(), matchFields );
  }

  /**
   * Adds one lookup entry to the in-memory cache ({@code data.look}).
   *
   * @param value the cache entry: key in slot 0, additional fields after it
   * @throws KettleException if the JVM runs out of heap while storing the entry; the original
   *           {@link OutOfMemoryError} is kept as the cause so its stack trace is not lost
   */
  private void addToCache( Object[] value ) throws KettleException {
    try {
      data.look.add( value );
    } catch ( OutOfMemoryError o ) {
      // Report heap exhaustion as a step failure, preserving the error as the cause.
      throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Error.JavaHeap", o.toString() ), o );
    }
  }

  /**
   * Dispatches the cache lookup to the matcher that handles the configured algorithm type.
   *
   * @param keyRow the main stream row
   * @return the match fields produced by the selected matcher, or {@code null} when the algorithm type is not one
   *         of the known distance, phonetic or similarity types
   * @throws KettleValueException if the row cannot be rendered for debug logging or the matcher fails to read it
   */
  private Object[] getFromCache( Object[] keyRow ) throws KettleValueException {
    if ( isDebug() ) {
      logDebug( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadingMainStreamRow", getInputRowMeta().getString(
        keyRow ) ) );
    }

    switch ( meta.getAlgorithmType() ) {
      // Edit-distance metrics
      case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN:
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        return doDistance( keyRow );

      // Phonetic encoders
      case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE:
      case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE:
      case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX:
      case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX:
        return doPhonetic( keyRow );

      // Similarity scores
      case FuzzyMatchMeta.OPERATION_TYPE_JARO:
      case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER:
      case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY:
        return doSimilarity( keyRow );

      default:
        return null;
    }
  }

  /**
   * Matches the main stream value against every cached lookup key using an edit-distance metric.
   * <p>
   * The metric follows {@code meta.getAlgorithmType()}: Damerau-Levenshtein, Needleman-Wunsch (absolute value of
   * the score, truncated to an int) or, for any other type, Levenshtein. When the step is not case sensitive both
   * strings are lower-cased before measuring. Only candidates whose distance lies within
   * {@code [data.minimalDistance, data.maximalDistance]} are considered.
   * <ul>
   * <li>"Closer value" mode keeps the candidate with the smallest distance (the first one wins a tie) together
   * with, optionally, the distance and the additional cached fields.</li>
   * <li>Otherwise all candidate keys are concatenated into the first field, separated by
   * {@code data.valueSeparator}.</li>
   * </ul>
   *
   * @param row the main stream row
   * @return the match fields; all entries stay null when nothing matched
   * @throws KettleValueException if the main stream value cannot be converted to a String
   */
  private Object[] doDistance( Object[] row ) throws KettleValueException {
    // Reserve room
    Object[] rowData = buildEmptyRow();

    String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField );
    if ( lookupvalue == null ) {
      // A value without a string form cannot be measured; report "no match" instead of failing in the metric.
      return rowData;
    }

    // Loop invariants: case handling of the lookup value does not depend on the cached row.
    final boolean caseSensitive = meta.isCaseSensitive();
    final String uselookupvalue = caseSensitive ? lookupvalue : lookupvalue.toLowerCase();

    // Best distance found so far; -1 means "none yet".
    long distance = -1;

    // The cache stores Object[] entries; the explicit cast keeps this correct whatever the declared element type.
    Iterator<?> it = data.look.iterator();
    while ( it.hasNext() ) {
      Object[] cachedData = (Object[]) it.next();
      // Key value is the first value
      String cacheValue = (String) cachedData[0];
      String usecacheValue = caseSensitive ? cacheValue : cacheValue.toLowerCase();

      int cdistance;
      switch ( meta.getAlgorithmType() ) {
        case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
          cdistance = Utils.getDamerauLevenshteinDistance( usecacheValue, uselookupvalue );
          break;
        case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
          cdistance = Math.abs( (int) new NeedlemanWunsch().score( usecacheValue, uselookupvalue ) );
          break;
        default:
          cdistance = StringUtils.getLevenshteinDistance( usecacheValue, uselookupvalue );
          break;
      }

      if ( cdistance < data.minimalDistance || cdistance > data.maximalDistance ) {
        // Outside the accepted distance window.
        continue;
      }

      if ( meta.isGetCloserValue() ) {
        if ( cdistance < distance || distance == -1 ) {
          // New closest candidate: overwrite the previous result.
          distance = cdistance;
          int index = 0;
          rowData[index++] = cacheValue;
          // Add metric value?
          if ( data.addValueFieldName ) {
            rowData[index++] = distance;
          }
          // Add additional return values? They sit right after the key in the cached entry.
          if ( data.addAdditionalFields ) {
            System.arraycopy( cachedData, 1, rowData, index, meta.getValue().length );
          }
        }
      } else {
        // Collect all candidate keys, separated by the value separator.
        if ( rowData[0] == null ) {
          rowData[0] = cacheValue;
        } else {
          rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue;
        }
      }
    }

    return rowData;
  }

  /**
   * Matches the main stream value against every cached lookup key by comparing phonetic codes.
   * <p>
   * Both values are encoded with {@code getEncodedMF} for the configured algorithm type; a cached key matches when
   * its code equals the code of the main stream value. Every match overwrites the previous one, so the result
   * describes the last matching entry met while iterating the cache.
   * <p>
   * The main stream value is read through the input row metadata (as {@code doDistance} does) rather than cast
   * directly, so values that are not plain {@code String} objects are converted instead of failing the cast.
   *
   * @param row the main stream row
   * @return the match fields: matched key, optionally its phonetic code, optionally the additional cached fields;
   *         all entries stay null when nothing matched or the main stream value has no phonetic code
   * @throws KettleValueException if the main stream value cannot be converted to a String
   */
  private Object[] doPhonetic( Object[] row ) throws KettleValueException {
    // Reserve room
    Object[] rowData = buildEmptyRow();

    String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField );
    if ( lookupvalue == null ) {
      return rowData;
    }

    final int algorithmType = meta.getAlgorithmType();
    String lookupValueMF = getEncodedMF( lookupvalue, algorithmType );
    if ( lookupValueMF == null ) {
      // Some encoders yield null (e.g. DoubleMetaphone for empty input): nothing to compare against.
      return rowData;
    }

    // The cache stores Object[] entries; the explicit cast keeps this correct whatever the declared element type.
    Iterator<?> it = data.look.iterator();
    while ( it.hasNext() ) {
      Object[] cachedData = (Object[]) it.next();
      // Key value is the first value
      String cacheValue = (String) cachedData[0];
      String cacheValueMF = getEncodedMF( cacheValue, algorithmType );

      if ( !lookupValueMF.equals( cacheValueMF ) ) {
        continue;
      }

      // Add match value
      int index = 0;
      rowData[index++] = cacheValue;

      // Add metric value?
      if ( data.addValueFieldName ) {
        rowData[index++] = cacheValueMF;
      }
      // Add additional return values? They sit right after the key in the cached entry.
      if ( data.addAdditionalFields ) {
        System.arraycopy( cachedData, 1, rowData, index, meta.getValue().length );
      }
    }

    return rowData;
  }

  /**
   * Encodes a value with the phonetic encoder that corresponds to the given algorithm type.
   *
   * @param value the text to encode
   * @param algorithmType one of the phonetic {@code FuzzyMatchMeta.OPERATION_TYPE_*} constants
   * @return the code produced by the selected encoder, or an empty string for any other algorithm type
   */
  private String getEncodedMF( String value, Integer algorithmType ) {
    switch ( algorithmType ) {
      case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE:
        return new Metaphone().metaphone( value );
      case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE:
        return new DoubleMetaphone().doubleMetaphone( value );
      case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX:
        return new Soundex().encode( value );
      case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX:
        return new RefinedSoundex().encode( value );
      default:
        return "";
    }
  }

  /**
   * Matches the main stream value against every cached lookup key using a similarity score.
   * <p>
   * The score follows {@code meta.getAlgorithmType()}: Jaro, Jaro-Winkler or, for any other type, letter-pair
   * similarity. Only candidates whose score lies within {@code [data.minimalSimilarity, data.maximalSimilarity]}
   * are considered.
   * <ul>
   * <li>"Closer value" mode keeps a candidate when its score is strictly higher than the best so far, or when its
   * score is 0 and it equals the main stream value; the score and the additional cached fields are optionally
   * added.</li>
   * <li>Otherwise all candidate keys are concatenated into the first field, separated by
   * {@code data.valueSeparator}.</li>
   * </ul>
   * The main stream value is read through the input row metadata (as {@code doDistance} does) rather than cast
   * directly, so values that are not plain {@code String} objects are converted instead of failing the cast.
   *
   * @param row the main stream row
   * @return the match fields; all entries stay null when nothing matched
   * @throws KettleValueException if the main stream value cannot be converted to a String
   */
  private Object[] doSimilarity( Object[] row ) throws KettleValueException {
    // Reserve room
    Object[] rowData = buildEmptyRow();

    // Best score found so far.
    double similarity = 0;

    // A missing value is compared as an empty string.
    String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField );
    if ( lookupvalue == null ) {
      lookupvalue = "";
    }

    // The cache stores Object[] entries; the explicit cast keeps this correct whatever the declared element type.
    Iterator<?> it = data.look.iterator();
    while ( it.hasNext() ) {
      Object[] cachedData = (Object[]) it.next();
      // Key value is the first value
      String cacheValue = (String) cachedData[0];

      double csimilarity;
      switch ( meta.getAlgorithmType() ) {
        case FuzzyMatchMeta.OPERATION_TYPE_JARO:
          csimilarity = new Jaro().score( cacheValue, lookupvalue );
          break;
        case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER:
          csimilarity = new JaroWinkler().score( cacheValue, lookupvalue );
          break;
        default:
          // Letters pair similarity
          csimilarity = LetterPairSimilarity.getSimiliarity( cacheValue, lookupvalue );
          break;
      }

      if ( csimilarity < data.minimalSimilarity || csimilarity > data.maximalSimilarity ) {
        // Outside the accepted similarity window.
        continue;
      }

      if ( meta.isGetCloserValue() ) {
        if ( csimilarity > similarity || ( csimilarity == 0 && cacheValue.equals( lookupvalue ) ) ) {
          similarity = csimilarity;
          // Update match value
          int index = 0;
          rowData[index++] = cacheValue;
          // Add metric value?
          if ( data.addValueFieldName ) {
            rowData[index++] = Double.valueOf( similarity );
          }
          // Add additional return values? They sit right after the key in the cached entry.
          if ( data.addAdditionalFields ) {
            System.arraycopy( cachedData, 1, rowData, index, meta.getValue().length );
          }
        }
      } else {
        // Collect all candidate keys, separated by the value separator.
        if ( rowData[0] == null ) {
          rowData[0] = cacheValue;
        } else {
          rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue;
        }
      }
    }

    return rowData;
  }

  /**
   * Allocates a blank row sized after the output row metadata.
   *
   * @return the array returned by {@code RowDataUtil.allocateRowData} for {@code data.outputRowMeta.size()} fields
   */
  private Object[] buildEmptyRow() {
    return RowDataUtil.allocateRowData( data.outputRowMeta.size() );
  }

  /**
   * Processes one main stream row: loads the lookup cache on the first call, then matches the row and forwards it.
   *
   * @param smi the step metadata, expected to be a {@link FuzzyMatchMeta}
   * @param sdi the step data, expected to be a {@link FuzzyMatchData}
   * @return {@code true} while more rows may follow; {@code false} when the input is exhausted, the lookup stream
   *         could not be read, or a lookup failed while error handling is not enabled on the step
   * @throws KettleException if reading the lookup stream or moving rows fails
   */
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (FuzzyMatchMeta) smi;
    data = (FuzzyMatchData) sdi;

    // One-time load of the lookup stream into memory.
    if ( data.readLookupValues ) {
      data.readLookupValues = false;

      if ( !readLookupValues() ) {
        logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.UnableToReadDataFromLookupStream" ) );
        setErrors( 1 );
        stopAll();
        return false;
      }
      if ( isDetailed() ) {
        logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadValuesInMemory", data.look.size() ) );
      }
    }

    Object[] inputRow = getRow();
    if ( inputRow == null ) {
      // Main stream exhausted.
      if ( isDetailed() ) {
        logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.StoppedProcessingWithEmpty", getLinesRead() ) );
      }
      setOutputDone();
      return false;
    }

    try {
      // Match the row against the cached lookup values.
      Object[] outputRow = lookupValues( getInputRowMeta(), inputRow );
      if ( outputRow == null ) {
        setOutputDone(); // signal end to receiver(s)
        return false;
      }
      putRow( data.outputRowMeta, outputRow );

      if ( checkFeedback( getLinesRead() ) && log.isBasic() ) {
        logBasic( BaseMessages.getString( PKG, "FuzzyMatch.Log.LineNumber" ) + getLinesRead() );
      }
    } catch ( KettleException e ) {
      if ( !getStepMeta().isDoingErrorHandling() ) {
        // No error hop configured: the failure stops the transformation.
        logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.ErrorInStepRunning" ) + e.getMessage() );
        setErrors( 1 );
        stopAll();
        setOutputDone(); // signal end to receiver(s)
        return false;
      }

      // Error handling is enabled: divert the offending row to the error stream and carry on.
      putError( getInputRowMeta(), inputRow, 1, e.toString(), meta.getMainStreamField(), "FuzzyMatch001" );
    }

    return true;
  }

  /**
   * Validates the step settings and prepares the runtime data before the first row is processed.
   * <p>
   * Checks that the main stream field, the lookup field and the output match field are set, decides whether a
   * metric value and additional lookup fields are returned, sizes the cached-field index array and reads the
   * minimum/maximum thresholds (and the value separator) for distance and similarity algorithms.
   *
   * @param smi the step metadata, expected to be a {@link FuzzyMatchMeta}
   * @param sdi the step data, expected to be a {@link FuzzyMatchData}
   * @return {@code true} when initialization succeeded; {@code false} when the base initialization failed or a
   *         mandatory field name is missing
   */
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (FuzzyMatchMeta) smi;
    data = (FuzzyMatchData) sdi;

    if ( !super.init( smi, sdi ) ) {
      return false;
    }

    // Mandatory settings: main stream field, lookup field, output match field.
    if ( Utils.isEmpty( meta.getMainStreamField() ) ) {
      logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.MainStreamFieldMissing" ) );
      return false;
    }
    if ( Utils.isEmpty( meta.getLookupField() ) ) {
      logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.LookupStreamFieldMissing" ) );
      return false;
    }
    if ( Utils.isEmpty( environmentSubstitute( meta.getOutputMatchField() ) ) ) {
      logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.OutputMatchFieldMissing" ) );
      return false;
    }

    // The metric (distance, similarity, ...) is only returned when a field name is given for it
    // and only the closest value is requested.
    data.addValueFieldName =
      !Utils.isEmpty( environmentSubstitute( meta.getOutputValueField() ) ) && meta.isGetCloserValue();

    final int algorithm = meta.getAlgorithmType();

    // The key is always cached; additional fields are cached in "closer value" mode or for phonetic algorithms.
    int nrFields = 1;
    if ( meta.getValue() != null && meta.getValue().length > 0 ) {
      boolean cacheAdditionalFields = meta.isGetCloserValue()
        || algorithm == FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE
        || algorithm == FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX
        || algorithm == FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX
        || algorithm == FuzzyMatchMeta.OPERATION_TYPE_METAPHONE;
      if ( cacheAdditionalFields ) {
        data.addAdditionalFields = true;
        nrFields += meta.getValue().length;
      }
    }
    data.indexOfCachedFields = new int[nrFields];

    switch ( algorithm ) {
      case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN:
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        // Distance window; defaults are 0 and 5.
        data.minimalDistance = Const.toInt( environmentSubstitute( meta.getMinimalValue() ), 0 );
        if ( isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MinimalDistance", data.minimalDistance ) );
        }
        data.maximalDistance = Const.toInt( environmentSubstitute( meta.getMaximalValue() ), 5 );
        if ( isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MaximalDistance", data.maximalDistance ) );
        }
        initValueSeparator();
        break;

      case FuzzyMatchMeta.OPERATION_TYPE_JARO:
      case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER:
      case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY:
        // Similarity window; defaults are 0 and 1.
        data.minimalSimilarity = Const.toDouble( environmentSubstitute( meta.getMinimalValue() ), 0 );
        if ( isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MinimalSimilarity", data.minimalSimilarity ) );
        }
        data.maximalSimilarity = Const.toDouble( environmentSubstitute( meta.getMaximalValue() ), 1 );
        if ( isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MaximalSimilarity", data.maximalSimilarity ) );
        }
        initValueSeparator();
        break;

      default:
        // Phonetic algorithms need no thresholds.
        break;
    }

    data.readLookupValues = true;

    return true;
  }

  /**
   * Resolves the separator used to join several matches; only needed when all matches are returned instead of
   * the closest one.
   */
  private void initValueSeparator() {
    if ( meta.isGetCloserValue() ) {
      return;
    }
    data.valueSeparator = environmentSubstitute( meta.getSeparator() );
    if ( isDetailed() ) {
      logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.Separator", data.valueSeparator ) );
    }
  }

  /**
   * Empties the in-memory lookup cache, then delegates to the base step's dispose.
   *
   * @param smi the step metadata, expected to be a {@link FuzzyMatchMeta}
   * @param sdi the step data, expected to be a {@link FuzzyMatchData}
   */
  public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (FuzzyMatchMeta) smi;
    data = (FuzzyMatchData) sdi;

    // Drop the cached lookup rows before the base class cleans up.
    data.look.clear();

    super.dispose( smi, sdi );
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy