All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pentaho.di.trans.TransProfileFactory Maven / Gradle / Ivy

The newest version!
/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans;

import org.pentaho.di.core.database.Database;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.exception.KettleDatabaseException;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LoggingObjectInterface;
import org.pentaho.di.core.plugins.PluginRegistry;
import org.pentaho.di.core.plugins.StepPluginType;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.steps.dummytrans.DummyTransMeta;
import org.pentaho.di.trans.steps.groupby.GroupByMeta;
import org.pentaho.di.trans.steps.tableinput.TableInputMeta;

/**
 * Helper class to generate profiling transformations...
 *
 * @author Matt Casters ([email protected])
 *
 */
public class TransProfileFactory {
  public static final String RESULT_STEP_NAME = "calc stats";

  private DatabaseMeta databaseMeta;
  private String schemaTable;

  private RowMetaInterface tableLayout;

  /**
   *
   * @param parent
   * @param databaseMeta
   * @param schemaTable
   *          the properly quoted schema-table combination
   * @param degreeParallelism
   *          The degree of parallelism or the number of copies that the calculation module is running with.
   */
  public TransProfileFactory( DatabaseMeta databaseMeta, String schemaTable ) {
    this.databaseMeta = databaseMeta;
    this.schemaTable = schemaTable;
  }

  public TransMeta generateTransformation( LoggingObjectInterface parentLoggingInterface ) throws KettleException {
    PluginRegistry registry = PluginRegistry.getInstance();

    // Get the list of fields from the table...
    //
    tableLayout = getTableFields( parentLoggingInterface );

    // Now start building the transformation...
    //
    TransMeta transMeta = new TransMeta( databaseMeta );
    transMeta.addDatabase( databaseMeta );

    // Create a step to read the content of the table
    // Read the data from the database table...
    // For now we read it all, later we add options to only read the first X rows
    //
    TableInputMeta readMeta = new TableInputMeta();
    readMeta.setSQL( "SELECT * FROM " + schemaTable );
    readMeta.setDatabaseMeta( databaseMeta );
    StepMeta read = new StepMeta( registry.getPluginId( StepPluginType.class, readMeta ), "Read data", readMeta );
    read.setLocation( 50, 50 );
    read.setDraw( true );
    transMeta.addStep( read );

    // Grab the data types too
    //

    // Now calculate the requested statistics for all fields...
    // TODO: create configuration possibility
    // For now, just do : min, max, sum, count, avg, std dev. (7)
    //
    int[] numericCalculations =
      new int[] {
        GroupByMeta.TYPE_GROUP_MIN, GroupByMeta.TYPE_GROUP_MAX, GroupByMeta.TYPE_GROUP_SUM,
        GroupByMeta.TYPE_GROUP_COUNT_ALL, GroupByMeta.TYPE_GROUP_AVERAGE,
        GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION, };

    int[] stringCalculations =
      new int[] { GroupByMeta.TYPE_GROUP_MIN, GroupByMeta.TYPE_GROUP_MAX, GroupByMeta.TYPE_GROUP_COUNT_ALL, };

    int[] dateCalculations =
      new int[] { GroupByMeta.TYPE_GROUP_MIN, GroupByMeta.TYPE_GROUP_MAX, GroupByMeta.TYPE_GROUP_COUNT_ALL, };

    int[] booleanCalculations =
      new int[] { GroupByMeta.TYPE_GROUP_MIN, GroupByMeta.TYPE_GROUP_MAX, GroupByMeta.TYPE_GROUP_COUNT_ALL, };

    // Run it through the "group by" step without a grouping.
    // Later, we can use the UnivariateStats plugin/step perhaps.
    //
    GroupByMeta statsMeta = new GroupByMeta();
    int nrNumeric = 0;
    int nrDates = 0;
    int nrStrings = 0;
    int nrBooleans = 0;
    for ( ValueMetaInterface valueMeta : tableLayout.getValueMetaList() ) {
      if ( valueMeta.isNumeric() ) {
        nrNumeric++;
      }
      if ( valueMeta.isDate() ) {
        nrDates++;
      }
      if ( valueMeta.isString() ) {
        nrStrings++;
      }
      if ( valueMeta.isBoolean() ) {
        nrBooleans++;
      }
    }
    int nrCalculations =
      nrNumeric
        * numericCalculations.length + nrDates * dateCalculations.length + nrStrings
        * stringCalculations.length + nrBooleans * booleanCalculations.length;

    statsMeta.allocate( 0, nrCalculations );
    int calcIndex = 0;
    for ( int i = 0; i < tableLayout.size(); i++ ) {
      ValueMetaInterface valueMeta = tableLayout.getValueMeta( i );
      // Numeric data...
      //
      if ( valueMeta.isNumeric() ) {
        //CHECKSTYLE:Indentation:OFF
        //CHECKSTYLE:LineLength:OFF
        for ( int c = 0; c < numericCalculations.length; c++ ) {
          statsMeta.getAggregateField()[calcIndex] = valueMeta.getName() + "(" + GroupByMeta.getTypeDesc( numericCalculations[c] ) + ")";
          statsMeta.getSubjectField()[calcIndex] = valueMeta.getName();
          statsMeta.getAggregateType()[calcIndex] = numericCalculations[c];
          calcIndex++;
        }
      }

      // String data
      //
      if ( valueMeta.isString() ) {
        //CHECKSTYLE:Indentation:OFF
        //CHECKSTYLE:LineLength:OFF
        for ( int c = 0; c < stringCalculations.length; c++ ) {
          statsMeta.getAggregateField()[calcIndex] = valueMeta.getName() + "(" + GroupByMeta.getTypeDesc( stringCalculations[c] ) + ")";
          statsMeta.getSubjectField()[calcIndex] = valueMeta.getName();
          statsMeta.getAggregateType()[calcIndex] = stringCalculations[c];
          calcIndex++;
        }
      }

      // Date data
      //
      if ( valueMeta.isDate() ) {
        for ( int c = 0; c < dateCalculations.length; c++ ) {
          statsMeta.getAggregateField()[calcIndex] =
            valueMeta.getName() + "(" + GroupByMeta.getTypeDesc( dateCalculations[c] ) + ")";
          statsMeta.getSubjectField()[calcIndex] = valueMeta.getName();
          statsMeta.getAggregateType()[calcIndex] = dateCalculations[c];
          calcIndex++;
        }
      }

      // Boolean data
      //
      if ( valueMeta.isBoolean() ) {
        for ( int c = 0; c < booleanCalculations.length; c++ ) {
          statsMeta.getAggregateField()[calcIndex] =
            valueMeta.getName() + "(" + GroupByMeta.getTypeDesc( booleanCalculations[c] ) + ")";
          statsMeta.getSubjectField()[calcIndex] = valueMeta.getName();
          statsMeta.getAggregateType()[calcIndex] = booleanCalculations[c];
          calcIndex++;
        }
      }
    }
    StepMeta calc = new StepMeta( registry.getPluginId( StepPluginType.class, statsMeta ), "Calc", statsMeta );
    calc.setLocation( 250, 50 );
    calc.setDraw( true );
    transMeta.addStep( calc );

    TransHopMeta hop = new TransHopMeta( read, calc );
    transMeta.addTransHop( hop );

    DummyTransMeta dummyMeta = new DummyTransMeta();
    StepMeta result =
      new StepMeta( registry.getPluginId( StepPluginType.class, dummyMeta ), RESULT_STEP_NAME, dummyMeta );
    result.setLocation( 450, 50 );
    result.setDraw( true );
    transMeta.addStep( result );

    TransHopMeta hop2 = new TransHopMeta( calc, result );
    transMeta.addTransHop( hop2 );

    return transMeta;
  }

  private RowMetaInterface getTableFields( LoggingObjectInterface parentLoggingObject ) throws KettleDatabaseException {
    Database database = new Database( parentLoggingObject, databaseMeta );
    try {
      database.connect();
      return database.getTableFields( schemaTable );
    } finally {
      database.disconnect();
    }

  }

  /**
   * @return the tableLayout
   */
  public RowMetaInterface getTableLayout() {
    return tableLayout;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy