All downloads are free. The search and download features use the official Maven repository.

com.twineworks.kettle.ruby.step.execmodels.SimpleExecutionModel Maven / Gradle / Ivy

The newest version!
/*
 * Ruby for pentaho kettle
 * Copyright (C) 2017 Twineworks GmbH
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package com.twineworks.kettle.ruby.step.execmodels;

import com.twineworks.kettle.ruby.step.*;
import com.twineworks.kettle.ruby.step.meta.RubyScriptMeta;
import com.twineworks.kettle.ruby.step.meta.RubyVariableMeta;
import com.twineworks.kettle.ruby.step.streams.*;
import org.apache.commons.lang.ArrayUtils;
import org.jruby.*;
import org.jruby.embed.EvalFailedException;
import org.jruby.exceptions.ThreadKill;
import org.jruby.javasupport.JavaEmbedUtils;
import org.jruby.javasupport.JavaUtil;
import org.jruby.runtime.builtin.IRubyObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.*;
import org.pentaho.di.core.row.value.ValueMetaInternetAddress;
import org.pentaho.di.core.row.value.ValueMetaTimestamp;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;

import java.io.File;
import java.io.StringReader;
import java.math.BigDecimal;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.sql.Timestamp;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;

public class SimpleExecutionModel implements ExecutionModel {

  private RubyStepData data;
  private RubyStepMeta meta;
  private RubyStep step;

  /**
   * Stores the step, its runtime data and its configuration so the other
   * lifecycle callbacks of this execution model can reach them.
   *
   * @param step the step this execution model works for
   * @param data the runtime data holder of that step
   * @param meta the configuration of that step
   */
  @Override
  public void setEnvironment(RubyStep step, RubyStepData data, RubyStepMeta meta) {
    this.step = step;
    this.meta = meta;
    this.data = data;
  }

  /**
   * Sets up the scripting container for this step: creates the container,
   * registers the gem home, parses the row script and publishes the global
   * variables {@code $step}, {@code $trans}, {@code $tabs} as well as all
   * configured ruby variables.
   *
   * @return {@code true} when everything was set up, {@code false} when any
   *         exception occurred (the exception is logged as a step error)
   */
  @Override
  public boolean onInit() {

    try {

      data.forcedHalt = false;

      data.container = RubyStepFactory.createScriptingContainer(true);
      data.runtime = data.container.getProvider().getRuntime();

      // set gem home if specified
      setGemHome();

      // the row script is parsed once here and only re-run for each row later
      data.container.setScriptFilename(meta.getRowScript().getTitle());
      data.rubyScriptObject = data.container.parse(meta.getRowScript().getScript(), 0);

      // put the usual stuff into global scope
      data.container.put("$step", step);
      data.container.put("$trans", step.getDispatcher());

      // put all variables into scope, values get kettle variable substitution applied
      for (RubyVariableMeta var : meta.getRubyVariables()) {
        data.container.put(var.getName(), step.environmentSubstitute(var.getValue()));
      }

      // put all script tabs into scope, keyed by their title
      RubyHash tabs = new RubyHash(data.runtime);

      for (RubyScriptMeta tab : meta.getScripts()) {
        tabs.put(tab.getTitle(), new ScriptTab(tab, data));
      }

      data.container.put("$tabs", tabs);

      // temporary place for the output a script might produce; typed, as it is iterated as Object[] rows later on
      data.rowList = new LinkedList<Object[]>();

      // add << aliases to the java stream writers
      data.container.runScriptlet("JavaUtilities.extend_proxy('com.twineworks.kettle.ruby.step.streams.StdStreamWriter') {alias << write}\n");
      data.container.runScriptlet("JavaUtilities.extend_proxy('com.twineworks.kettle.ruby.step.streams.ErrorStreamWriter') {alias << write}\n");
      data.container.runScriptlet("JavaUtilities.extend_proxy('com.twineworks.kettle.ruby.step.streams.StepStreamWriter') {alias << write}\n");

    } catch (Exception e) {
      step.logError("Error Initializing Ruby Scripting Step", e);
      return false;
    }

    return true;
  }

  /**
   * Registers the gem directory with rubygems inside the scripting container.
   *
   * The directory is resolved in this order: the gem home configured on the
   * step, the {@code RUBY_GEM_HOME} variable, and finally the {@code gems}
   * folder inside the plugin directory.
   */
  private void setGemHome() {

    // if specified directly, take it
    String gemHomeString = step.environmentSubstitute(meta.getGemHome());

    // if not, fall back to RUBY_GEM_HOME
    if (Const.isEmpty(gemHomeString) && !Const.isEmpty(step.getVariable("RUBY_GEM_HOME"))) {
      gemHomeString = step.environmentSubstitute("${RUBY_GEM_HOME}");
    }

    // if that fails, use the standard one
    if (Const.isEmpty(gemHomeString)) {
      gemHomeString = step.getPluginDir() + Const.FILE_SEPARATOR + "gems";
    }

    if (!Const.isEmpty(gemHomeString)) {

      String gemHome = new File(gemHomeString).getAbsoluteFile().getAbsolutePath();

      // The path ends up inside a single-quoted ruby string literal. In such a literal only
      // backslash and single quote are special, so escape exactly those two; otherwise a path
      // containing an apostrophe breaks the scriptlet, and a leading "\\" (UNC path) would be
      // read by ruby as a single backslash.
      String gemHomeLiteral = "'" + gemHome.replace("\\", "\\\\").replace("'", "\\'") + "'";

      data.container.runScriptlet("require 'rubygems/defaults/jruby';Gem::Specification.add_dir " + gemHomeLiteral + " unless Gem::Specification.dirs.member?( " + gemHomeLiteral + " )");

    }
  }

  /**
   * Releases everything that belongs to the scripting container: the cached
   * ruby class references are dropped, the container is terminated and the
   * container, parsed script and runtime references are cleared.
   */
  @Override
  public void onDispose() {

    // cached ruby objects belong to the runtime that is about to go away
    data.marshal = null;
    data.bigDecimal = null;
    data.ipAddr = null;

    try {
      if (data.container != null) {
        data.container.terminate();
      }
    } finally {
      // clear the references even if terminate() fails, forceStopRubyThreads() relies
      // on a null container to detect that the step has been disposed
      data.container = null;
      data.rubyScriptObject = null;
      data.runtime = null;
    }

  }

  /**
   * Schedules a forced shutdown of the ruby threads. The method returns
   * immediately; a background thread waits five seconds and then calls
   * {@link #forceStopRubyThreads()}.
   *
   * @throws KettleException declared by the callback contract, not thrown by this implementation
   */
  @Override
  public void onStopRunning() throws KettleException {

    // allow a few seconds for normal shutdown (i.e. completion of single row processing), before forcibly shutting things down
    Thread watchdog = new Thread("ruby-step-forced-halt") {
      @Override
      public void run() {
        try {
          Thread.sleep(5000);
          forceStopRubyThreads();
        } catch (InterruptedException ignored) {
          // the thread is about to end anyway; an interrupt simply cancels the forced stop
        }
      }
    };

    // the watchdog must never be the reason the JVM stays alive
    watchdog.setDaemon(true);
    watchdog.start();

  }

  /**
   * Kills all active ruby threads of this step's runtime and tears the
   * runtime down. Does nothing if the container has been disposed already
   * or if a forced halt was performed before.
   *
   * This method is invoked from the watchdog thread started in
   * onStopRunning(), so it can run while onDispose() clears the fields.
   */
  private void forceStopRubyThreads() {

    // work on a snapshot: onDispose() may set data.runtime to null at any time,
    // re-reading the field after the null check could fail with a NullPointerException
    Ruby runtime = data.runtime;

    // if the container is disposed already, bail out
    if (data.container == null) return;

    // try to kill all threads once
    if (data.forcedHalt) return;
    data.forcedHalt = true;

    if (runtime == null) return;

    for (RubyThread thread : runtime.getThreadService().getActiveRubyThreads()) {
      try {
        thread.kill();
      } catch (ThreadKill ignored) {
        // raised as part of killing the thread; the remaining threads must still be killed
      }
    }

    runtime.tearDown();

  }

  /**
   * Returns the ruby {@code Marshal} object, evaluating it in the container
   * on first use and caching it in the step data afterwards.
   */
  private IRubyObject getMarshal() {
    if (data.marshal != null) {
      return data.marshal;
    }
    data.marshal = data.container.parse("Marshal").run();
    return data.marshal;
  }

  /**
   * Returns the ruby {@code BigDecimal} object. On first use the bigdecimal
   * library is required in the container and the result is cached in the
   * step data.
   */
  private IRubyObject getBigDecimal() {
    if (data.bigDecimal != null) {
      return data.bigDecimal;
    }
    data.bigDecimal = data.container.parse("require 'bigdecimal'; BigDecimal").run();
    return data.bigDecimal;
  }

  /**
   * Returns the ruby {@code IPAddr} object. On first use the ipaddr library
   * is required in the container and the result is cached in the step data.
   */
  private IRubyObject getIPAddr() {
    if (data.ipAddr != null) {
      return data.ipAddr;
    }
    data.ipAddr = data.container.parse("require 'ipaddr'; IPAddr").run();
    return data.ipAddr;
  }

  /**
   * Determines the input and output row layouts of the main stream and
   * publishes the stream objects to ruby: {@code $output}, {@code $input},
   * {@code $error} (only when the step does error handling) and
   * {@code $target_steps}.
   *
   * @throws KettleException if the row metadata cannot be determined
   */
  private void initMainRowStream() throws KettleException {

    // the step has no input row meta if no row ever arrived: either steps are connected
    // but deliver no rows (ask the transformation for the fields), or nothing is connected
    // at all (no fields)
    RowMetaInterface rowMeta = step.getInputRowMeta();
    if (rowMeta == null) {
      rowMeta = data.hasDirectInput
        ? step.getTransMeta().getPrevStepFields(step.getStepMeta())
        : new RowMeta();
    }

    // input layout
    data.inputRowMeta = rowMeta.clone();
    data.inputFieldNames = data.inputRowMeta.getFieldNames();

    // output layout: the input layout with the step's own field changes applied
    data.outputRowMeta = rowMeta.clone();
    meta.getFields(data.outputRowMeta, step.getStepname(), null, null, step, null, null);

    data.cacheFieldNames(data.inputRowMeta);
    data.cacheFieldNames(data.outputRowMeta);

    if (meta.isClearInputFields()) {
      data.baseRowMeta = data.emptyRowMeta;
    } else {
      data.baseRowMeta = data.inputRowMeta;
    }

    // standard streams
    data.container.put("$output", new StdStreamWriter(this));
    data.container.put("$input", new StdStreamReader(this));

    // error stream, only available when error handling is configured on the step
    if (meta.getParentStepMeta().isDoingErrorHandling()) {

      data.errorRowMeta = meta.getParentStepMeta().getStepErrorMeta().getErrorFields().clone();
      data.stepErrorMeta = meta.getParentStepMeta().getStepErrorMeta();
      data.cacheFieldNames(data.errorRowMeta);

      data.container.put("$error", new ErrorStreamWriter(this));
    }

    // target steps, keyed by role name; target streams and target step definitions are walked in parallel
    RubyHash writersByRole = new RubyHash(data.runtime);

    int position = 0;
    for (StreamInterface targetStream : meta.getStepIOMeta().getTargetStreams()) {
      StepStreamWriter targetWriter = new StepStreamWriter(this, targetStream.getStepname());
      writersByRole.put(meta.getTargetSteps().get(position).getRoleName(), targetWriter);
      position++;
    }

    data.container.put("$target_steps", writersByRole);

  }

  /**
   * Converts a kettle row into a ruby hash that maps field names to ruby
   * friendly values.
   *
   * A new hash is created on every call. Null values become nil. Values of a
   * type that is not handled below are left out of the hash.
   *
   * @param rowMeta layout of the row
   * @param r       the row data, indexed like the fields of {@code rowMeta}
   * @return the ruby hash holding the row
   * @throws KettleException if a value cannot be read using its value meta
   */
  public RubyHash createRubyInputRow(RowMetaInterface rowMeta, Object[] r) throws KettleException {

    // create a hash for the row, they are not reused on purpose, so the scripting user can safely use them to store entire rows between invocations
    RubyHash rubyRow = new RubyHash(data.runtime);

    String[] fieldNames = rowMeta.getFieldNames();
    for (int i = 0; i < fieldNames.length; i++) {

      String field = fieldNames[i];
      // null values don't need no special treatment, they'll become nil
      if (r[i] == null) {
        rubyRow.put(field, null);
      } else {

        ValueMetaInterface vm = rowMeta.getValueMeta(i);

        switch (vm.getType()) {
          case ValueMetaInterface.TYPE_BOOLEAN:
            rubyRow.put(field, vm.getBoolean(r[i]));
            break;
          case ValueMetaInterface.TYPE_INTEGER:
            rubyRow.put(field, vm.getInteger(r[i]));
            break;
          case ValueMetaInterface.TYPE_STRING:
            rubyRow.put(field, vm.getString(r[i]));
            break;
          case ValueMetaInterface.TYPE_NUMBER:
            rubyRow.put(field, vm.getNumber(r[i]));
            break;
          case ValueMetaInterface.TYPE_NONE:
            rubyRow.put(field, r[i]);
            break;
          case ValueMetaInterface.TYPE_SERIALIZABLE:
            if (r[i] instanceof RubyStepMarshalledObject) {
              // the value was marshalled by a ruby step before, restore the ruby object from it
              Object restoredObject = getMarshal().callMethod(data.runtime.getCurrentContext(), "restore", data.runtime.newString(r[i].toString()));
              rubyRow.put(field, restoredObject);
            } else {
              // try to put the object in there as it is.. should create a nice adapter for the java object
              rubyRow.put(field, r[i]);
            }
            break;
          case ValueMetaInterface.TYPE_BINARY:
            // put a ruby array with bytes in there, that is expensive and should probably be avoided
            rubyRow.put(field,
              data.runtime.newArrayNoCopy(JavaUtil.convertJavaArrayToRuby(data.runtime, ArrayUtils.toObject((byte[]) vm.getBinary(r[i]))))
            );

            break;

          case ValueMetaInterface.TYPE_BIGNUMBER:
            // goes through the decimal string representation
            IRubyObject bigDecimalObject = getBigDecimal().callMethod(data.runtime.getCurrentContext(), "new", data.runtime.newString((vm.getBigNumber(r[i])).toString()));
            rubyRow.put(field, bigDecimalObject);
            break;

          case ValueMetaInterface.TYPE_DATE:
            rubyRow.put(field, data.runtime.newTime((vm.getDate(r[i])).getTime()));
            break;

          case ValueMetaInterface.TYPE_TIMESTAMP:
            ValueMetaTimestamp vmTimestamp = (ValueMetaTimestamp) vm;
            Timestamp ts = vmTimestamp.getTimestamp(r[i]);
            int nanos = ts.getNanos();
            // The ruby time is built from whole seconds plus the complete nanosecond fraction.
            // Timestamp.getTime() already contains the millisecond part of the fraction, so it
            // is subtracted again. Dividing by 1000 instead would round towards zero and shift
            // timestamps before 1970 that have a fractional part by one second.
            long wholeSecondMillis = ts.getTime() - nanos / 1000000;
            RubyTime rubyTime = data.runtime.newTime(wholeSecondMillis);
            rubyTime.setNSec(nanos);
            rubyRow.put(field, rubyTime);
            break;

          case ValueMetaInterface.TYPE_INET:
            ValueMetaInternetAddress vmInet = (ValueMetaInternetAddress) vm;
            InetAddress ip = vmInet.getInternetAddress(r[i]);
            IRubyObject ipObject = getIPAddr().callMethod(data.runtime.getCurrentContext(), "new", data.runtime.newString(ip.getHostAddress()));
            rubyRow.put(field, ipObject);
            break;
        }

      }

    }

    return rubyRow;

  }

  /**
   * Writes the values of a ruby result hash into a kettle row.
   *
   * Every field in {@code forFields} is looked up in the hash by name,
   * converted according to the field's kettle type and stored at the
   * field's index in {@code forRow}. Missing and nil values are stored as
   * null.
   *
   * @param r         the row to write into, must be large enough for {@code forRow}
   * @param resultRow the hash produced by the script
   * @param forFields the fields to take from the hash
   * @param forRow    the row layout used to find the target index of each field
   * @throws KettleException if a binary, date or timestamp value cannot be converted
   */
  private void applyRubyHashToRow(Object[] r, RubyHash resultRow, List<ValueMetaInterface> forFields, RowMetaInterface forRow) throws KettleException {

    // set each field's value from the resultRow
    for (ValueMetaInterface outField : forFields) {

      IRubyObject rubyVal = resultRow.fastARef(data.rubyStringCache.get(outField.getName()));

      // convert simple cases automatically
      Object javaValue = null;

      // for nil values just put null into the row
      if (rubyVal != null && !rubyVal.isNil()) {

        // TODO: provide a meaningful error message if conversion fails because the user put non-convertible results in there (like a string saying "true"/"false" for the bool type)
        switch (outField.getType()) {
          case ValueMetaInterface.TYPE_BOOLEAN:
            javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Boolean.class);
            break;
          case ValueMetaInterface.TYPE_INTEGER:
            javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Long.class);
            break;
          case ValueMetaInterface.TYPE_STRING:
            javaValue = rubyVal.toString();
            break;
          case ValueMetaInterface.TYPE_NUMBER:
            javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Double.class);
            break;
          case ValueMetaInterface.TYPE_SERIALIZABLE:
            // ruby objects travel through kettle as their Marshal dump
            String marshalled = getMarshal().callMethod(data.runtime.getCurrentContext(), "dump", rubyVal).toString();
            javaValue = new RubyStepMarshalledObject(marshalled);
            break;
          case ValueMetaInterface.TYPE_BINARY:
            // TODO: provide meaningful error message if this fails
            RubyArray arr = rubyVal.convertToArray();

            byte[] bytes = new byte[arr.size()];
            for (int i = 0; i < bytes.length; i++) {
              Object rItem = arr.get(i);
              if (rItem instanceof Number) {
                bytes[i] = ((Number) rItem).byteValue();
              } else {
                throw new KettleException("Found a non-number in Binary field " + outField.getName() + ": " + rItem.toString());
              }
            }
            javaValue = bytes;
            break;
          case ValueMetaInterface.TYPE_BIGNUMBER:
            if (rubyVal instanceof RubyFloat) {
              // valueOf() uses the shortest decimal representation of the double, the
              // BigDecimal(double) constructor would expose its binary expansion instead
              // (0.1 would become 0.1000000000000000055511151231257827...)
              javaValue = BigDecimal.valueOf((Double) rubyVal.toJava(Double.class));
            } else {
              javaValue = new BigDecimal(rubyVal.toString());
            }

            break;
          case ValueMetaInterface.TYPE_DATE:
            if (rubyVal instanceof RubyFixnum) {
              // integers are taken as milliseconds since the epoch
              javaValue = new Date(((RubyFixnum) rubyVal).getLongValue());
            } else if (rubyVal instanceof RubyTime) {
              javaValue = ((RubyTime) rubyVal).getJavaDate();
            }
            else{
              throw new KettleException("cannot convert ruby value "+rubyVal.toString()+" to java Date");
            }
            break;

          case ValueMetaInterface.TYPE_TIMESTAMP:
            if (rubyVal instanceof RubyFixnum) {
              // integers are taken as milliseconds since the epoch
              javaValue = new java.sql.Timestamp(((RubyFixnum) rubyVal).getLongValue());
            } else if (rubyVal instanceof RubyTime) {
              RubyTime time = (RubyTime) rubyVal;
              long millis = time.getDateTime().getMillis();
              // millisecond part within the second, always 0..999. A plain "% 1000" is
              // negative for times before 1970 and would make setNanos() throw.
              long millisOfSecond = ((millis % 1000) + 1000) % 1000;
              Timestamp ts = new java.sql.Timestamp(millis - millisOfSecond);
              ts.setNanos((int) ((time.getNSec())+(millisOfSecond*1000000)));
              javaValue = ts;
            }
            else{
              throw new KettleException("cannot convert ruby value "+rubyVal.toString()+" to java.sql.Timestamp");
            }
            break;

          case ValueMetaInterface.TYPE_INET:
            // NOTE(review): the integer value is narrowed to 32 bits, so this looks IPv4 only - confirm
            Long longNum = (Long) data.container.callMethod(rubyVal, "to_i");
            javaValue = toInetAddress(longNum.intValue());
            break;

        }

      }

      r[data.fieldIndexCache.get(forRow).get(outField.getName())] = javaValue;
    }

  }

  /**
   * Splits a 32 bit integer into its four bytes, most significant byte first.
   *
   * @param addr the address as integer
   * @return a new array of length four
   */
  private byte[] toIPByteArray(int addr){
    byte[] octets = new byte[4];
    for (int pos = 0; pos < octets.length; pos++) {
      // shift by 24, 16, 8, 0 bits for positions 0 to 3
      octets[pos] = (byte) (addr >>> (24 - 8 * pos));
    }
    return octets;
  }

  /**
   * Creates an internet address from a 32 bit integer.
   *
   * @param addr the address as integer
   * @return the address, or {@code null} if the address could not be created
   */
  private InetAddress toInetAddress(int addr){
    InetAddress result;
    try {
      byte[] octets = toIPByteArray(addr);
      result = InetAddress.getByAddress(octets);
    } catch (UnknownHostException e) {
      //should never happen
      result = null;
    }
    return result;
  }

  /**
   * Turns the result of a script into kettle rows and appends them to
   * {@code rowList}.
   *
   * nil produces nothing, a hash produces one row, an array is processed
   * element by element using the same rules. Any other value is ignored
   * and reported in the log.
   *
   * @param rubyObject the script result
   * @param inRow      layout used to clone {@code r} when more than one row is produced
   * @param r          the row the result is based on
   * @param rowList    receives the produced rows
   * @param forFields  the fields to take from result hashes
   * @param forRow     layout of the produced rows
   * @throws KettleException if a row cannot be cloned or a value cannot be converted
   */
  public void fetchRowsFromScriptOutput(IRubyObject rubyObject, RowMetaInterface inRow, Object[] r, List rowList, List forFields, RowMetaInterface forRow) throws KettleException {

    // skip nil result rows
    if (rubyObject.isNil()) {
      return;
    }

    if (rubyObject instanceof RubyArray) {
      // arrays are handled recursively, element by element
      RubyArray elements = (RubyArray) rubyObject;
      for (int idx = 0, count = elements.getLength(); idx < count; idx++) {
        fetchRowsFromScriptOutput(elements.entry(idx), inRow, r, rowList, forFields, forRow);
      }
    } else if (rubyObject instanceof RubyHash) {
      // a hash is one row; the first row may use r itself, later ones work on a copy
      Object[] base = rowList.isEmpty() ? r : inRow.cloneRow(r);
      Object[] outRow = RowDataUtil.resizeArray(base, forRow.size());
      applyRubyHashToRow(outRow, (RubyHash) rubyObject, forFields, forRow);
      rowList.add(outRow);
    } else {
      // neither nil, nor a hash, nor an array: ignore the output but warn in the log
      step.logBasic("WARNING: script returned non-hash value: " + rubyObject.toString() + " as a result ");
    }

  }

  /**
   * Processes one unit of work.
   *
   * With direct input, one row is read and passed to the row script as
   * {@code $row}; when no more rows arrive the end script runs and the
   * output is closed. Without direct input the row script runs exactly
   * once, followed by the end script.
   *
   * On the first call the init script is run and the info and main row
   * streams are set up.
   *
   * @return {@code true} if the method should be called again, {@code false}
   *         when processing is finished or the step was forcibly halted
   * @throws KettleException if a script fails while no forced halt is in progress,
   *                         or if reading, converting or writing rows fails
   */
  @Override
  public boolean onProcessRow() throws KettleException {

    // as calls to getRow() would yield rows from indeterminate sources unless
    // all info streams have been emptied first
    // we opt to enforce to have all info steps or no info steps
    try {

      if (step.first) {
        data.hasDirectInput = meta.hasDirectInput();
        // call the init script here rather than in the init section. It guarantees that other steps are fully initialized.
        if (meta.getInitScript() != null) {
          data.container.runScriptlet(new StringReader(meta.getInitScript().getScript()), meta.getInitScript().getTitle());
        }

        // this must be done before the first call to getRow() in case there are info streams present
        initInfoRowStreams();
      }

      // directinput means, there's no info steps and at least one step providing data
      if (data.hasDirectInput) {

        Object[] r = step.getRow();

        // only now is the metadata available
        if (step.first) {
          initMainRowStream();
          step.first = false;
        }

        if (r == null) {
          // no more rows coming in
          finishOutput();
          return false;
        }

        // put the row into the container
        data.container.put("$row", createRubyInputRow(data.inputRowMeta, r));
        runRowScriptAndEmit(r);
        return true;

      }

      // no direct input means the script is not getting an input row and is executed exactly once
      if (step.first) {
        initMainRowStream();
        step.first = false;
      }

      runRowScriptAndEmit(new Object[data.outputRowMeta.size()]);
      finishOutput();
      return false;

    } catch (EvalFailedException e) {
      if (!data.forcedHalt) {
        throw new KettleException(e);
      }
      // transformation has been stopped
      return false;
    } catch (ThreadKill e) {
      if (!data.forcedHalt) {
        throw new KettleException(e);
      }
      // transformation has been stopped
      return false;
    }

  }

  /**
   * Runs the row script once and writes every row it produced to the main output stream.
   *
   * @param r the row the produced rows are based on
   * @throws KettleException if the script result cannot be converted or a row cannot be written
   */
  private void runRowScriptAndEmit(Object[] r) throws KettleException {

    // run the script, the result is one or more rows
    IRubyObject scriptResult = data.rubyScriptObject.run();

    data.rowList.clear();
    fetchRowsFromScriptOutput(scriptResult, data.baseRowMeta, r, data.rowList, meta.getAffectedFields(), data.outputRowMeta);

    // now if the script has output rows, write them to the main output stream
    for (Object[] outrow : data.rowList) {
      step.putRow(data.outputRowMeta, outrow);
    }
  }

  /**
   * Runs the end script, if there is one, and marks the output of the step as done.
   */
  private void finishOutput() {

    // run the end script here rather then on dispose end, ensures that the row streams are still up, so user can choose to
    // write "summary" rows and the like
    if (meta.getDisposeScript() != null) {
      // pass the script's own title like it is done for the init script, so errors are not
      // attributed to the file name the container was given for the row script
      data.container.runScriptlet(new StringReader(meta.getDisposeScript().getScript()), meta.getDisposeScript().getTitle());
    }

    step.setOutputDone();
  }

  /**
   * Publishes the readers of all info streams to ruby as {@code $info_steps},
   * a hash keyed by the role name of each info step.
   *
   * @throws KettleException if the rows of an info stream cannot be read
   */
  private void initInfoRowStreams() throws KettleException {

    RubyHash readersByRole = new RubyHash(data.runtime);

    // info streams and info step definitions are walked in parallel
    int position = 0;
    for (StreamInterface infoStream : meta.getStepIOMeta().getInfoStreams()) {

      StepStreamReader reader = new StepStreamReader(this, infoStream.getStepname());
      Object exposedReader = reader;

      // if there's direct input connected as well as info streams present, the info streams *must* be prefetched as per 4.0 API
      if (data.hasDirectInput) {
        exposedReader = new BufferStreamReader(this, reader.readAll());
      }

      readersByRole.put(meta.getInfoSteps().get(position).getRoleName(), exposedReader);
      position++;
    }

    data.container.put("$info_steps", readersByRole);

  }

  /**
   * @return the step passed to {@code setEnvironment}, or {@code null} if none was set yet
   */
  public RubyStep getStep() {
    return this.step;
  }

  /**
   * @return the step data passed to {@code setEnvironment}, or {@code null} if none was set yet
   */
  public RubyStepData getData() {
    return this.data;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy