
/*
 * Copyright © 2016-2018 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.wrangler;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Macro;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.format.UnexpectedFormatException;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.api.plugin.PluginProperties;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.InvalidEntry;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.StageSubmitterContext;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import co.cask.cdap.etl.api.lineage.field.FieldOperation;
import co.cask.cdap.etl.api.lineage.field.FieldTransformOperation;
import co.cask.directives.aggregates.DefaultTransientStore;
import co.cask.wrangler.api.CompileException;
import co.cask.wrangler.api.CompileStatus;
import co.cask.wrangler.api.Compiler;
import co.cask.wrangler.api.Directive;
import co.cask.wrangler.api.DirectiveExecutionException;
import co.cask.wrangler.api.DirectiveInfo;
import co.cask.wrangler.api.DirectiveParseException;
import co.cask.wrangler.api.DirectiveRegistry;
import co.cask.wrangler.api.ErrorRecord;
import co.cask.wrangler.api.ExecutorContext;
import co.cask.wrangler.api.RecipeParser;
import co.cask.wrangler.api.RecipePipeline;
import co.cask.wrangler.api.RecipeSymbol;
import co.cask.wrangler.api.Row;
import co.cask.wrangler.api.TokenGroup;
import co.cask.wrangler.api.TransientStore;
import co.cask.wrangler.executor.RecipePipelineExecutor;
import co.cask.wrangler.parser.ConfigDirectiveContext;
import co.cask.wrangler.parser.GrammarBasedParser;
import co.cask.wrangler.parser.MigrateToV2;
import co.cask.wrangler.parser.RecipeCompiler;
import co.cask.wrangler.registry.CompositeDirectiveRegistry;
import co.cask.wrangler.registry.SystemDirectiveRegistry;
import co.cask.wrangler.registry.UserDirectiveRegistry;
import com.google.common.collect.ImmutableMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
 * Wrangler - An interactive tool for data cleansing and transformation.
 *
 * This plugin is the backend implementation of the transformations that
 * operationalize the interactive wrangling performed by the user.
 */
@Plugin(type = "transform")
@Name("Wrangler")
@Description("Wrangler - A interactive tool for data cleansing and transformation.")
public class Wrangler extends Transform<StructuredRecord, StructuredRecord> {
  private static final Logger LOG = LoggerFactory.getLogger(Wrangler.class);

  // Configuration specifying the dataprep application and service name.
  private static final String APPLICATION_NAME = "dataprep";
  private static final String SERVICE_NAME = "service";
  private static final String CONFIG_METHOD = "config";

  // Plugin configuration.
  private final Config config;

  // Wrangle Execution RecipePipeline
  private RecipePipeline<Row, StructuredRecord, ErrorRecord> pipeline;

  // Output Schema associated with transform output.
  private Schema oSchema = null;

  // Error counter.
  private long errorCounter;

  // Precondition application
  private Precondition condition = null;

  // Transient Store
  private TransientStore store;

  // Directive registry.
  private DirectiveRegistry registry;

  // This constructor is used only for tests; otherwise, the framework instantiates the plugin and injects the config.
  public Wrangler(Config config) {
    this.config = config;
  }
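
  // Illustrative sketch (not part of the original source): a unit test might construct
  // the plugin directly through this constructor, e.g.
  //
  //   Wrangler.Config config = new Wrangler.Config("false", "parse-as-csv :body ',' true",
  //                                                null, "body", -1, outputSchemaJson);
  //   Wrangler wrangler = new Wrangler(config);
  //
  // where 'outputSchemaJson' is a hypothetical JSON-serialized output schema and the
  // recipe shown is only an example.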

  /**
   * Configures the plugin during deployment of the pipeline that uses the plugin.
   *
   * <p>
   *   <ul>
   *     <li>Parses the configured directives. Any issues are highlighted during deployment.</li>
   *     <li>Validates the input schema.</li>
   *     <li>Compiles the pre-condition expression.</li>
   *   </ul>
   * </p>
   */
  @Override
  public void configurePipeline(PipelineConfigurer configurer) throws IllegalArgumentException {
    super.configurePipeline(configurer);
    try {
      Schema iSchema = configurer.getStageConfigurer().getInputSchema();
      if (!config.containsMacro("field") && !config.field.equalsIgnoreCase("*")) {
        validateInputSchema(iSchema);
      }

      String directives = config.directives;
      if (config.udds != null && !config.udds.trim().isEmpty()) {
        if (config.containsMacro("directives")) {
          directives = String.format("#pragma load-directives %s;", config.udds);
        } else {
          directives = String.format("#pragma load-directives %s;%s", config.udds, config.directives);
        }
      }

      // Validate the DSL by compiling it. Portions specified as macros are not
      // resolved at this phase.
      Compiler compiler = new RecipeCompiler();
      try {
        // Compile the directives, extracting the loadable plugins (a.k.a.
        // directives in this context).
        CompileStatus status = compiler.compile(new MigrateToV2(directives).migrate());
        RecipeSymbol symbols = status.getSymbols();
        Set<String> dynamicDirectives = symbols.getLoadableDirectives();
        for (String directive : dynamicDirectives) {
          Object o = configurer.usePlugin(Directive.Type, directive, directive, PluginProperties.builder().build());
          if (o == null) {
            throw new IllegalArgumentException(
              String.format("User Defined Directive '%s' is not deployed or is not available.", directive)
            );
          }
        }
        // If 'directives' contains a macro, then we do not attempt to validate it.
        if (!config.containsMacro("directives")) {
          // Create the registry that only interacts with system directives.
          registry = new CompositeDirectiveRegistry(new SystemDirectiveRegistry());
          if (symbols != null) {
            Iterator<TokenGroup> iterator = symbols.iterator();
            while (iterator != null && iterator.hasNext()) {
              TokenGroup group = iterator.next();
              if (group != null) {
                String directive = (String) group.get(0).value();
                DirectiveInfo directiveInfo = registry.get(directive);
                if (directiveInfo == null && !dynamicDirectives.contains(directive)) {
                  throw new IllegalArgumentException(
                    String.format("Wrangler plugin has a directive '%s' that does not exist in system or " +
                                    "user space. Either it is a typographical error or the user directive is " +
                                    "not loaded.", directive)
                  );
                }
              }
            }
          }
        }
      } catch (CompileException e) {
        throw new IllegalArgumentException(e.getMessage(), e);
      } catch (DirectiveParseException e) {
        throw new IllegalArgumentException(e.getMessage());
      }

      // Based on the configuration, create the output schema.
      try {
        if (!config.containsMacro("schema")) {
          oSchema = Schema.parseJson(config.schema);
        }
      } catch (IOException e) {
        throw new IllegalArgumentException("Format of output schema specified is invalid. Please check the format.");
      }

      // Check if the configured field is present in the input schema.
      Schema inputSchema = configurer.getStageConfigurer().getInputSchema();
      if (!config.containsMacro("field") && !(config.field.equals("*") || config.field.equals("#"))
        && (inputSchema != null && inputSchema.getField(config.field) == null)) {
        throw new IllegalArgumentException(
          String.format("Field '%s' configured to wrangler is not present in the input. " +
                          "Only specify fields present in the input", config.field == null ? "null" : config.field)
        );
      }

      // If the pre-condition is not null or empty, compile the expression to validate it.
      if (!config.containsMacro("precondition")) {
        if (config.precondition != null && !config.precondition.trim().isEmpty()) {
          try {
            new Precondition(config.precondition);
          } catch (PreconditionException e) {
            throw new IllegalArgumentException(e.getMessage());
          }
        }
      }

      // Set the output schema.
      if (oSchema != null) {
        configurer.getStageConfigurer().setOutputSchema(oSchema);
      }
    } catch (Exception e) {
      LOG.error(e.getMessage());
      throw new IllegalArgumentException(e.getMessage());
    }
  }

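  // Illustrative example (not from the original source): a recipe that would pass the
  // deployment-time validation above, assuming the input field 'body' holds CSV text:
  //
  //   parse-as-csv :body ',' true
  //   drop :body
  //
  // 'parse-as-csv' and 'drop' are system directives resolved via the
  // SystemDirectiveRegistry; a misspelled directive name fails deployment with the
  // "does not exist in system or user space" error above.
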
  @Override
  public void prepareRun(StageSubmitterContext context) throws Exception {
    super.prepareRun(context);
    List<String> inputFields = new ArrayList<>();
    List<String> outputFields = new ArrayList<>();
    Schema inputSchema = context.getInputSchema();
    if (checkSchema(inputSchema, "input")) {
      //noinspection ConstantConditions
      inputFields = inputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
    }
    Schema outputSchema = context.getOutputSchema();
    if (checkSchema(outputSchema, "output")) {
      //noinspection ConstantConditions
      outputFields = outputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
    }
    FieldOperation dataPrepOperation = new FieldTransformOperation("Prepare Data",
                                                                   new MigrateToV2(config.directives).migrate(),
                                                                   inputFields, outputFields);
    context.record(Collections.singletonList(dataPrepOperation));
  }

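  // Illustrative note (an assumption, not from the original source): given input
  // fields [body] and output fields [name, age], prepareRun() would record a single
  // FieldTransformOperation named 'Prepare Data' whose description is the migrated
  // recipe, tying input field names to output field names for field-level lineage.
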
  private boolean checkSchema(Schema schema, String name) {
    if (schema == null) {
      LOG.debug(String.format("The %s schema is null. Field level lineage will not be recorded.", name));
      return false;
    }
    if (schema.getFields() == null) {
      LOG.debug(String.format("The %s schema fields are null. Field level lineage will not be recorded.", name));
      return false;
    }
    return true;
  }

  /**
   * Validates the input schema.
   *
   * @param inputSchema configured for the plugin.
   */
  private void validateInputSchema(Schema inputSchema) {
    if (inputSchema != null) {
      // Check the existence of the field in the input schema.
      Schema.Field inputSchemaField = inputSchema.getField(config.field);
      if (inputSchemaField == null) {
        throw new IllegalArgumentException(
          "Field " + config.field + " is not present in the input schema");
      }
    }
  }

  /**
   * Initializes the wrangler by parsing the directives and creating the runtime context.
   *
   * @param context framework context being passed.
   */
  @Override
  public void initialize(TransformContext context) throws Exception {
    super.initialize(context);

    // Parse the DSL and initialize the wrangle pipeline.
    store = new DefaultTransientStore();
    registry = new CompositeDirectiveRegistry(
      new SystemDirectiveRegistry(),
      new UserDirectiveRegistry(context)
    );

    String directives = config.directives;
    if (config.udds != null && !config.udds.trim().isEmpty()) {
      directives = String.format("#pragma load-directives %s;%s", config.udds, config.directives);
    }

    RecipeParser recipe = new GrammarBasedParser(new MigrateToV2(directives).migrate(), registry);
    ExecutorContext ctx = new WranglerPipelineContext(ExecutorContext.Environment.TRANSFORM, context, store);

    // Based on the configuration, create the output schema.
    try {
      oSchema = Schema.parseJson(config.schema);
    } catch (IOException e) {
      throw new IllegalArgumentException(
        String.format("Stage:%s - Format of output schema specified is invalid. Please check the format.",
                      context.getStageName())
      );
    }

    // If the pre-condition is not null or empty, compile the expression.
    if (config.precondition != null && !config.precondition.trim().isEmpty()) {
      try {
        condition = new Precondition(config.precondition);
      } catch (PreconditionException e) {
        throw new IllegalArgumentException(e.getMessage());
      }
    }

    // Make a call to retrieve the directive config and create a context.
    // In this plugin, a call is made to the service at runtime.
    // NOTE: This has to be moved to a pre-start plugin phase so that heavily
    // parallelized requests for this information do not crash the service.
    try {
      URL url = getDPServiceURL(CONFIG_METHOD);
      if (url != null) {
        ConfigDirectiveContext dContext = new ConfigDirectiveContext(url);
        recipe.initialize(dContext);
      } else {
        // This is normal in cloud environments.
        LOG.info(String.format("Stage:%s - The Dataprep service is not accessible in this environment. " +
                                 "No aliasing and restriction will be applied.", getContext().getStageName()));
        recipe.initialize(null);
      }
    } catch (IOException | URISyntaxException e) {
      // If there is an issue, we need to fail the pipeline that has the plugin.
      throw new IllegalArgumentException(
        String.format("Stage:%s - Issue in retrieving the configuration from the service. %s",
                      getContext().getStageName(), e.getMessage())
      );
    }

    try {
      // Create the pipeline executor with the context set.
      pipeline = new RecipePipelineExecutor();
      pipeline.initialize(recipe, ctx);
    } catch (Exception e) {
      throw new Exception(
        String.format("Stage:%s - %s", getContext().getStageName(), e.getMessage())
      );
    }

    // Initialize the error counter.
    errorCounter = 0;
  }

  @Override
  public void destroy() {
    super.destroy();
    pipeline.destroy();
    try {
      registry.close();
    } catch (IOException e) {
      LOG.warn("Unable to close the directive registry. You might see an increasing number of open file handles.",
               e.getMessage());
    }
  }

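  // Illustrative example (an assumption, not from the original source): a precondition
  // such as "age > 100" is evaluated against each row in transform(); rows for which the
  // expression evaluates to true are skipped and counted under the 'precondition.filtered'
  // metric, matching the config's "true to filter" semantics.
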
  /**
   * Transforms the input record by applying directives on the record being passed.
   *
   * @param input record to be transformed.
   * @param emitter to collect all the output of the transformation.
   * @throws Exception thrown if there are any issues with the transformation.
   */
  @Override
  public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
    long start = 0;
    List<StructuredRecord> records;

    try {
      // Create a row as the starting point for input to the pipeline.
      Row row = new Row();
      if ("*".equalsIgnoreCase(config.field)) {
        for (Schema.Field field : input.getSchema().getFields()) {
          row.add(field.getName(), getValue(input, field.getName()));
        }
      } else if ("#".equalsIgnoreCase(config.field)) {
        row.add(input.getSchema().getRecordName(), input);
      } else {
        row.add(config.field, getValue(input, config.field));
      }

      // If a pre-condition is set, then evaluate it.
      if (condition != null) {
        boolean skip = condition.apply(row);
        if (skip) {
          getContext().getMetrics().count("precondition.filtered", 1);
          return; // Expression evaluated to true, so we skip the record.
        }
      }

      // Reset the record aggregation store.
      store.reset();

      start = System.nanoTime();
      records = pipeline.execute(Arrays.asList(row), oSchema);
      // We now extract errors from the execution and pass them on to the error emitter.
      List<ErrorRecord> errors = pipeline.errors();
      if (errors.size() > 0) {
        getContext().getMetrics().count("errors", errors.size());
        for (ErrorRecord error : errors) {
          emitter.emitError(new InvalidEntry<>(error.getCode(), error.getMessage(), input));
        }
      }
    } catch (Exception e) {
      getContext().getMetrics().count("failures", 1);
      errorCounter++;
      // If the error threshold is reached, then terminate processing.
      // If the threshold is set to -1, it tolerates unlimited errors.
      if (config.threshold != -1 && errorCounter > config.threshold) {
        emitter.emitAlert(ImmutableMap.of(
          "stage", getContext().getStageName(),
          "code", String.valueOf(1),
          "message", "Error threshold reached.",
          "value", String.valueOf(errorCounter)
        ));
        if (e instanceof DirectiveExecutionException) {
          throw new Exception(String.format("Stage:%s - Reached error threshold %d, terminating processing " +
                                              "due to error : %s", getContext().getStageName(),
                                            config.threshold, e.getMessage()));
        } else {
          throw new Exception(String.format("Stage:%s - Reached error threshold %d, terminating processing " +
                                              "due to error : %s", getContext().getStageName(),
                                            config.threshold, e.getMessage()), e);
        }
      }
      // Emit the error record. If an error flattener or error handlers are not connected,
      // then the record is automatically omitted.
      emitter.emitError(new InvalidEntry<>(0, e.getMessage(), input));
      return;
    } finally {
      getContext().getMetrics().gauge("process.time", System.nanoTime() - start);
    }

    for (StructuredRecord record : records) {
      StructuredRecord.Builder builder = StructuredRecord.builder(oSchema);
      // Iterate through the output schema; if the 'record' doesn't have the field, then
      // attempt to take it from the 'input'.
      for (Schema.Field field : oSchema.getFields()) {
        Object wObject = record.get(field.getName()); // wrangled records
        if (wObject == null) {
          builder.set(field.getName(), null);
        } else {
          if (wObject instanceof String) {
            builder.convertAndSet(field.getName(), (String) wObject);
          } else {
            // No need to use type-specific methods for fields of logical type - timestamp,
            // date and time - because wObject should already hold the correct values for the
            // corresponding primitive types.
            builder.set(field.getName(), wObject);
          }
        }
      }
      emitter.emit(builder.build());
    }
  }

  private Object getValue(StructuredRecord input, String fieldName) {
    Schema fieldSchema = input.getSchema().getField(fieldName).getSchema();
    fieldSchema = fieldSchema.isNullable() ? fieldSchema.getNonNullable() : fieldSchema;
    Schema.LogicalType logicalType = fieldSchema.getLogicalType();
    if (logicalType != null) {
      switch (logicalType) {
        case DATE:
          return input.getDate(fieldName);
        case TIME_MILLIS:
        case TIME_MICROS:
          return input.getTime(fieldName);
        case TIMESTAMP_MILLIS:
        case TIMESTAMP_MICROS:
          return input.getTimestamp(fieldName);
        default:
          throw new UnexpectedFormatException("Field type " + logicalType + " is not supported.");
      }
    }
    // If the logical type is present in complex types, it will be retrieved as the
    // corresponding simple type (int/long).
    return input.get(fieldName);
  }

  /**
   * Retrieves the base URL from the context and appends the method to form the final URL.
   *
   * @param method to be invoked.
   * @return fully formed URL to the method.
   */
  private URL getDPServiceURL(String method) throws URISyntaxException, MalformedURLException {
    URL url = getContext().getServiceURL(APPLICATION_NAME, SERVICE_NAME);
    if (url == null) {
      return null;
    }
    URI uri = url.toURI();
    String path = uri.getPath() + method;
    return uri.resolve(path).toURL();
  }

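  // Illustrative example (an assumption, not from the original source): if the
  // discovered service URL ended in ".../services/service/methods/", then
  // getDPServiceURL("config") would resolve to the same path with "config" appended,
  // i.e. ".../services/service/methods/config".
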
  /**
   * Config for the plugin.
   */
  public static class Config extends PluginConfig {
    @Name("precondition")
    @Description("Precondition expression specifying filtering before applying directives (true to filter)")
    @Macro
    private String precondition;

    @Name("directives")
    @Description("Recipe for wrangling the input records")
    @Macro
    @Nullable
    private String directives;

    @Name("udd")
    @Description("List of User Defined Directives (UDD) that have to be loaded.")
    @Nullable
    private String udds;

    @Name("field")
    @Description("Name of the input field to be wrangled or '*' to wrangle all the fields.")
    @Macro
    private final String field;

    @Name("threshold")
    @Description("Maximum number of event failures in wrangling after which pipeline processing is stopped. " +
      "The threshold is not aggregated across all instances, but is applied to each running instance. " +
      "Set to -1 to allow an unlimited number of acceptable errors.")
    @Macro
    private final int threshold;

    @Name("schema")
    @Description("Specifies the schema that has to be output.")
    @Macro
    private final String schema;

    public Config(String precondition, String directives, String udds, String field, int threshold, String schema) {
      this.precondition = precondition;
      this.directives = directives;
      this.udds = udds;
      this.field = field;
      this.threshold = threshold;
      this.schema = schema;
    }
  }
}