/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.hydrator.plugin;
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
/**
 * Transform that parses a field containing a CSV record into a {@link StructuredRecord}.
 *
 * The following CSV record formats are supported by this transform:
 *
 * - DEFAULT
 * - EXCEL
 * - RFC4180
 * - MYSQL
 * - TDF
 * - PDL
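 *
 * A minimal usage sketch, as it might appear in a unit test (the schema, field
 * name, and input value below are illustrative, not taken from a real pipeline):
 *
 * <pre>{@code
 * Schema schema = Schema.recordOf("output",
 *   Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
 *   Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
 * CSVParser parser = new CSVParser(new CSVParser.Config("DEFAULT", "body", schema.toString()));
 * parser.initialize(context);        // a TransformContext supplied by the test harness
 * parser.transform(input, emitter);  // input's "body" field holds e.g. "john,doe"
 * }</pre>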
*/
@Plugin(type = "transform")
@Name("CSVParser")
@Description("Parses a field as CSV Record into a Structured Record.")
public final class CSVParser extends Transform<StructuredRecord, StructuredRecord> {
private static final Logger LOG = LoggerFactory.getLogger(CSVParser.class);
private final Config config;
// Output Schema associated with transform output.
private Schema outSchema;
// List of fields specified in the schema.
private List<Field> fields;
// Format of CSV.
private CSVFormat csvFormat = CSVFormat.DEFAULT;
// Pipe-delimited (PDL) CSV format.
public static final CSVFormat PDL;

// Initialize the pipe-delimited CSV format.
static {
PDL = CSVFormat.DEFAULT.withDelimiter('|').withEscape('\\').withIgnoreEmptyLines(false)
.withAllowMissingColumnNames().withQuote((Character) null).withRecordSeparator('\n')
.withIgnoreSurroundingSpaces();
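    // As configured, PDL splits a line such as "john | doe | 32" into the values
    // "john", "doe", and "32": '|' delimits, '\' escapes, values are unquoted, and
    // spaces around delimiters are ignored. (The sample line is illustrative.)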
}
// This constructor is used only in tests; otherwise the config is injected by the ingestion framework.
public CSVParser(Config config) {
this.config = config;
}
@Override
public void configurePipeline(PipelineConfigurer configurer) throws IllegalArgumentException {
super.configurePipeline(configurer);
// Check if the format specified is valid.
if (this.config.format == null || this.config.format.isEmpty()) {
throw new IllegalArgumentException("Format is not specified. Allowed values are DEFAULT, EXCEL, MYSQL," +
" RFC4180, PDL & TDF");
}
// Check if format is one of the allowed types.
if (!this.config.format.equalsIgnoreCase("DEFAULT") && !this.config.format.equalsIgnoreCase("EXCEL") &&
!this.config.format.equalsIgnoreCase("MYSQL") && !this.config.format.equalsIgnoreCase("RFC4180") &&
!this.config.format.equalsIgnoreCase("TDF") && !this.config.format.equalsIgnoreCase("PDL")) {
throw new IllegalArgumentException("Format specified is not one of the allowed values. Allowed values are " +
"DEFAULT, EXCEL, MYSQL, RFC4180, PDL & TDF");
}
if (configurer.getStageConfigurer().getInputSchema() != null) {
Schema.Field inputSchemaField = configurer.getStageConfigurer().getInputSchema().getField(config.field);
if (inputSchemaField == null) {
throw new IllegalArgumentException(
"Field " + config.field + " is not present in the input schema");
} else {
if (!inputSchemaField.getSchema().getType().equals(Schema.Type.STRING)) {
throw new IllegalArgumentException(
"Type for field " + config.field + " must be String");
}
}
}
// Check that the specified schema is valid JSON.
try {
Schema outputSchema = Schema.parseJson(this.config.schema);
configurer.getStageConfigurer().setOutputSchema(outputSchema);
} catch (IOException e) {
throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
}
}
@Override
public void initialize(TransformContext context) throws Exception {
super.initialize(context);
String csvFormatString = config.format.toLowerCase();
switch (csvFormatString) {
case "default":
csvFormat = CSVFormat.DEFAULT;
break;
case "excel":
csvFormat = CSVFormat.EXCEL;
break;
case "mysql":
csvFormat = CSVFormat.MYSQL;
break;
case "rfc4180":
csvFormat = CSVFormat.RFC4180;
break;
case "tdf":
csvFormat = CSVFormat.TDF;
break;
case "pdl":
csvFormat = PDL;
break;
      default:
        throw new IllegalArgumentException("Format '" + config.format + "' is not one of the allowed formats. " +
          "Allowed formats are DEFAULT, EXCEL, MYSQL, RFC4180, PDL and TDF");
}
try {
outSchema = Schema.parseJson(config.schema);
fields = outSchema.getFields();
} catch (IOException e) {
throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
}
}
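
  /**
   * Parses the configured field of the input record as CSV and emits one
   * {@link StructuredRecord} per parsed CSV record. For example (illustrative),
   * with a two-field output schema, a field value of "john,doe\njane,doe" yields
   * two output records. CSV records whose column count does not match the output
   * schema are skipped with a warning.
   */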
@Override
public void transform(StructuredRecord in, Emitter<StructuredRecord> emitter) throws Exception {
  // The field to parse must be of type string; this is validated in configurePipeline.
String body = in.get(config.field);
// Parse the text as CSV and emit it as structured record.
try {
org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(body, csvFormat);
List<CSVRecord> records = parser.getRecords();
for (CSVRecord record : records) {
if (fields.size() == record.size()) {
emitter.emit(createStructuredRecord(record));
} else {
LOG.warn("Skipping record as output schema specified has '{}' fields, while CSV record has '{}'",
fields.size(), record.size());
// Write the record to error Dataset.
}
}
    } catch (IOException e) {
      LOG.error("There was an issue parsing the record.", e);
}
}
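
  /**
   * Builds a {@link StructuredRecord} from a CSV record, converting each value to the
   * type of the corresponding output field. An empty value is set to "" for string
   * fields and left unset for other nullable fields; an empty value for a
   * non-nullable, non-string field raises an {@link IllegalArgumentException}.
   */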
private StructuredRecord createStructuredRecord(CSVRecord record) {
StructuredRecord.Builder builder = StructuredRecord.builder(outSchema);
int i = 0;
for (Field field : fields) {
String val = record.get(i);
Schema fieldSchema = field.getSchema();
if (val.isEmpty()) {
boolean isNullable = fieldSchema.isNullable();
Schema.Type fieldType = isNullable ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
// if the field is a string or a nullable string, set the value to the empty string
if (fieldType == Schema.Type.STRING) {
builder.set(field.getName(), "");
} else if (!isNullable) {
// otherwise, error out
throw new IllegalArgumentException(String.format(
"Field #%d (named '%s') is of non-nullable type '%s', " +
"but was parsed as an empty string for CSV record '%s'",
i, field.getName(), field.getSchema().getType(), record));
}
} else {
builder.convertAndSet(field.getName(), val);
}
++i;
}
return builder.build();
}
/**
* Configuration for the plugin.
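   *
   * A minimal construction sketch (illustrative values; the schema string is the
   * JSON form of the desired output schema):
   *
   * <pre>{@code
   * CSVParser.Config config = new CSVParser.Config("DEFAULT", "body", outputSchema.toString());
   * }</pre>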
*/
public static class Config extends PluginConfig {
@Name("format")
@Description("Specify one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180, PDL & TDF " +
"are supported formats.")
private final String format;
@Name("field")
@Description("Specify the field that should be used for parsing into CSV.")
private final String field;
@Name("schema")
@Description("Specifies the schema that has to be output.")
private final String schema;
public Config(String format, String field, String schema) {
this.format = format;
this.field = field;
this.schema = schema;
}
}
}