/*
 * Copyright © 2015-2019 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.data.schema.Schema.Field;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageSubmitterContext;
import io.cdap.cdap.etl.api.Transform;
import io.cdap.cdap.etl.api.TransformContext;
import io.cdap.plugin.common.TransformLineageRecorderUtils;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Transform that formats a {@link StructuredRecord} to CSV.
 *
 * <p>CSVFormatter supports transforming the input {@link StructuredRecord}
 * into a CSV record of varying types. The following CSV record types are
 * supported by this transform:</p>
 *
 * <ul>
 *   <li>DELIMITED</li>
 *   <li>EXCEL</li>
 *   <li>RFC4180</li>
 *   <li>MYSQL</li>
 *   <li>TDF</li>
 * </ul>
*/ @Plugin(type = "transform") @Name("CSVFormatter") @Description("Formats a Structured Record to CSV") public final class CSVFormatter extends Transform { private static final Logger LOG = LoggerFactory.getLogger(CSVFormatter.class); // Transform configuration. private final Config config; // Output Schema associated with transform output. private Schema outSchema; // List of fields specified in the schema. private List fields; // Mapping from delimiter name to the character to be used as delimiter. private static final Map delimMap = Maps.newHashMap(); // Format of CSV File. private CSVFormat csvFileFormat; // This is used only for tests, otherwise this is being injected by the ingestion framework. public CSVFormatter(Config config) { this.config = config; } // Static collection of delimiter mappings from name to delim. static { delimMap.put("COMMA", ","); delimMap.put("CTRL-A", "\001"); delimMap.put("TAB", "\t"); delimMap.put("VBAR", "|"); delimMap.put("STAR", "*"); delimMap.put("CARET", "^"); delimMap.put("DOLLAR", "$"); delimMap.put("HASH", "#"); delimMap.put("TILDE", "~"); delimMap.put("CTRL-B", "\002"); delimMap.put("CTRL-C", "\003"); delimMap.put("CTRL-D", "\004"); delimMap.put("CTRL-E", "\005"); delimMap.put("CTRL-F", "\006"); } @Override public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException { super.configurePipeline(pipelineConfigurer); FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector(); config.validate(collector); Schema schema = config.getSchema(collector); pipelineConfigurer.getStageConfigurer().setOutputSchema(schema); } @Override public void prepareRun(StageSubmitterContext context) throws Exception { super.prepareRun(context); // Map all the input fields to the first output field. List outputFields = TransformLineageRecorderUtils.getFields(context.getOutputSchema()); if (!outputFields.isEmpty()) { context.record( TransformLineageRecorderUtils .generateManyToOne(TransformLineageRecorderUtils.getFields(context.getInputSchema()), outputFields.get(0), "csvFormat", "Formatted the input data as CSV.")); } config.validate(context.getFailureCollector()); } @Override public void initialize(TransformContext context) throws Exception { super.initialize(context); try { outSchema = Schema.parseJson(config.schema); fields = outSchema.getFields(); } catch (IOException e) { throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format."); } // Based on the delimiter name specified pick the delimiter to be used for the record. // This is only applicable when the format type is chosen as DELIMITER char delim; if (delimMap.containsKey(config.delimiter)) { delim = delimMap.get(config.delimiter).charAt(0); } else { throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. "); } // Create CSVFileFormat based on the format specified. switch (config.format.toLowerCase()) { case "delimited": csvFileFormat = CSVFormat.newFormat(delim).withQuote('"') .withRecordSeparator("\r\n").withIgnoreEmptyLines(); break; case "excel": csvFileFormat = CSVFormat.Predefined.Excel.getFormat(); break; case "mysql": csvFileFormat = CSVFormat.Predefined.MySQL.getFormat(); break; case "tdf": csvFileFormat = CSVFormat.Predefined.TDF.getFormat(); break; case "rfc4180": csvFileFormat = CSVFormat.Predefined.RFC4180.getFormat(); break; default: throw new RuntimeException("Unknown format specified for CSV. 
Please check the format."); } } @Override public void transform(StructuredRecord record, Emitter emitter) throws Exception { List values = Lists.newArrayList(); for (Schema.Field field : record.getSchema().getFields()) { values.add(record.get(field.getName())); } CSVPrinter printer = new CSVPrinter(new StringWriter(), csvFileFormat); if (printer != null) { printer.printRecord(values); emitter.emit(StructuredRecord.builder(outSchema) .set(outSchema.getFields().get(0).getName(), printer.getOut().toString()) .build()); printer.close(); } } /** * Configuration for the plugin. */ public static class Config extends PluginConfig { private static final String NAME_FORMAT = "format"; private static final String NAME_DELIMITER = "delimiter"; private static final String NAME_SCHEMA = "schema"; @Name(NAME_FORMAT) @Description("Specify one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180 & TDF are supported formats.") private final String format; @Name(NAME_DELIMITER) @Description("Specify delimiter to be used for separating fields.") private final String delimiter; @Name(NAME_SCHEMA) @Description("Specifies the schema that has to be output.") private final String schema; public Config(String format, String delimiter, String schema) { this.format = format; this.delimiter = delimiter; this.schema = schema; } @VisibleForTesting void validate(FailureCollector collector) { if (!delimMap.containsKey(delimiter)) { collector.addFailure(String.format("Delimiter '%s' is unsupported.", delimiter), "Specify one of the following: " + Joiner.on(", ").join(delimMap.keySet())) .withConfigProperty(NAME_DELIMITER); } // Check if the format specified is valid. if (Strings.isNullOrEmpty(format)) { collector.addFailure("Format must be specified.", null) .withConfigProperty(NAME_FORMAT); } else { if (!format.equalsIgnoreCase("DELIMITED") && !format.equalsIgnoreCase("EXCEL") && !format.equalsIgnoreCase("MYSQL") && !format.equalsIgnoreCase("RFC4180") && !format.equalsIgnoreCase("TDF")) { collector.addFailure(String.format("Format '%s' is not supported.", format), "Supported formats are : DELIMITED, EXCEL, MYSQL, RFC4180 & TDF") .withConfigProperty(NAME_FORMAT); } } Schema schema = getSchema(collector); if (schema != null) { List fields = schema.getFields(); if (fields.size() > 1) { // Add a validation failure for each extra field considering first field is the correct field for (int i = 1; i < fields.size(); i++) { collector.addFailure("Output schema must only contain single field of type 'string'.", String.format("Remove '%s' field.", fields.get(i).getName())) .withOutputSchemaField(fields.get(i).getName()); } } Schema nonNullableSchema = fields.get(0).getSchema().isNullable() ? fields.get(0).getSchema().getNonNullable() : fields.get(0).getSchema(); if (nonNullableSchema.getType() != Schema.Type.STRING) { collector.addFailure(String.format("Output field '%s' is of invalid type '%s'.", fields.get(0).getName(), nonNullableSchema.getDisplayName()), "Specify output field of type 'string'.") .withOutputSchemaField(fields.get(0).getName(), null); } } collector.getOrThrowException(); } @Nullable private Schema getSchema(FailureCollector collector) { try { return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(this.schema); } catch (IOException e) { collector.addFailure("Invalid schema: " + e.getMessage(), null).withConfigProperty(NAME_SCHEMA); } throw collector.getOrThrowException(); } } }
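
For reference, here is a minimal standalone sketch (not part of the plugin source) of what transform() does per record for the DELIMITED format: it builds a CSVFormat, prints the record's field values through a CSVPrinter backed by a StringWriter, and captures the resulting line, which the transform sets on the single string output field. The class name and sample values below are illustrative; only the commons-csv calls mirror the plugin.

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

import java.io.IOException;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;

public class CSVFormatterSketch {
  public static void main(String[] args) throws IOException {
    // Same construction the transform uses for format "delimited" with the "COMMA" delimiter.
    CSVFormat format = CSVFormat.newFormat(',').withQuote('"')
      .withRecordSeparator("\r\n").withIgnoreEmptyLines();

    // Field values as they would be read off the input StructuredRecord.
    List<Object> values = Arrays.asList("alice", 42, "needs \"quotes\", too");

    CSVPrinter printer = new CSVPrinter(new StringWriter(), format);
    printer.printRecord(values);
    printer.close();

    // Prints: alice,42,"needs ""quotes"", too" followed by the \r\n record separator.
    System.out.print(printer.getOut().toString());
  }
}

Note that the schema config property is CDAP's JSON schema representation; assuming the usual single-string-field layout this plugin expects, it would look something like {"type":"record","name":"etlSchemaBody","fields":[{"name":"body","type":"string"}]} (record and field names here are illustrative).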