co.cask.hydrator.plugin.CSVFormatter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of transform-plugins Show documentation
There is a newer version: 2.1.2
/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Transform that formats a {@link StructuredRecord} to CSV.
 * <p>
 * CSVFormatter supports transforming the input {@link StructuredRecord}
 * into CSV records of varying types. The following CSV record
 * types are supported by this transform:
 * </p>
 * <ul>
 *   <li>DELIMITED</li>
 *   <li>EXCEL</li>
 *   <li>RFC4180</li>
 *   <li>MYSQL</li>
 *   <li>TDF</li>
 * </ul>
 */
@Plugin(type = "transform")
@Name("CSVFormatter")
@Description("Formats a Structured Record to CSV")
public final class CSVFormatter extends Transform {
  private static final Logger LOG = LoggerFactory.getLogger(CSVFormatter.class);

  // Transform configuration.
  private final Config config;

  // Output schema associated with the transform output; parsed from the configured schema in initialize().
  private Schema outSchema;

  // Fields of the output schema; populated in initialize().
  private List<Field> fields;

  // Mapping from delimiter name (e.g. "COMMA") to the string whose first character is used as the delimiter.
  private static final Map<String, String> delimMap = Maps.newHashMap();

  // Format of the CSV output; selected in initialize() from the configured format name.
  private CSVFormat csvFileFormat;

  /**
   * Creates a formatter that uses the given configuration.
   * Per the original author's note, this constructor is called directly only by tests;
   * otherwise the ingestion framework injects the configuration.
   *
   * @param config plugin configuration stored for later use by this transform
   */
  public CSVFormatter(Config config) {
    this.config = config;
  }

  // Static collection of delimiter mappings from name to delim.
  static {
    delimMap.put("COMMA", ",");
    delimMap.put("CTRL-A", "\001");
    delimMap.put("TAB", "\t");
    delimMap.put("VBAR", "|");
    delimMap.put("STAR", "*");
    delimMap.put("CARET", "^");
    delimMap.put("DOLLAR", "$");
    delimMap.put("HASH", "#");
    delimMap.put("TILDE", "~");
    delimMap.put("CTRL-B", "\002");
    delimMap.put("CTRL-C", "\003");
    delimMap.put("CTRL-D", "\004");
    delimMap.put("CTRL-E", "\005");
    delimMap.put("CTRL-F", "\006");
  }

  /**
   * Validates the plugin configuration when the pipeline is configured and registers the output schema.
   * The configured schema must parse as JSON and contain exactly one field, whose type is STRING.
   *
   * @param pipelineConfigurer configurer on which the output schema of this stage is set
   * @throws IllegalArgumentException if the schema cannot be parsed, does not have exactly one field,
   *                                  or that field is not of type STRING
   */
  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
    super.configurePipeline(pipelineConfigurer);
    config.validate();

    // Check if schema specified is a valid schema or not.
    final Schema schema;
    try {
      schema = Schema.parseJson(config.schema);
    } catch (IOException e) {
      // Chain the parse failure so the underlying cause is not lost.
      throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.", e);
    }

    // Require exactly one field: a missing or empty field list must be rejected here rather than
    // failing later with a NullPointerException or IndexOutOfBoundsException on get(0).
    List<Field> outputFields = schema.getFields();
    if (outputFields == null || outputFields.size() != 1) {
      throw new IllegalArgumentException("Output schema should have only one field of type String");
    }
    if (outputFields.get(0).getSchema().getType() != Schema.Type.STRING) {
      throw new IllegalArgumentException("Output field type should be String");
    }
    pipelineConfigurer.getStageConfigurer().setOutputSchema(schema);
  }

  /**
   * Prepares the transform for execution: validates the configuration, parses the output schema,
   * resolves the configured delimiter name and selects the {@link CSVFormat} for the configured format.
   *
   * @param context transform context passed through to the parent class
   * @throws IllegalArgumentException if the schema cannot be parsed, the delimiter name is unknown,
   *                                  or the format name is not one of the supported formats
   * @throws Exception if the parent initialization fails
   */
  @Override
  public void initialize(TransformContext context) throws Exception {
    super.initialize(context);
    config.validate();

    try {
      outSchema = Schema.parseJson(config.schema);
      fields = outSchema.getFields();
    } catch (IOException e) {
      // Chain the parse failure so the underlying cause is not lost.
      throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.", e);
    }

    // Based on the delimiter name specified, pick the delimiter to be used for the record.
    // The name is validated for every format, but the resolved character is only used
    // when the format type is chosen as DELIMITED.
    if (!delimMap.containsKey(config.delimiter)) {
      throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. ");
    }
    char delim = delimMap.get(config.delimiter).charAt(0);

    // Create the CSV format based on the format specified. Locale.ROOT keeps the case folding
    // independent of the default locale (e.g. Turkish dotless i would break "DELIMITED").
    switch (config.format.toLowerCase(Locale.ROOT)) {
      case "delimited":
        csvFileFormat = CSVFormat.newFormat(delim).withQuote('"')
          .withRecordSeparator("\r\n").withIgnoreEmptyLines();
        break;

      case "excel":
        csvFileFormat = CSVFormat.Predefined.Excel.getFormat();
        break;

      case "mysql":
        csvFileFormat = CSVFormat.Predefined.MySQL.getFormat();
        break;

      case "tdf":
        csvFileFormat = CSVFormat.Predefined.TDF.getFormat();
        break;

      case "rfc4180":
        // Previously selected the TDF format by mistake.
        csvFileFormat = CSVFormat.Predefined.RFC4180.getFormat();
        break;

      default:
        throw new IllegalArgumentException("Unknown format specified for CSV. Please check the format.");
    }

  }

  @Override
  public void transform(StructuredRecord record, Emitter emitter) throws Exception {
    List