/*
* Copyright © 2015-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.data.schema.Schema.Field;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageSubmitterContext;
import io.cdap.cdap.etl.api.Transform;
import io.cdap.cdap.etl.api.TransformContext;
import io.cdap.plugin.common.TransformLineageRecorderUtils;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
/**
 * Transform that formats a {@link StructuredRecord} as a CSV record.
 *
 * CSVFormatter supports transforming the input {@link StructuredRecord}
 * into a CSV record of one of several types. The following CSV record
 * types are supported by this transform:
 *
 * - DELIMITED
 * - EXCEL
 * - RFC4180
 * - MYSQL
 * - TDF
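 *
 * For example (illustrative), an input record with fields first="John" and
 * last="Smith", formatted as DELIMITED with the COMMA delimiter, is emitted
 * as the single CSV line {@code John,Smith} in the first field of the
 * output schema.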
 */
@Plugin(type = "transform")
@Name("CSVFormatter")
@Description("Formats a Structured Record to CSV")
public final class CSVFormatter extends Transform<StructuredRecord, StructuredRecord> {
private static final Logger LOG = LoggerFactory.getLogger(CSVFormatter.class);
// Transform configuration.
private final Config config;
// Output Schema associated with transform output.
private Schema outSchema;
// List of fields specified in the schema.
private List<Field> fields;
// Mapping from delimiter name to the character to be used as delimiter.
private static final Map<String, String> delimMap = Maps.newHashMap();
// Format of CSV File.
private CSVFormat csvFileFormat;
// Used only in tests; otherwise the configuration is injected by the ingestion framework.
@VisibleForTesting
public CSVFormatter(Config config) {
this.config = config;
}
// Static collection of delimiter mappings from name to delim.
static {
delimMap.put("COMMA", ",");
delimMap.put("CTRL-A", "\001");
delimMap.put("TAB", "\t");
delimMap.put("VBAR", "|");
delimMap.put("STAR", "*");
delimMap.put("CARET", "^");
delimMap.put("DOLLAR", "$");
delimMap.put("HASH", "#");
delimMap.put("TILDE", "~");
delimMap.put("CTRL-B", "\002");
delimMap.put("CTRL-C", "\003");
delimMap.put("CTRL-D", "\004");
delimMap.put("CTRL-E", "\005");
delimMap.put("CTRL-F", "\006");
}
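/**
 * Validates the configuration at deploy time and sets the output schema on the stage.
 */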
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
super.configurePipeline(pipelineConfigurer);
FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector();
config.validate(collector);
Schema schema = config.getSchema(collector);
pipelineConfigurer.getStageConfigurer().setOutputSchema(schema);
}
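/**
 * Records field-level lineage by mapping every input field to the first output field,
 * and re-validates the configuration before the run starts.
 */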
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
super.prepareRun(context);
// Map all the input fields to the first output field.
List<String> outputFields = TransformLineageRecorderUtils.getFields(context.getOutputSchema());
if (!outputFields.isEmpty()) {
context.record(
TransformLineageRecorderUtils
.generateManyToOne(TransformLineageRecorderUtils.getFields(context.getInputSchema()), outputFields.get(0),
"csvFormat", "Formatted the input data as CSV."));
}
config.validate(context.getFailureCollector());
}
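/**
 * Parses the output schema and builds the {@link CSVFormat} used at runtime. The named
 * delimiter is resolved to its character, but it is only applied when the format type
 * is DELIMITED; the other format types use their Commons CSV predefined settings.
 */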
@Override
public void initialize(TransformContext context) throws Exception {
super.initialize(context);
try {
outSchema = Schema.parseJson(config.schema);
fields = outSchema.getFields();
} catch (IOException e) {
throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
}
// Based on the delimiter name specified pick the delimiter to be used for the record.
// This is only applicable when the format type is chosen as DELIMITER
char delim;
if (delimMap.containsKey(config.delimiter)) {
delim = delimMap.get(config.delimiter).charAt(0);
} else {
throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. ");
}
// Create CSVFileFormat based on the format specified.
switch (config.format.toLowerCase()) {
case "delimited":
csvFileFormat = CSVFormat.newFormat(delim).withQuote('"')
.withRecordSeparator("\r\n").withIgnoreEmptyLines();
break;
case "excel":
csvFileFormat = CSVFormat.Predefined.Excel.getFormat();
break;
case "mysql":
csvFileFormat = CSVFormat.Predefined.MySQL.getFormat();
break;
case "tdf":
csvFileFormat = CSVFormat.Predefined.TDF.getFormat();
break;
case "rfc4180":
csvFileFormat = CSVFormat.Predefined.RFC4180.getFormat();
break;
default:
throw new IllegalArgumentException("Unknown format '" + config.format + "' specified for CSV. Please check the format.");
}
}
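/**
 * Formats one input record as a single CSV line and emits it as the value of the
 * first field of the output schema.
 */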
@Override
public void transform(StructuredRecord record, Emitter<StructuredRecord> emitter) throws Exception {
List<Object>