/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.hydrator.plugin;
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
/**
 * Transform that parses a field containing a CSV record into a {@link StructuredRecord}.
 *
 * The following CSV record formats are supported by this transform:
 *
 * - DEFAULT
 * - EXCEL
 * - RFC4180
 * - MYSQL
 * - TDF
 * - PDL
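 *
 * A minimal usage sketch, as it might appear in a unit test (the schema, field
 * name, and input value below are illustrative, not taken from a real pipeline):
 *
 * <pre>{@code
 * Schema schema = Schema.recordOf("output",
 *   Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
 *   Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
 * CSVParser parser = new CSVParser(new CSVParser.Config("DEFAULT", "body", schema.toString()));
 * parser.initialize(context);        // a TransformContext supplied by the test harness
 * parser.transform(input, emitter);  // input's "body" field holds e.g. "john,doe"
 * }</pre>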
*/
@Plugin(type = "transform")
@Name("CSVParser")
@Description("Parses a field as CSV Record into a Structured Record.")
public final class CSVParser extends Transform<StructuredRecord, StructuredRecord> {
private static final Logger LOG = LoggerFactory.getLogger(CSVParser.class);
private final Config config;
// Output Schema associated with transform output.
private Schema outSchema;
// List of fields specified in the schema.
private List<Field> fields;
// Format of CSV.
private CSVFormat csvFormat = CSVFormat.DEFAULT;
// Pipe-delimited (PDL) CSV format.
public static final CSVFormat PDL;

// Initialize the pipe-delimited CSV format.
static {
PDL = CSVFormat.DEFAULT.withDelimiter('|').withEscape('\\').withIgnoreEmptyLines(false)
.withAllowMissingColumnNames().withQuote((Character) null).withRecordSeparator('\n')
.withIgnoreSurroundingSpaces();
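    // As configured, PDL splits a line such as "john | doe | 32" into the values
    // "john", "doe", and "32": '|' delimits, '\' escapes, values are unquoted, and
    // spaces around delimiters are ignored. (The sample line is illustrative.)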
}
// This constructor is used only in tests; otherwise the config is injected by the ingestion framework.
public CSVParser(Config config) {
this.config = config;
}
@Override
public void configurePipeline(PipelineConfigurer configurer) throws IllegalArgumentException {
super.configurePipeline(configurer);
// Check if the format specified is valid.
if (this.config.format == null || this.config.format.isEmpty()) {
throw new IllegalArgumentException("Format is not specified. Allowed values are DEFAULT, EXCEL, MYSQL," +
" RFC4180, PDL & TDF");
}
// Check if format is one of the allowed types.
if (!this.config.format.equalsIgnoreCase("DEFAULT") && !this.config.format.equalsIgnoreCase("EXCEL") &&
!this.config.format.equalsIgnoreCase("MYSQL") && !this.config.format.equalsIgnoreCase("RFC4180") &&
!this.config.format.equalsIgnoreCase("TDF") && !this.config.format.equalsIgnoreCase("PDL")) {
throw new IllegalArgumentException("Format specified is not one of the allowed values. Allowed values are " +
"DEFAULT, EXCEL, MYSQL, RFC4180, PDL & TDF");
}
if (configurer.getStageConfigurer().getInputSchema() != null) {
Schema.Field inputSchemaField = configurer.getStageConfigurer().getInputSchema().getField(config.field);
if (inputSchemaField == null) {
throw new IllegalArgumentException(
"Field " + config.field + " is not present in the input schema");
} else {
if (!inputSchemaField.getSchema().getType().equals(Schema.Type.STRING)) {
throw new IllegalArgumentException(
"Type for field " + config.field + " must be String");
}
}
}
// Check that the specified schema is valid JSON.
try {
Schema outputSchema = Schema.parseJson(this.config.schema);
configurer.getStageConfigurer().setOutputSchema(outputSchema);
} catch (IOException e) {
throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
}
}
@Override
public void initialize(TransformContext context) throws Exception {
super.initialize(context);
String csvFormatString = config.format.toLowerCase();
switch (csvFormatString) {
case "default":
csvFormat = CSVFormat.DEFAULT;
break;
case "excel":
csvFormat = CSVFormat.EXCEL;
break;
case "mysql":
csvFormat = CSVFormat.MYSQL;
break;
case "rfc4180":
csvFormat = CSVFormat.RFC4180;
break;
case "tdf":
csvFormat = CSVFormat.TDF;
break;
case "pdl":
csvFormat = PDL;
break;
      default:
        throw new IllegalArgumentException("Format '" + config.format + "' is not one of the allowed formats. " +
          "Allowed formats are DEFAULT, EXCEL, MYSQL, RFC4180, PDL and TDF");
}
try {
outSchema = Schema.parseJson(config.schema);
fields = outSchema.getFields();
} catch (IOException e) {
throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
}
}
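
  /**
   * Parses the configured field of the input record as CSV and emits one
   * {@link StructuredRecord} per parsed CSV record. For example (illustrative),
   * with a two-field output schema, a field value of "john,doe\njane,doe" yields
   * two output records. CSV records whose column count does not match the output
   * schema are skipped with a warning.
   */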
@Override
public void transform(StructuredRecord in, Emitter<StructuredRecord> emitter) throws Exception {
  // The field to parse must be of type string; this is validated in configurePipeline.
String body = in.get(config.field);
// Parse the text as CSV and emit it as structured record.
try {
org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(body, csvFormat);
List<CSVRecord> records = parser.getRecords();
for (CSVRecord record : records) {
if (fields.size() == record.size()) {
emitter.emit(createStructuredRecord(record));
} else {
LOG.warn("Skipping record as output schema specified has '{}' fields, while CSV record has '{}'",
fields.size(), record.size());
// Write the record to error Dataset.
}
}
    } catch (IOException e) {
      LOG.error("There was an issue parsing the record.", e);
}
}
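
  /**
   * Builds a {@link StructuredRecord} from a CSV record, converting each value to the
   * type of the corresponding output field. An empty value is set to "" for string
   * fields and left unset for other nullable fields; an empty value for a
   * non-nullable, non-string field raises an {@link IllegalArgumentException}.
   */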
private StructuredRecord createStructuredRecord(CSVRecord record) {
StructuredRecord.Builder builder = StructuredRecord.builder(outSchema);
int i = 0;
for (Field field : fields) {
String val = record.get(i);
Schema fieldSchema = field.getSchema();
if (val.isEmpty()) {
boolean isNullable = fieldSchema.isNullable();
Schema.Type fieldType = isNullable ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
// if the field is a string or a nullable string, set the value to the empty string
if (fieldType == Schema.Type.STRING) {
builder.set(field.getName(), "");
} else if (!isNullable) {
// otherwise, error out
throw new IllegalArgumentException(String.format(
"Field #%d (named '%s') is of non-nullable type '%s', " +
"but was parsed as an empty string for CSV record '%s'",
i, field.getName(), field.getSchema().getType(), record));
}
} else {
builder.convertAndSet(field.getName(), val);
}
++i;
}
return builder.build();
}
/**
* Configuration for the plugin.
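   *
   * A minimal construction sketch (illustrative values; the schema string is the
   * JSON form of the desired output schema):
   *
   * <pre>{@code
   * CSVParser.Config config = new CSVParser.Config("DEFAULT", "body", outputSchema.toString());
   * }</pre>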
*/
public static class Config extends PluginConfig {
@Name("format")
@Description("Specify one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180, PDL & TDF " +
"are supported formats.")
private final String format;
@Name("field")
@Description("Specify the field that should be used for parsing into CSV.")
private final String field;
@Name("schema")
@Description("Specifies the schema that has to be output.")
private final String schema;
public Config(String format, String field, String schema) {
this.format = format;
this.field = field;
this.schema = schema;
}
}
}