
/*
 * Copyright © 2015-2019 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.data.schema.Schema.Field;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.InvalidEntry;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.StageSubmitterContext;
import io.cdap.cdap.etl.api.Transform;
import io.cdap.cdap.etl.api.TransformContext;
import io.cdap.cdap.etl.api.lineage.field.FieldOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
 * Transformation that parses a field as a CSV record into a {@link StructuredRecord}.
 *
 * <p>
 * CSVParser supports transforming the input into a {@link StructuredRecord}. The following CSV record
 * types are supported by this transform. A custom delimiter can also be defined by selecting the
 * option "Custom".
 * </p>
 *
 * <ul>
 *   <li>DEFAULT</li>
 *   <li>EXCEL</li>
 *   <li>RFC4180</li>
 *   <li>MYSQL</li>
 *   <li>Tab Delimited</li>
 *   <li>Pipe Delimited</li>
 *   <li>Custom</li>
 * </ul>
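 *
 * <p>
 * For example (illustrative values, not taken from the plugin documentation): with the DEFAULT format
 * and an output schema containing the string fields {@code a} and {@code b}, an input field holding
 * the text {@code 1,hello} is parsed into a {@link StructuredRecord} with {@code a = "1"} and
 * {@code b = "hello"}.
 * </p>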
 */
@Plugin(type = "transform")
@Name("CSVParser")
@Description("Parses a field as CSV Record into a Structured Record.")
public final class CSVParser extends Transform<StructuredRecord, StructuredRecord> {
  private final Config config;

  // Output schema associated with the transform output.
  private Schema outSchema;

  static final Set<String> FORMATS = ImmutableSet.of("DEFAULT", "EXCEL", "MYSQL", "RFC4180", "TDF",
                                                     "Pipe Delimited", "Tab Delimited", "PDL", "Custom");

  // List of fields specified in the schema.
  private List<Field> fields;

  // Format of CSV.
  private CSVFormat csvFormat = CSVFormat.DEFAULT;

  // Format of PDL.
  public static final CSVFormat PDL;

  // Initialize the pipe-delimited CSV parser format.
  static {
    PDL = CSVFormat.DEFAULT.withDelimiter('|').withEscape('\\').withIgnoreEmptyLines(false)
      .withAllowMissingColumnNames().withQuote(null).withRecordSeparator('\n')
      .withIgnoreSurroundingSpaces();
  }

  // This is used only for tests; otherwise the config is injected by the ingestion framework.
  public CSVParser(Config config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer configurer) throws IllegalArgumentException {
    super.configurePipeline(configurer);
    StageConfigurer stageConfigurer = configurer.getStageConfigurer();
    FailureCollector collector = stageConfigurer.getFailureCollector();
    config.validate(collector);

    // Perform schema validation.
    Schema inputSchema = stageConfigurer.getInputSchema();
    validateInputSchema(inputSchema, collector);
    Schema schema = parseAndValidateOutputSchema(inputSchema, collector);
    collector.getOrThrowException();
    stageConfigurer.setOutputSchema(schema);
  }

  @Override
  public void prepareRun(StageSubmitterContext context) throws Exception {
    super.prepareRun(context);
    FailureCollector collector = context.getFailureCollector();
    config.validate(collector);
    collector.getOrThrowException();

    // Read from config.field and output to fields.
    init();
    if (fields != null) {
      FieldOperation operation = new FieldTransformOperation("Parse", "Parsed CSV data from expected field.",
                                                             Collections.singletonList(config.field),
                                                             fields.stream().map(Schema.Field::getName)
                                                               .collect(Collectors.toList()));
      context.record(Collections.singletonList(operation));
    }
  }

  @Override
  public void initialize(TransformContext context) throws Exception {
    super.initialize(context);
    init();
  }

  private void init() {
    String csvFormatString = config.format == null ? "default" : config.format.toLowerCase();
    switch (csvFormatString) {
      case "default":
        csvFormat = CSVFormat.DEFAULT;
        break;

      case "excel":
        csvFormat = CSVFormat.EXCEL;
        break;

      case "mysql":
        csvFormat = CSVFormat.MYSQL;
        break;

      case "rfc4180":
        csvFormat = CSVFormat.RFC4180;
        break;

      case "tdf":
      case "tab delimited":
        csvFormat = CSVFormat.TDF;
        break;

      case "pdl":
      case "pipe delimited":
        csvFormat = PDL;
        break;

      case "custom":
        csvFormat = CSVFormat.DEFAULT.withDelimiter(config.delimiter).withEscape('\\').withIgnoreEmptyLines(false)
          .withAllowMissingColumnNames().withQuote(null).withRecordSeparator('\n')
          .withIgnoreSurroundingSpaces();
        break;

      default:
        throw new IllegalArgumentException(
          String.format("Format '%s' specified is not one of the allowed formats. Allowed formats are " +
                          "DEFAULT, EXCEL, MYSQL, RFC4180, Pipe Delimited, Tab Delimited and Custom.", config.format));
    }

    try {
      outSchema = Schema.parseJson(config.schema);
      fields = outSchema.getFields();
    } catch (IOException e) {
      throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
    }
  }

  @Override
  public void transform(StructuredRecord in, Emitter<StructuredRecord> emitter) throws Exception {
    // The field has to be a string to be parsed correctly; for other types an exception is thrown.
    String body = in.get(config.field);

    // Parse the text as CSV and emit it as structured record.
    try {
      if (body == null) {
        emitter.emit(createStructuredRecord(null, in));
      } else {
        org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(body, csvFormat);
        List<CSVRecord> records = parser.getRecords();
        for (CSVRecord record : records) {
          emitter.emit(createStructuredRecord(record, in));
        }
      }
    } catch (IOException e) {
      emitter.emitError(new InvalidEntry<>(31, e.getStackTrace()[0].toString() + " : " + e.getMessage(), in));
    }
  }

  private StructuredRecord createStructuredRecord(@Nullable CSVRecord record, StructuredRecord in) {
    StructuredRecord.Builder builder = StructuredRecord.builder(outSchema);
    int i = 0;
    for (Field field : fields) {
      String name = field.getName();
      // If a field specified in the output schema is present in the input, it is copied directly into
      // the output; otherwise the field is parsed from the CSV record.
      // If the input record is null, propagate all supplied input fields and set the other fields
      // (assumed to be CSV-parsed fields) to null.
      if (in.get(name) != null) {
        builder.set(name, in.get(name));
      } else if (record == null) {
        builder.set(name, null);
      } else {
        String val = record.get(i);
        Schema fieldSchema = field.getSchema();
        if (val.isEmpty()) {
          boolean isNullable = fieldSchema.isNullable();
          Schema.Type fieldType = isNullable ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
          // If the field is a string or a nullable string, set the value to the empty string.
          if (fieldType == Schema.Type.STRING) {
            builder.set(field.getName(), "");
          } else if (!isNullable) {
            // Otherwise, error out.
            throw new IllegalArgumentException(String.format(
              "Field #%d (named '%s') is of non-nullable type '%s', " +
                "but was parsed as an empty string for CSV record '%s'",
              i, field.getName(), field.getSchema().getType(), record));
          }
        } else {
          builder.convertAndSet(field.getName(), val);
        }
        ++i;
      }
    }
    return builder.build();
  }

  @VisibleForTesting
  void validateInputSchema(@Nullable Schema inputSchema, FailureCollector collector) {
    if (inputSchema != null) {
      // Check the existence of the field in the input schema.
      Schema.Field inputSchemaField = inputSchema.getField(config.field);
      if (inputSchemaField == null) {
        collector.addFailure(String.format("Field '%s' must be present in the input schema.", config.field), null)
          .withConfigProperty(Config.NAME_FIELD);
        return;
      }

      // Check that the field type is string or nullable string.
      Schema fieldSchema = inputSchemaField.getSchema();
      Schema nonNullableSchema = fieldSchema.isNullable() ? fieldSchema.getNonNullable() : fieldSchema;
      Schema.Type fieldType = nonNullableSchema.getType();
      if (!fieldType.equals(Schema.Type.STRING)) {
        collector.addFailure(String.format("Field '%s' is of invalid type '%s'.", inputSchemaField.getName(),
                                           nonNullableSchema.getDisplayName()), "Ensure it is of type 'string'.")
          .withConfigProperty(Config.NAME_FIELD).withInputSchemaField(config.field);
      }
    }
  }

  @VisibleForTesting
  Schema parseAndValidateOutputSchema(@Nullable Schema inputSchema, FailureCollector collector) {
    Schema outputSchema = config.getSchema(collector);
    // When an input field is passed through to the output, its name and type should stay the same.
    // If the type differs, collect a failure.
    if (inputSchema != null) {
      for (Field field : inputSchema.getFields()) {
        if (outputSchema.getField(field.getName()) != null) {
          Schema out = outputSchema.getField(field.getName()).getSchema();
          Schema in = field.getSchema();
          if (!in.equals(out)) {
            collector.addFailure(
              String.format("Output field '%s' must have the same schema as the input field.", field.getName()),
              "Ensure the input and output schemas for the field are the same.")
              .withInputSchemaField(field.getName()).withOutputSchemaField(field.getName());
          }
        }
      }
    }
    return outputSchema;
  }

  /**
   * Configuration for the plugin.
   */
  public static class Config extends PluginConfig {
    private static final String NAME_FORMAT = "format";
    private static final String NAME_DELIMITER = "delimiter";
    private static final String NAME_FIELD = "field";
    private static final String NAME_SCHEMA = "schema";

    @Nullable
    @Name(NAME_FORMAT)
    @Description("Specify one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180, Pipe Delimited, Tab " +
      "Delimited and Custom are supported formats.")
    private String format;

    @Nullable
    @Name(NAME_DELIMITER)
    @Description("Custom delimiter to be used for parsing the fields. The custom delimiter can only be specified by " +
      "selecting the option 'Custom' from the format drop-down. In case of null, defaults to ','.")
    private Character delimiter;

    @Name(NAME_FIELD)
    @Description("Specify the field that should be parsed as CSV. Input records with a null input field " +
      "propagate all other fields and set fields that would otherwise be parsed by the CSVParser to null.")
    private String field;

    @Name(NAME_SCHEMA)
    @Description("Specifies the schema that has to be output.")
    private String schema;

    public Config(@Nullable String format, @Nullable Character delimiter, String field, String schema) {
      this.format = format == null ? "DEFAULT" : format;
      this.delimiter = delimiter;
      this.field = field;
      this.schema = schema;
    }

    // Constructor to assign the default value to format.
    public Config() {
      format = "DEFAULT";
    }

    private void validate(FailureCollector collector) {
      // Check if the format is one of the allowed types.
      if (!format.equalsIgnoreCase("DEFAULT") && !format.equalsIgnoreCase("EXCEL")
        && !format.equalsIgnoreCase("MYSQL") && !format.equalsIgnoreCase("RFC4180")
        && !format.equalsIgnoreCase("Tab Delimited") && !format.equalsIgnoreCase("Pipe Delimited")
        && !format.equalsIgnoreCase("Custom") && !format.equalsIgnoreCase("PDL")
        && !format.equalsIgnoreCase("TDF")) {
        collector.addFailure(String.format("Format '%s' is unsupported.", format),
                             String.format("Specify one of the following: %s.", Joiner.on(", ").join(FORMATS)))
          .withConfigProperty(NAME_FORMAT);
      }

      if (format.equalsIgnoreCase("Custom") && (delimiter == null || delimiter == 0)) {
        collector.addFailure("Delimiter must be specified for format option 'Custom'.", null)
          .withConfigProperty(NAME_DELIMITER).withConfigProperty(NAME_FORMAT);
      }

      if (!format.equalsIgnoreCase("Custom") && delimiter != null && delimiter != 0) {
        collector.addFailure("Custom delimiter can only be used for format option 'Custom'.", "Remove delimiter.")
          .withConfigProperty(NAME_DELIMITER).withConfigProperty(NAME_FORMAT);
      }
    }

    @Nullable
    private Schema getSchema(FailureCollector collector) {
      try {
        return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
      } catch (IOException e) {
        collector.addFailure("Invalid schema: " + e.getMessage(), null);
      }
      throw collector.getOrThrowException();
    }
  }
}
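
A minimal standalone sketch (not part of the plugin source) of the parsing behaviour the transform delegates to Apache Commons CSV: it rebuilds the pipe-delimited PDL format from the static initializer above and splits one line with it. The class name PdlFormatDemo and the sample input line are illustrative assumptions.

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class PdlFormatDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical demo: the same format the plugin builds for "Pipe Delimited" / "PDL".
    CSVFormat pdl = CSVFormat.DEFAULT.withDelimiter('|').withEscape('\\').withIgnoreEmptyLines(false)
      .withAllowMissingColumnNames().withQuote(null).withRecordSeparator('\n')
      .withIgnoreSurroundingSpaces();

    // One pipe-delimited line, as it would arrive in the configured input field.
    String body = "1|hello|world";
    for (CSVRecord record : CSVParser.parse(body, pdl).getRecords()) {
      for (int i = 0; i < record.size(); i++) {
        System.out.println(i + " -> " + record.get(i));  // prints: 0 -> 1, 1 -> hello, 2 -> world
      }
    }
  }
}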



