
org.apache.flink.formats.csv.CsvRowDataDeserializationSchema Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.formats.csv;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.jackson.JacksonMapperFactory;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectReader;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.dataformat.csv.CsvSchema;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
/**
* Deserialization schema from CSV to Flink Table & SQL internal data structures.
*
* Deserializes a byte[]
message as a {@link JsonNode} and converts it to {@link
* RowData}.
*
*
Failure during deserialization are forwarded as wrapped {@link IOException}s.
*/
@Internal
public final class CsvRowDataDeserializationSchema implements DeserializationSchema {
private static final long serialVersionUID = 1L;
/** Type information describing the result type. */
private final TypeInformation resultTypeInfo;
/** Runtime instance that performs the actual work. */
private final CsvToRowDataConverters.CsvToRowDataConverter runtimeConverter;
/** Schema describing the input CSV data. */
private final CsvSchema csvSchema;
/** Object reader used to read rows. It is configured by {@link CsvSchema}. */
private transient ObjectReader objectReader;
/** Flag indicating whether to ignore invalid fields/rows (default: throw an exception). */
private final boolean ignoreParseErrors;
private CsvRowDataDeserializationSchema(
TypeInformation resultTypeInfo,
CsvSchema csvSchema,
CsvToRowDataConverters.CsvToRowDataConverter runtimeConverter,
boolean ignoreParseErrors) {
this.resultTypeInfo = resultTypeInfo;
this.runtimeConverter = runtimeConverter;
this.csvSchema = csvSchema;
this.ignoreParseErrors = ignoreParseErrors;
}
@Override
public void open(InitializationContext context) {
this.objectReader =
JacksonMapperFactory.createCsvMapper().readerFor(JsonNode.class).with(csvSchema);
}
/** A builder for creating a {@link CsvRowDataDeserializationSchema}. */
@Internal
public static class Builder {
private final RowType rowResultType;
private final TypeInformation resultTypeInfo;
private CsvSchema csvSchema;
private boolean ignoreParseErrors;
/**
* Creates a CSV deserialization schema for the given {@link TypeInformation} with optional
* parameters.
*
* @param rowReadType The {@link RowType} used for reading CSV rows.
* @param rowResultType The {@link RowType} of the produced results. It can be different
* from the {@code rowReadType} if the underlying converter supports the discrepancy
* (for instance for filtering/projection pushdown).
* @param resultTypeInfo The result type info.
*/
public Builder(
RowType rowReadType,
RowType rowResultType,
TypeInformation resultTypeInfo) {
Preconditions.checkNotNull(rowReadType, "RowType must not be null.");
Preconditions.checkNotNull(rowResultType, "RowType must not be null.");
Preconditions.checkNotNull(resultTypeInfo, "Result type information must not be null.");
this.rowResultType = rowResultType;
this.resultTypeInfo = resultTypeInfo;
this.csvSchema = CsvRowSchemaConverter.convert(rowReadType);
}
/**
* Creates a CSV deserialization schema for the given {@link TypeInformation} with optional
* parameters.
*/
public Builder(RowType rowType, TypeInformation resultTypeInfo) {
Preconditions.checkNotNull(resultTypeInfo, "Result type information must not be null.");
this.rowResultType = rowType;
this.resultTypeInfo = resultTypeInfo;
this.csvSchema = CsvRowSchemaConverter.convert(rowType);
}
public Builder setFieldDelimiter(char delimiter) {
this.csvSchema = this.csvSchema.rebuild().setColumnSeparator(delimiter).build();
return this;
}
public Builder setAllowComments(boolean allowComments) {
this.csvSchema = this.csvSchema.rebuild().setAllowComments(allowComments).build();
return this;
}
public Builder setArrayElementDelimiter(String delimiter) {
Preconditions.checkNotNull(delimiter, "Array element delimiter must not be null.");
this.csvSchema = this.csvSchema.rebuild().setArrayElementSeparator(delimiter).build();
return this;
}
public Builder disableQuoteCharacter() {
this.csvSchema = this.csvSchema.rebuild().disableQuoteChar().build();
return this;
}
public Builder setQuoteCharacter(char c) {
this.csvSchema = this.csvSchema.rebuild().setQuoteChar(c).build();
return this;
}
public Builder setEscapeCharacter(char c) {
this.csvSchema = this.csvSchema.rebuild().setEscapeChar(c).build();
return this;
}
public Builder setNullLiteral(String nullLiteral) {
Preconditions.checkNotNull(nullLiteral, "Null literal must not be null.");
this.csvSchema = this.csvSchema.rebuild().setNullValue(nullLiteral).build();
return this;
}
public Builder setIgnoreParseErrors(boolean ignoreParseErrors) {
this.ignoreParseErrors = ignoreParseErrors;
return this;
}
public CsvRowDataDeserializationSchema build() {
CsvToRowDataConverters.CsvToRowDataConverter runtimeConverter =
new CsvToRowDataConverters(ignoreParseErrors)
.createRowConverter(rowResultType, true);
return new CsvRowDataDeserializationSchema(
resultTypeInfo, csvSchema, runtimeConverter, ignoreParseErrors);
}
}
@Override
public RowData deserialize(@Nullable byte[] message) throws IOException {
if (message == null) {
return null;
}
try {
final JsonNode root = objectReader.readValue(message);
return (RowData) runtimeConverter.convert(root);
} catch (Throwable t) {
if (ignoreParseErrors) {
return null;
}
throw new IOException(
String.format("Failed to deserialize CSV row '%s'.", new String(message)), t);
}
}
@Override
public boolean isEndOfStream(RowData nextElement) {
return false;
}
@Override
public TypeInformation getProducedType() {
return resultTypeInfo;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || o.getClass() != this.getClass()) {
return false;
}
final CsvRowDataDeserializationSchema that = (CsvRowDataDeserializationSchema) o;
final CsvSchema otherSchema = that.csvSchema;
return resultTypeInfo.equals(that.resultTypeInfo)
&& ignoreParseErrors == that.ignoreParseErrors
&& csvSchema.getColumnSeparator() == otherSchema.getColumnSeparator()
&& csvSchema.allowsComments() == otherSchema.allowsComments()
&& csvSchema
.getArrayElementSeparator()
.equals(otherSchema.getArrayElementSeparator())
&& csvSchema.getQuoteChar() == otherSchema.getQuoteChar()
&& csvSchema.getEscapeChar() == otherSchema.getEscapeChar()
&& Arrays.equals(csvSchema.getNullValue(), otherSchema.getNullValue());
}
@Override
public int hashCode() {
return Objects.hash(
resultTypeInfo,
ignoreParseErrors,
csvSchema.getColumnSeparator(),
csvSchema.allowsComments(),
csvSchema.getArrayElementSeparator(),
csvSchema.getQuoteChar(),
csvSchema.getEscapeChar(),
Arrays.hashCode(csvSchema.getNullValue()));
}
}