org.apache.flink.table.factories.csv.CsvTableFactory
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.table.factories.csv;

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.table.api.RichTableSchema;
import org.apache.flink.table.dataformat.BaseRow;
import org.apache.flink.table.factories.BatchCompatibleTableSinkFactory;
import org.apache.flink.table.factories.BatchTableSourceFactory;
import org.apache.flink.table.factories.StreamTableSinkFactory;
import org.apache.flink.table.factories.StreamTableSourceFactory;
import org.apache.flink.table.sinks.BatchCompatibleStreamTableSink;
import org.apache.flink.table.sinks.StreamTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sinks.csv.CsvTableSink;
import org.apache.flink.table.sinks.csv.RetractCsvTableSink;
import org.apache.flink.table.sinks.csv.UpsertCsvTableSink;
import org.apache.flink.table.sources.BatchTableSource;
import org.apache.flink.table.sources.StreamTableSource;
import org.apache.flink.table.sources.csv.CsvTableSource;
import org.apache.flink.table.util.TableProperties;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.StringUtils;

import org.apache.commons.lang3.StringEscapeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import scala.Option;
import scala.Some;

import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_PROPERTY_VERSION;
import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_TYPE;

/**
 * A CSV table factory.
 */
public class CsvTableFactory implements
	StreamTableSourceFactory<BaseRow>,
	BatchTableSourceFactory<BaseRow>,
	StreamTableSinkFactory<Object>,
	BatchCompatibleTableSinkFactory<Object> {
	private static final Logger LOG = LoggerFactory.getLogger(CsvTableFactory.class);

	@Override
	public BatchCompatibleStreamTableSink<Object> createBatchCompatibleTableSink(Map<String, String> properties) {
		return (BatchCompatibleStreamTableSink<Object>) createCsvTableSink(properties, false);
	}

	@Override
	public BatchTableSource<BaseRow> createBatchTableSource(Map<String, String> properties) {
		return createCsvTableSource(properties);
	}

	@Override
	public StreamTableSink<Object> createStreamTableSink(Map<String, String> properties) {
		return (StreamTableSink<Object>) createCsvTableSink(properties, true);
	}

	@Override
	public StreamTableSource<BaseRow> createStreamTableSource(Map<String, String> properties) {
		return createCsvTableSource(properties);
	}

	@Override
	public Map<String, String> requiredContext() {
		// Logically the connector type is "filesystem" with a CSV format, but we use "CSV" as the
		// connector type here to distinguish this factory from the other CSV factories; those
		// factories should eventually be merged.
		Map<String, String> context = new HashMap<>();
		context.put(CONNECTOR_TYPE, "CSV");
		context.put(CONNECTOR_PROPERTY_VERSION, "1");
		return context;
	}
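
	// For illustration, a property set routed to this factory contains at least:
	//   connector.type = CSV
	//   connector.property-version = 1
	// plus the table schema properties and the CSV keys listed in CsvOptions.SUPPORTED_KEYS.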

	@Override
	public List<String> supportedProperties() {
		return CsvOptions.SUPPORTED_KEYS;
	}

	private CsvTableSource createCsvTableSource(Map<String, String> props) {
		TableProperties properties = new TableProperties();
		properties.putProperties(props);
		RichTableSchema schema = properties.readSchemaFromProperties(null);

		final String path = properties.getString(CsvOptions.PATH);
		if (StringUtils.isNullOrWhitespaceOnly(path)) {
			throw new IllegalArgumentException(CsvOptions.PARAMS_HELP_MSG);
		}

		final String fieldDelim = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_FIELD_DELIM));
		final String lineDelim = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_LINE_DELIM));
		final String charset = properties.getString(CsvOptions.OPTIONAL_CHARSET);
		final boolean emptyColumnAsNull = properties.getBoolean(CsvOptions.EMPTY_COLUMN_AS_NULL);
		final boolean lenient = properties.getBoolean(CsvOptions.LENIENT);
		final String timeZone = properties.getString(CsvOptions.OPTIONAL_TIME_ZONE);
		final TimeZone tz = (timeZone == null) ? TimeZone.getTimeZone("UTC") : TimeZone.getTimeZone(timeZone);
		final boolean enumerateNestedFiles = properties.getBoolean(CsvOptions.OPTIONAL_ENUMERATE_NESTED_FILES);

		CsvTableSource.Builder builder = CsvTableSource.builder()
			.path(path)
			.fieldDelimiter(fieldDelim)
			.lineDelimiter(lineDelim)
			.charset(charset)
			.fields(schema.getColumnNames(), schema.getColumnTypes(), schema.getNullables())
			.timezone(tz)
			.setNestedFileEnumerate(enumerateNestedFiles);

		Set<Set<String>> uniqueKeys = new HashSet<>();
		Set<Set<String>> normalIndexes = new HashSet<>();
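		// Unique constraints come from three places: the declared primary key, explicit unique
		// keys, and unique indexes; non-unique indexes are forwarded separately as index keys.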
		if (!schema.getPrimaryKeys().isEmpty()) {
			uniqueKeys.add(new HashSet<>(schema.getPrimaryKeys()));
		}
		for (List<String> uniqueKey : schema.getUniqueKeys()) {
			uniqueKeys.add(new HashSet<>(uniqueKey));
		}
		for (RichTableSchema.Index index : schema.getIndexes()) {
			if (index.unique) {
				uniqueKeys.add(new HashSet<>(index.keyList));
			} else {
				normalIndexes.add(new HashSet<>(index.keyList));
			}
		}
		if (!uniqueKeys.isEmpty()) {
			builder.uniqueKeys(uniqueKeys);
		}
		if (!normalIndexes.isEmpty()) {
			builder.indexKeys(normalIndexes);
		}

		if (emptyColumnAsNull) {
			builder.enableEmptyColumnAsNull();
		}

		builder.setLenient(lenient);

		final String quoteCharacter = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_QUOTE_CHARACTER));
		if (quoteCharacter != null) {
			Preconditions.checkArgument(
				quoteCharacter.length() == 1,
				"quote character should be a single character, " + quoteCharacter + " found.");
			builder.quoteCharacter(quoteCharacter.charAt(0));
		}

		final boolean firstLineAsHeader = properties.getBoolean(CsvOptions.OPTIONAL_FIRST_LINE_AS_HEADER);
		if (firstLineAsHeader) {
			builder.ignoreFirstLine();
		}

		final String commentsPrefix = properties.getString(CsvOptions.OPTIONAL_COMMENTS_PREFIX);
		if (commentsPrefix != null) {
			builder.commentPrefix(commentsPrefix);
		}
		return builder.build();
	}

	private TableSink<?> createCsvTableSink(Map<String, String> props, boolean isStreaming) {
		TableProperties properties = new TableProperties();
		properties.putProperties(props);
		RichTableSchema schema = properties.readSchemaFromProperties(null);

		final String path = properties.getString(CsvOptions.PATH);
		if (StringUtils.isNullOrWhitespaceOnly(path)) {
			throw new IllegalArgumentException(CsvOptions.PARAMS_HELP_MSG);
		}

		final boolean writeModeFlag = properties.getBoolean(CsvOptions.OPTIONAL_OVER_RIDE_MODE);
		final FileSystem.WriteMode writeMode =
			writeModeFlag ? FileSystem.WriteMode.OVERWRITE : FileSystem.WriteMode.NO_OVERWRITE;
		final String fieldDelim = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_FIELD_DELIM));
		final String lineDelim = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_LINE_DELIM));
		final String quoteCharacter = getJavaEscapedDelim(properties.getString(CsvOptions.OPTIONAL_QUOTE_CHARACTER));
		if (quoteCharacter != null) {
			Preconditions.checkArgument(
				quoteCharacter.length() == 1,
				"quote character should be a single character, " + quoteCharacter + " found.");
		}
		final int parallelism = properties.getInteger(CsvOptions.PARALLELISM, -1);
		Option<Integer> numFiles = parallelism == -1 ? Option.<Integer>apply(null) : new Some<>(parallelism);
		final String timeZone = properties.getString(CsvOptions.OPTIONAL_TIME_ZONE);
		final TimeZone tz = (timeZone == null) ? TimeZone.getTimeZone("UTC") : TimeZone.getTimeZone(timeZone);

		final String updateMode = properties.getString(CsvOptions.OPTIONAL_UPDATE_MODE);

		final boolean outputFieldNames = properties.getBoolean(CsvOptions.OPTIONAL_FIRST_LINE_AS_HEADER);

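		// Update modes: "append" writes insert-only rows, "retract" writes add/delete message
		// pairs, and "upsert" writes updates keyed on the table's unique keys.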
		switch (updateMode.toLowerCase()) {
		case "append":
			return new CsvTableSink(
				path,
				Option.apply(fieldDelim),
				Option.apply(lineDelim),
				Option.apply(quoteCharacter),
				numFiles,
				Option.apply(writeMode),
				Option.apply(outputFieldNames),
				Option.apply(tz)
			).configure(schema.getColumnNames(), schema.getColumnTypes());
		case "retract":
			return new RetractCsvTableSink(
				path,
				Option.apply(fieldDelim),
				Option.apply(lineDelim),
				Option.apply(quoteCharacter),
				numFiles,
				Option.apply(writeMode),
				Option.apply(outputFieldNames),
				Option.apply(tz)
			).configure(schema.getColumnNames(), schema.getColumnTypes());
		case "upsert":
			return new UpsertCsvTableSink(
				path,
				Option.apply(fieldDelim),
				Option.apply(lineDelim),
				Option.apply(quoteCharacter),
				numFiles,
				Option.apply(writeMode),
				Option.apply(outputFieldNames),
				Option.apply(tz)
			).configure(schema.getColumnNames(), schema.getColumnTypes());
		default:
			throw new RuntimeException("Unsupported updateMode: " + updateMode + " for CSV sink.");
		}
	}

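	/**
	 * Unescapes Java escape sequences in a delimiter string (e.g. the two-character sequence
	 * "\\t" becomes a single tab character). Returns the input unchanged if there is nothing to
	 * unescape, and null for null input. Despite the parameter name, this is also used for line
	 * delimiters and quote characters.
	 */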
	public static String getJavaEscapedDelim(String fieldDelim) {
		String unescapedFieldDelim = StringEscapeUtils.unescapeJava(fieldDelim);
		if (fieldDelim != null && !fieldDelim.equals(unescapedFieldDelim)) {
			LOG.info("Field delimiter unescaped from {} to {}.", fieldDelim, unescapedFieldDelim);
		}
		return unescapedFieldDelim;
	}
}
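
A minimal usage sketch (hypothetical and simplified: in practice the factory is discovered through Flink's table factory service from the full property map rather than instantiated directly, and the schema and CsvOptions entries elided below are required):

	Map<String, String> props = new HashMap<>();
	props.put("connector.type", "CSV");           // matches requiredContext()
	props.put("connector.property-version", "1");
	// ... table schema properties and CsvOptions keys (path, delimiters, charset, ...) go here ...

	CsvTableFactory factory = new CsvTableFactory();
	StreamTableSource<BaseRow> source = factory.createStreamTableSource(props);
	StreamTableSink<Object> sink = factory.createStreamTableSink(props);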