org.apache.iceberg.connect.IcebergSinkConfig (iceberg-kafka-connect)
A table format for huge analytic datasets
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.connect;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.iceberg.IcebergBuild;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.kafka.common.config.AbstractConfig;
import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.common.config.ConfigDef.Importance;
import org.apache.kafka.common.config.ConfigException;
import org.apache.kafka.connect.json.JsonConverter;
import org.apache.kafka.connect.json.JsonConverterConfig;
import org.apache.kafka.connect.storage.ConverterConfig;
import org.apache.kafka.connect.storage.ConverterType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class IcebergSinkConfig extends AbstractConfig {
private static final Logger LOG = LoggerFactory.getLogger(IcebergSinkConfig.class.getName());
public static final String INTERNAL_TRANSACTIONAL_SUFFIX_PROP =
"iceberg.coordinator.transactional.suffix";
private static final String ROUTE_REGEX = "route-regex";
private static final String ID_COLUMNS = "id-columns";
private static final String PARTITION_BY = "partition-by";
private static final String COMMIT_BRANCH = "commit-branch";
private static final String CATALOG_PROP_PREFIX = "iceberg.catalog.";
private static final String HADOOP_PROP_PREFIX = "iceberg.hadoop.";
private static final String KAFKA_PROP_PREFIX = "iceberg.kafka.";
private static final String TABLE_PROP_PREFIX = "iceberg.table.";
private static final String AUTO_CREATE_PROP_PREFIX = "iceberg.tables.auto-create-props.";
private static final String WRITE_PROP_PREFIX = "iceberg.table.write-props.";
private static final String CATALOG_NAME_PROP = "iceberg.catalog";
private static final String TABLES_PROP = "iceberg.tables";
private static final String TABLES_DYNAMIC_PROP = "iceberg.tables.dynamic-enabled";
private static final String TABLES_ROUTE_FIELD_PROP = "iceberg.tables.route-field";
private static final String TABLES_DEFAULT_COMMIT_BRANCH = "iceberg.tables.default-commit-branch";
private static final String TABLES_DEFAULT_ID_COLUMNS = "iceberg.tables.default-id-columns";
private static final String TABLES_DEFAULT_PARTITION_BY = "iceberg.tables.default-partition-by";
private static final String TABLES_AUTO_CREATE_ENABLED_PROP =
"iceberg.tables.auto-create-enabled";
private static final String TABLES_EVOLVE_SCHEMA_ENABLED_PROP =
"iceberg.tables.evolve-schema-enabled";
private static final String TABLES_SCHEMA_FORCE_OPTIONAL_PROP =
"iceberg.tables.schema-force-optional";
private static final String TABLES_SCHEMA_CASE_INSENSITIVE_PROP =
"iceberg.tables.schema-case-insensitive";
private static final String CONTROL_TOPIC_PROP = "iceberg.control.topic";
private static final String COMMIT_INTERVAL_MS_PROP = "iceberg.control.commit.interval-ms";
private static final int COMMIT_INTERVAL_MS_DEFAULT = 300_000;
private static final String COMMIT_TIMEOUT_MS_PROP = "iceberg.control.commit.timeout-ms";
private static final int COMMIT_TIMEOUT_MS_DEFAULT = 30_000;
private static final String COMMIT_THREADS_PROP = "iceberg.control.commit.threads";
private static final String CONNECT_GROUP_ID_PROP = "iceberg.connect.group-id";
private static final String HADOOP_CONF_DIR_PROP = "iceberg.hadoop-conf-dir";
private static final String NAME_PROP = "name";
private static final String BOOTSTRAP_SERVERS_PROP = "bootstrap.servers";
private static final String DEFAULT_CATALOG_NAME = "iceberg";
private static final String DEFAULT_CONTROL_TOPIC = "control-iceberg";
public static final String DEFAULT_CONTROL_GROUP_PREFIX = "cg-control-";
public static final int SCHEMA_UPDATE_RETRIES = 2; // 3 total attempts
public static final int CREATE_TABLE_RETRIES = 2; // 3 total attempts
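// COMMA_NO_PARENS_REGEX splits on commas that are not inside parentheses, so a
// partition spec such as "bucket(id, 16),hour(ts)" splits into
// ["bucket(id, 16)", "hour(ts)"] instead of breaking at the comma inside bucket(...).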
@VisibleForTesting static final String COMMA_NO_PARENS_REGEX = ",(?![^()]*+\\))";
public static final ConfigDef CONFIG_DEF = newConfigDef();
public static String version() {
return IcebergBuild.version();
}
private static ConfigDef newConfigDef() {
ConfigDef configDef = new ConfigDef();
configDef.define(
TABLES_PROP,
ConfigDef.Type.LIST,
null,
Importance.HIGH,
"Comma-delimited list of destination tables");
configDef.define(
TABLES_DYNAMIC_PROP,
ConfigDef.Type.BOOLEAN,
false,
Importance.MEDIUM,
"Enable dynamic routing to tables based on a record value");
configDef.define(
TABLES_ROUTE_FIELD_PROP,
ConfigDef.Type.STRING,
null,
Importance.MEDIUM,
"Source record field for routing records to tables");
configDef.define(
TABLES_DEFAULT_COMMIT_BRANCH,
ConfigDef.Type.STRING,
null,
Importance.MEDIUM,
"Default branch for commits");
configDef.define(
TABLES_DEFAULT_ID_COLUMNS,
ConfigDef.Type.STRING,
null,
Importance.MEDIUM,
"Default ID columns for tables, comma-separated");
configDef.define(
TABLES_DEFAULT_PARTITION_BY,
ConfigDef.Type.STRING,
null,
Importance.MEDIUM,
"Default partition spec to use when creating tables, comma-separated");
configDef.define(
TABLES_AUTO_CREATE_ENABLED_PROP,
ConfigDef.Type.BOOLEAN,
false,
Importance.MEDIUM,
"Set to true to automatically create destination tables, false otherwise");
configDef.define(
TABLES_SCHEMA_FORCE_OPTIONAL_PROP,
ConfigDef.Type.BOOLEAN,
false,
Importance.MEDIUM,
"Set to true to set columns as optional during table create and evolution, false to respect schema");
configDef.define(
TABLES_SCHEMA_CASE_INSENSITIVE_PROP,
ConfigDef.Type.BOOLEAN,
false,
Importance.MEDIUM,
"Set to true to look up table columns by case-insensitive name, false for case-sensitive");
configDef.define(
TABLES_EVOLVE_SCHEMA_ENABLED_PROP,
ConfigDef.Type.BOOLEAN,
false,
Importance.MEDIUM,
"Set to true to add any missing record fields to the table schema, false otherwise");
configDef.define(
CATALOG_NAME_PROP,
ConfigDef.Type.STRING,
DEFAULT_CATALOG_NAME,
Importance.MEDIUM,
"Iceberg catalog name");
configDef.define(
CONTROL_TOPIC_PROP,
ConfigDef.Type.STRING,
DEFAULT_CONTROL_TOPIC,
Importance.MEDIUM,
"Name of the control topic");
configDef.define(
CONNECT_GROUP_ID_PROP,
ConfigDef.Type.STRING,
null,
Importance.LOW,
"Name of the Connect consumer group, should not be set under normal conditions");
configDef.define(
COMMIT_INTERVAL_MS_PROP,
ConfigDef.Type.INT,
COMMIT_INTERVAL_MS_DEFAULT,
Importance.MEDIUM,
"Coordinator interval for performing Iceberg table commits, in millis");
configDef.define(
COMMIT_TIMEOUT_MS_PROP,
ConfigDef.Type.INT,
COMMIT_TIMEOUT_MS_DEFAULT,
Importance.MEDIUM,
"Coordinator time to wait for worker responses before committing, in millis");
configDef.define(
COMMIT_THREADS_PROP,
ConfigDef.Type.INT,
Runtime.getRuntime().availableProcessors() * 2,
Importance.MEDIUM,
"Coordinator threads to use for table commits, default is (cores * 2)");
configDef.define(
HADOOP_CONF_DIR_PROP,
ConfigDef.Type.STRING,
null,
Importance.MEDIUM,
"If specified, Hadoop config files in this directory will be loaded");
return configDef;
}
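// Note: the prefixed properties (iceberg.catalog.*, iceberg.hadoop.*, iceberg.kafka.*,
// iceberg.table.*, etc.) are intentionally not defined in the ConfigDef above; they are
// extracted from the raw connector properties by prefix in the constructor below.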
private final Map<String, String> originalProps;
private final Map<String, String> catalogProps;
private final Map<String, String> hadoopProps;
private final Map<String, String> kafkaProps;
private final Map<String, String> autoCreateProps;
private final Map<String, String> writeProps;
private final Map<String, TableSinkConfig> tableConfigMap = Maps.newHashMap();
private final JsonConverter jsonConverter;
public IcebergSinkConfig(Map<String, String> originalProps) {
super(CONFIG_DEF, originalProps);
this.originalProps = originalProps;
this.catalogProps = PropertyUtil.propertiesWithPrefix(originalProps, CATALOG_PROP_PREFIX);
this.hadoopProps = PropertyUtil.propertiesWithPrefix(originalProps, HADOOP_PROP_PREFIX);
this.kafkaProps = Maps.newHashMap(loadWorkerProps());
kafkaProps.putAll(PropertyUtil.propertiesWithPrefix(originalProps, KAFKA_PROP_PREFIX));
this.autoCreateProps =
PropertyUtil.propertiesWithPrefix(originalProps, AUTO_CREATE_PROP_PREFIX);
this.writeProps = PropertyUtil.propertiesWithPrefix(originalProps, WRITE_PROP_PREFIX);
this.jsonConverter = new JsonConverter();
jsonConverter.configure(
ImmutableMap.of(
JsonConverterConfig.SCHEMAS_ENABLE_CONFIG,
false,
ConverterConfig.TYPE_CONFIG,
ConverterType.VALUE.getName()));
validate();
}
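// Kafka client property precedence: worker properties are loaded first, then any
// "iceberg.kafka."-prefixed connector properties overwrite matching keys, so
// sink-level Kafka settings take priority over worker-level ones.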
private void validate() {
checkState(!catalogProps().isEmpty(), "Must specify Iceberg catalog properties");
if (tables() != null) {
checkState(!dynamicTablesEnabled(), "Cannot specify both static and dynamic table names");
} else if (dynamicTablesEnabled()) {
checkState(
tablesRouteField() != null, "Must specify a route field if using dynamic table names");
} else {
throw new ConfigException("Must specify table name(s)");
}
}
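// Examples of configs rejected above: setting both iceberg.tables and
// iceberg.tables.dynamic-enabled=true, or enabling dynamic routing without
// iceberg.tables.route-field.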
private void checkState(boolean condition, String msg) {
if (!condition) {
throw new ConfigException(msg);
}
}
public String connectorName() {
return originalProps.get(NAME_PROP);
}
public String transactionalSuffix() {
// this is for internal use and is not part of the config definition...
return originalProps.get(INTERNAL_TRANSACTIONAL_SUFFIX_PROP);
}
public Map<String, String> catalogProps() {
return catalogProps;
}
public Map<String, String> hadoopProps() {
return hadoopProps;
}
public Map<String, String> kafkaProps() {
return kafkaProps;
}
public Map<String, String> autoCreateProps() {
return autoCreateProps;
}
public Map<String, String> writeProps() {
return writeProps;
}
public String catalogName() {
return getString(CATALOG_NAME_PROP);
}
public List<String> tables() {
return getList(TABLES_PROP);
}
public boolean dynamicTablesEnabled() {
return getBoolean(TABLES_DYNAMIC_PROP);
}
public String tablesRouteField() {
return getString(TABLES_ROUTE_FIELD_PROP);
}
public String tablesDefaultCommitBranch() {
return getString(TABLES_DEFAULT_COMMIT_BRANCH);
}
public String tablesDefaultIdColumns() {
return getString(TABLES_DEFAULT_ID_COLUMNS);
}
public String tablesDefaultPartitionBy() {
return getString(TABLES_DEFAULT_PARTITION_BY);
}
public TableSinkConfig tableConfig(String tableName) {
return tableConfigMap.computeIfAbsent(
tableName,
notUsed -> {
Map<String, String> tableConfig =
PropertyUtil.propertiesWithPrefix(originalProps, TABLE_PROP_PREFIX + tableName + ".");
String routeRegexStr = tableConfig.get(ROUTE_REGEX);
Pattern routeRegex = routeRegexStr == null ? null : Pattern.compile(routeRegexStr);
String idColumnsStr = tableConfig.getOrDefault(ID_COLUMNS, tablesDefaultIdColumns());
List<String> idColumns = stringToList(idColumnsStr, ",");
String partitionByStr =
tableConfig.getOrDefault(PARTITION_BY, tablesDefaultPartitionBy());
List<String> partitionBy = stringToList(partitionByStr, COMMA_NO_PARENS_REGEX);
String commitBranch =
tableConfig.getOrDefault(COMMIT_BRANCH, tablesDefaultCommitBranch());
return new TableSinkConfig(routeRegex, idColumns, partitionBy, commitBranch);
});
}
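// Per-table overrides embed the table name in the key, e.g.
// "iceberg.table.db.events.id-columns=id" overrides iceberg.tables.default-id-columns
// for the table db.events.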
@VisibleForTesting
static List<String> stringToList(String value, String regex) {
if (value == null || value.isEmpty()) {
return ImmutableList.of();
}
return Arrays.stream(value.split(regex)).map(String::trim).collect(Collectors.toList());
}
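// e.g. stringToList("id, category", ",") returns ["id", "category"]; each element
// is trimmed after the split.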
public String controlTopic() {
return getString(CONTROL_TOPIC_PROP);
}
public String connectGroupId() {
String result = getString(CONNECT_GROUP_ID_PROP);
if (result != null) {
return result;
}
String connectorName = connectorName();
Preconditions.checkNotNull(connectorName, "Connector name cannot be null");
return "connect-" + connectorName;
}
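// The "connect-" + connectorName fallback mirrors the consumer group id that the
// Kafka Connect runtime assigns to a sink connector's tasks by default.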
public int commitIntervalMs() {
return getInt(COMMIT_INTERVAL_MS_PROP);
}
public int commitTimeoutMs() {
return getInt(COMMIT_TIMEOUT_MS_PROP);
}
public int commitThreads() {
return getInt(COMMIT_THREADS_PROP);
}
public String hadoopConfDir() {
return getString(HADOOP_CONF_DIR_PROP);
}
public boolean autoCreateEnabled() {
return getBoolean(TABLES_AUTO_CREATE_ENABLED_PROP);
}
public boolean evolveSchemaEnabled() {
return getBoolean(TABLES_EVOLVE_SCHEMA_ENABLED_PROP);
}
public boolean schemaForceOptional() {
return getBoolean(TABLES_SCHEMA_FORCE_OPTIONAL_PROP);
}
public boolean schemaCaseInsensitive() {
return getBoolean(TABLES_SCHEMA_CASE_INSENSITIVE_PROP);
}
public JsonConverter jsonConverter() {
return jsonConverter;
}
/**
* This method attempts to load the Kafka Connect worker properties, which are not exposed to
* connectors. It does this by parsing the Java command used to launch the worker, extracting the
* name of the properties file, and then loading the file.
*
* <p>The sink uses these properties, if available, when initializing its internal Kafka clients.
* By doing this, Kafka-related properties only need to be set in the worker properties and do
* not need to be duplicated in the sink config.
*
* <p>If the worker properties cannot be loaded, then Kafka-related properties must be set via
* the `iceberg.kafka.*` sink configs.
*
* @return The Kafka Connect worker properties
*/
private Map<String, String> loadWorkerProps() {
String javaCmd = System.getProperty("sun.java.command");
if (javaCmd != null && !javaCmd.isEmpty()) {
List<String> args = Splitter.on(' ').splitToList(javaCmd);
if (args.size() > 1
&& (args.get(0).endsWith(".ConnectDistributed")
|| args.get(0).endsWith(".ConnectStandalone"))) {
Properties result = new Properties();
try (InputStream in = Files.newInputStream(Paths.get(args.get(1)))) {
result.load(in);
// sanity check that this is the config we want
if (result.containsKey(BOOTSTRAP_SERVERS_PROP)) {
return Maps.fromProperties(result);
}
} catch (Exception e) {
// NO-OP
}
}
}
LOG.info(
"Worker properties not loaded, using only {}* properties for Kafka clients",
KAFKA_PROP_PREFIX);
return ImmutableMap.of();
}
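// For example, a worker launched as
//   java ... org.apache.kafka.connect.cli.ConnectDistributed /path/to/worker.properties
// exposes that command line via sun.java.command, and the second argument is the
// properties file loaded above (the path here is illustrative).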
}
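
A minimal usage sketch follows, assuming a hypothetical connector configuration; the catalog type, URI, table name, and commit interval below are illustrative values, not defaults shipped with the connector.

import java.util.HashMap;
import java.util.Map;
import org.apache.iceberg.connect.IcebergSinkConfig;

public class IcebergSinkConfigExample {
  public static void main(String[] args) {
    // Hypothetical connector properties; in practice the Connect runtime
    // supplies these (including "name") from the connector configuration.
    Map<String, String> props = new HashMap<>();
    props.put("name", "events-sink");
    props.put("iceberg.catalog.type", "rest");
    props.put("iceberg.catalog.uri", "http://localhost:8181");
    props.put("iceberg.tables", "db.events");
    props.put("iceberg.control.commit.interval-ms", "60000");

    IcebergSinkConfig config = new IcebergSinkConfig(props);
    System.out.println(config.catalogName());      // "iceberg" (the default)
    System.out.println(config.tables());           // [db.events]
    System.out.println(config.catalogProps());     // e.g. {type=rest, uri=http://localhost:8181}
    System.out.println(config.commitIntervalMs()); // 60000
  }
}

Note how the "iceberg.catalog." prefix is stripped when building catalogProps(), while iceberg.catalog itself (unset here) would name the catalog; the validation in the constructor passes because catalog properties and a static table list are both present.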