All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.axual.connect.plugins.adls.gen2.AdlsGen2SinkConfig Maven / Gradle / Ivy

There is a newer version: 1.2.2
Show newest version
package io.axual.connect.plugins.adls.gen2;

/*-
 * ========================LICENSE_START=================================
 * Azure Data Lake Storage Gen2 Sink Connector for Kafka Connect
 * %%
 * Copyright (C) 2021 Axual B.V.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =========================LICENSE_END==================================
 */

import com.azure.storage.file.datalake.DataLakeServiceClient;

import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.kafka.common.config.AbstractConfig;
import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.common.config.ConfigException;
import org.apache.kafka.common.config.ConfigValue;
import org.apache.kafka.common.config.types.Password;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.time.ZoneId;
import java.time.temporal.ChronoUnit;
import java.util.Map;
import java.util.function.BinaryOperator;
import java.util.regex.Pattern;

import io.axual.connect.plugins.adls.gen2.authentication.AccountKeyClientProvider;
import io.axual.connect.plugins.adls.gen2.authentication.ClientSecretClientProvider;
import io.axual.connect.plugins.adls.gen2.authentication.ServiceClientProvider;
import io.axual.connect.plugins.adls.gen2.authentication.SharedAccessSignatureClientProvider;
import io.axual.connect.plugins.adls.gen2.exceptions.AdlsGen2ConfigurationException;
import io.axual.connect.plugins.adls.gen2.storage.AdlsGen2StorageProvider;
import io.axual.connect.plugins.adls.gen2.storage.LocalStorageProvider;
import io.axual.connect.plugins.adls.gen2.storage.StorageProvider;

import static io.axual.connect.plugins.adls.gen2.RetryHelper.RetryType.EXPONENTIAL_INTERVAL;
import static io.axual.connect.plugins.adls.gen2.RetryHelper.RetryType.FIXED_INTERVAL;

/**
 * The configuration object for the Azure DataLake Gen2 Sink Connector.
 * It contains the configuration definitions and validations, as well as helper methods to retrieve
 * the configuration values.
 */
public class AdlsGen2SinkConfig extends AbstractConfig {
    /** Configuration key holding the Azure DataLake Storage endpoint URL. */
    public static final String ADLS_ENDPONT_CONFIG = "adls.endpoint";
    /** Documentation text for {@link #ADLS_ENDPONT_CONFIG}; includes an example of the endpoint URL shape. */
    public static final String ADLS_ENDPONT_DOC = "The Azure DataLake Storage endpoint to connect to." +
            " This usually looks like https://<account-name>.dfs.core.windows.net";

    // Call timeout of the Azure DataLake Storage client, in seconds (default 15).
    public static final String ADLS_CLIENT_TIMEOUT_CONFIG = "adls.client.timeout.seconds";
    public static final Integer ADLS_CLIENT_TIMEOUT_DEFAULT = 15;
    public static final String ADLS_CLIENT_TIMEOUT_DOC = "The maximum number of seconds the Azure DataLake Storage client will wait for a call to return before failing";

    // Maximum number of tries of the Azure DataLake Storage client (default 4).
    public static final String ADLS_CLIENT_RETRY_COUNT_CONFIG = "adls.client.retry.maximum.tries";
    public static final Integer ADLS_CLIENT_RETRY_COUNT_DEFAULT = 4;
    public static final String ADLS_CLIENT_RETRY_COUNT_DOC = "The maximum number of times the Azure DataLake Storage client will retry a call before failing";

    // Switch between fixed and exponential retry intervals for the client (default: fixed).
    public static final String ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_CONFIG = "adls.client.retry.exponential";
    public static final Boolean ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_DEFAULT = false;
    public static final String ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_DOC = "Selects if the Azure DataLake Storage Client will use exponential backoff for retries";

    // Client retry interval in milliseconds (default 10 seconds).
    public static final String ADLS_CLIENT_RETRY_INTERVAL_CONFIG = "adls.client.retry.interval";
    public static final Long ADLS_CLIENT_RETRY_INTERVAL_DEFAULT = 10_000L;
    public static final String ADLS_CLIENT_RETRY_INTERVAL_DOC = "The number of milliseconds the Azure DataLake Storage client will wait before retrying. When exponential retry is enabled, this value is doubled for each retry up to the maximum retry timeout.";

    // Upper bound of the client retry interval in milliseconds (default 60 seconds).
    public static final String ADLS_CLIENT_RETRY_INTERVAL_MAX_CONFIG = "adls.client.retry.maximum.interval";
    public static final Long ADLS_CLIENT_RETRY_INTERVAL_MAX_DEFAULT = 60_000L;
    public static final String ADLS_CLIENT_RETRY_INTERVAL_MAX_DOC = "The maximum number of milliseconds the Azure DataLake Storage client will wait before retrying when using exponential backoff";

    /** Configuration key selecting how the connector authenticates against the storage account. */
    public static final String ADLS_AUTHENTICATION_METHOD_CONFIG = "adls.auth.method";
    /** Authentication method value: storage account name plus account key. */
    public static final String ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY = "AccountKey";
    /** Authentication method value: Shared Access Signature token. */
    public static final String ADLS_AUTHENTICATION_METHOD_SAS_TOKEN = "SasToken";
    /** Authentication method value: Azure AD client secret. */
    public static final String ADLS_AUTHENTICATION_METHOD_CLIENT_SECRET = "ClientSecret";
    /** Account key authentication is used when no method is configured. */
    public static final String ADLS_AUTHENTICATION_METHOD_DEFAULT = ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY;
    // The pieces are concatenated, so each one carries its own separating space.
    public static final String ADLS_AUTHENTICATION_METHOD_DOC = "The authentication method used to connect to the storage account. " +
            "Connections can be made using Account Keys, Shared Access Signature Tokens and Azure Active Directory using Client Secrets. Azure AD with Client Certificates are not yet supported. Valid values are " +
            ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY + ", " + ADLS_AUTHENTICATION_METHOD_SAS_TOKEN + ", " + ADLS_AUTHENTICATION_METHOD_CLIENT_SECRET;
    /** Restricts the configured value to exactly the three supported method names. */
    private static final ConfigDef.Validator ADLS_AUTHENTICATION_METHOD_VALIDATOR = ConfigDef.ValidString.in(ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY, ADLS_AUTHENTICATION_METHOD_SAS_TOKEN, ADLS_AUTHENTICATION_METHOD_CLIENT_SECRET);

    // Storage container that receives the records; defined without a default value.
    public static final String ADLS_CONTAINER_NAME_CONFIG = "adls.container.name";
    public static final String ADLS_CONTAINER_NAME_DOC = "The name of the storage container to use for storing the records.";

    // Account name; defaults to null and is checked together with the authentication method.
    public static final String ADLS_ACCOUNT_NAME_CONFIG = "adls.account.name";
    public static final String ADLS_ACCOUNT_NAME_DEFAULT = null;
    public static final String ADLS_ACCOUNT_NAME_DOC = "The name of the Azure Storage account";

    // Account key; defaults to null and is defined as a PASSWORD type setting.
    public static final String ADLS_ACCOUNT_KEY_CONFIG = "adls.account.key";
    public static final String ADLS_ACCOUNT_KEY_DEFAULT = null;
    public static final String ADLS_ACCOUNT_KEY_DOC = "The account key to use for this Azure Storage account";

    // Shared Access Signature token; defaults to null and is defined as a PASSWORD type setting.
    public static final String ADLS_SAS_TOKEN_CONFIG = "adls.sas.token";
    public static final String ADLS_SAS_TOKEN_DEFAULT = null;
    public static final String ADLS_SAS_TOKEN_DOC = "The Shared Access Signature token to use to connect to Azure";

    // Azure AD tenant id; defaults to null.
    public static final String ADLS_TENANT_ID_CONFIG = "adls.tenant.id";
    public static final String ADLS_TENANT_ID_DEFAULT = null;
    public static final String ADLS_TENANT_ID_DOC = "The id of the Azure Tenant for the Azure AD user/application registration";

    // Azure AD client id; defaults to null.
    public static final String ADLS_CLIENT_ID_CONFIG = "adls.client.id";
    public static final String ADLS_CLIENT_ID_DEFAULT = null;
    public static final String ADLS_CLIENT_ID_DOC = "The id of the client in the Azure AD user/application registration";

    // Azure AD client secret; defaults to null and is defined as a PASSWORD type setting.
    public static final String ADLS_CLIENT_SECRET_CONFIG = "adls.client.secret";
    public static final String ADLS_CLIENT_SECRET_DEFAULT = null;
    public static final String ADLS_CLIENT_SECRET_DOC = "The secret for the client in the Azure AD user/application registration";

    // Base directory; an empty value is accepted because the validator is created with requirePath = false.
    public static final String BASE_DIRECTORY_CONFIG = "base.directory";
    public static final String BASE_DIRECTORY_DEFAULT = "";
    public static final String BASE_DIRECTORY_DOC = "The directory where the target and staging directories are placed in";
    public static final ConfigDef.Validator BASE_DIRECTORY_VALIDATOR = new PathValidator(false);

    // Target directory; must be non-empty (requirePath = true) and defaults to "target".
    public static final String TARGET_DIRECTORY_CONFIG = "target.directory";
    public static final String TARGET_DIRECTORY_DEFAULT = "target";
    public static final String TARGET_DIRECTORY_DOC = "The target directory to write the records to. The path can contain java.time.format.DateTimeFormatter patterns to add timestamp rotation to the target directories";
    public static final ConfigDef.Validator TARGET_DIRECTORY_VALIDATOR = new PathValidator(true);

    // Staging directory; must be non-empty (requirePath = true) and defaults to "staging".
    public static final String STAGING_DIRECTORY_CONFIG = "staging.directory";
    public static final String STAGING_DIRECTORY_DEFAULT = "staging";
    public static final String STAGING_DIRECTORY_DOC = "The directory where the staging data is written. Inside this staging directory the target directory format is stored";
    public static final ConfigDef.Validator STAGING_DIRECTORY_VALIDATOR = new PathValidator(true);

    /** Time source value: the moment the connector processes the record. */
    public static final String ROTATION_TIME_SOURCE_PROCESSED = "processed";
    /** Time source value: the timestamp carried by the record itself. */
    public static final String ROTATION_TIME_SOURCE_PRODUCED = "produced";
    /** Configuration key selecting the timestamp source for time based file rotation. */
    public static final String ROTATION_TIME_SOURCE_CONFIG = "rotation.time.source";
    /** Processing time is used when no source is configured. */
    public static final String ROTATION_TIME_SOURCE_DEFAULT = ROTATION_TIME_SOURCE_PROCESSED;
    public static final String ROTATION_TIME_SOURCE_DOC = "The source of the timestamp used as part of the time based file rotation. Use '" +
            ROTATION_TIME_SOURCE_PROCESSED + "', which is the default, to use the time when the connector is processing the record. Use '" +
            ROTATION_TIME_SOURCE_PRODUCED + "' to use the record timestamp for the file rotation";
    /** Restricts the configured value to the two supported time sources. */
    private static final ConfigDef.Validator ROTATION_TIME_SOURCE_VALIDATOR = ConfigDef.ValidString.in(ROTATION_TIME_SOURCE_PROCESSED, ROTATION_TIME_SOURCE_PRODUCED);

    /** Configuration key holding the zone id used for file rotation. */
    public static final String ROTATION_TIME_ZONE_CONFIG = "rotation.time.zone";
    /** UTC is used when no zone is configured. */
    public static final String ROTATION_TIME_ZONE_DEFAULT = "UTC";
    public static final String ROTATION_TIME_ZONE_DOC = "The Java ZoneID name to use for file rotation. This can be a value like `GMT`, `UTC`, `Europe/Amsterdam` or `UTC+1:00`";
    /**
     * Rejects null and any String that {@link ZoneId#of(String)} cannot resolve.
     * Values that are not Strings are not inspected.
     */
    private static final ConfigDef.Validator ROTATION_TIME_ZONE_VALIDATOR = (c, v) -> {
        // Checked before the try block so this exception is not caught and wrapped a second time,
        // which would repeat the "Invalid value ..." prefix in the reported message.
        if (v == null) {
            throw new ConfigException(c, v, "Null is not a valid zone id");
        }
        if (v instanceof String) {
            try {
                ZoneId.of((String) v);
            } catch (java.time.DateTimeException e) {
                // Covers both malformed ids and unknown regions (ZoneRulesException is a subclass)
                throw new ConfigException(c, v, e.getMessage());
            }
        }
    };

    // Record count that triggers a file rotation (default 100).
    public static final String ROTATION_RECORD_COUNT_CONFIG = "rotation.record.count";
    public static final Integer ROTATION_RECORD_COUNT_DEFAULT = 100;
    public static final String ROTATION_RECORD_COUNT_DOC = "The maximum number of records in a file. A file rotation will take place when this limit is reached.";

    // Inactivity period in milliseconds that triggers a file rotation (default: 30 minutes converted to milliseconds).
    public static final String ROTATION_INACTIVITY_CONFIG = "rotation.inactivity";
    public static final Long ROTATION_INACTIVITY_DEFAULT = Duration.of(30, ChronoUnit.MINUTES).toMillis();
    public static final String ROTATION_INACTIVITY_DOC = "The number of milliseconds of inactivity to wait for new incoming records before rotating to a new file. This prevents a file to remain in staging when no new data comes in.";

    // File size that triggers a file rotation (default 100,000,000; the unit is not stated here, presumably bytes - TODO confirm).
    public static final String ROTATION_FILE_SIZE_CONFIG = "rotation.filesize";
    public static final Long ROTATION_FILE_SIZE_DEFAULT = 100L*1000*1000;
    public static final String ROTATION_FILE_SIZE_DOC = "The maximum filesize of the container file. A file rotation will take place when this limit is reached.";

    // Whether offsets are committed for rotated files only (default true).
    public static final String COMMIT_ROTATED_ONLY_CONFIG = "commit.rotated.only";
    public static final Boolean COMMIT_ROTATED_ONLY_DEFAULT = Boolean.TRUE;
    public static final String COMMIT_ROTATED_ONLY_DOC = "If set to true only the offsets of the records in a rotated file are committed. When false the offsets of records in staging files are committed as well. Default is true.";

    // Record count after which a task requests an offset commit (default 100).
    public static final String COMMIT_RECORD_COUNT_CONFIG = "commit.record.count";
    public static final Integer COMMIT_RECORD_COUNT_DEFAULT = 100;
    public static final String COMMIT_RECORD_COUNT_DOC = "The maximum number of records processed by a task before requesting a commit offsets when the setting to commit only rotated files is disabled";

    // Connector-level retry settings; these feed the RetryHelper built by getRetryHelper().
    public static final String RETRY_COUNT_CONFIG = "retry.maximum.tries";
    public static final Integer RETRY_COUNT_DEFAULT = 10;
    public static final String RETRY_COUNT_DOC = "The maximum number of times to retry an action before failing";

    // Exponential backoff is enabled by default for connector-level retries.
    public static final String RETRY_EXPONENTIAL_INTERVAL_CONFIG = "retry.exponential";
    public static final Boolean RETRY_EXPONENTIAL_INTERVAL_DEFAULT = true;
    public static final String RETRY_EXPONENTIAL_INTERVAL_DOC = "Use exponential backoff for retries";

    // Retry interval in milliseconds (default 500).
    public static final String RETRY_INTERVAL_CONFIG = "retry.interval";
    public static final Long RETRY_INTERVAL_DEFAULT = 500L;
    public static final String RETRY_INTERVAL_DOC = "The number of milliseconds to wait before retrying";

    // Upper bound of the retry interval in milliseconds (default 15 seconds).
    public static final String RETRY_INTERVAL_MAX_CONFIG = "retry.maximum.interval";
    public static final Long RETRY_INTERVAL_MAX_DEFAULT = 15_000L;
    public static final String RETRY_INTERVAL_MAX_DOC = "The maximum number of milliseconds to wait before retrying when using exponential backoff";

    /** Compression value selecting no compression. */
    public static final String COMPRESSION_TYPE_NULL = "null";
    /** Compression value selecting Snappy compression. */
    public static final String COMPRESSION_TYPE_SNAPPY = "snappy";
    /** Configuration key selecting the compression of the container file. */
    public static final String COMPRESSION_TYPE_CONFIG = "compression";
    /** No compression is used when nothing is configured. */
    public static final String COMPRESSION_TYPE_DEFAULT = COMPRESSION_TYPE_NULL;
    public static final String COMPRESSION_TYPE_DOC = "Set the compression type for the container file. Valid values are '" +
            COMPRESSION_TYPE_NULL + "' (default, no compression), '" + COMPRESSION_TYPE_SNAPPY + "'";
    /** Restricts the configured value to the two supported compression names. */
    private static final ConfigDef.Validator COMPRESSION_TYPE_VALIDATOR = ConfigDef.ValidString.in(COMPRESSION_TYPE_NULL, COMPRESSION_TYPE_SNAPPY);

    /** Configuration key for the approximate Avro block size. */
    public static final String SYNC_INTERVAL_CONFIG = "sync.interval";
    public static final String SYNC_INTERVAL_DOC = "The approximate number of uncompressed bytes to write in each Avro block. A higher number will result in less calls to the Azure DataLake Storage, better compression and higher throughput. Valid values range from 32 to 2^30. Suggested values are between 2K and 2M";
    /** Falls back to the default sync interval declared by Avro's {@link DataFileConstants}. */
    public static final Integer SYNC_INTERVAL_DEFAULT = DataFileConstants.DEFAULT_SYNC_INTERVAL;
    /** Accepts values from 32 up to and including 2^30. */
    public static final ConfigDef.Validator SYNC_INTERVAL_VALIDATOR = ConfigDef.Range.between(32, (1 << 30));

    // Shared definition built once by initConfig(); config() hands out copies of it and the
    // constructor uses it to validate the supplied properties.
    private static final AdlsGen2ConfigDef CONFIG = initConfig();

    /**
     * Create a new configuration object using the provided properties
     *
     * @param originals the configuration properties to use
     */
    public AdlsGen2SinkConfig(Map originals) {
        super(config(), originals);
        CONFIG.validate(originals).stream()
                .filter(cv -> !cv.errorMessages().isEmpty())
                .map(cv -> cv.name() + " : " + System.lineSeparator() + "\t" + String.join(System.lineSeparator() + "\t", cv.errorMessages()))
                .reduce((a, b) -> a + System.lineSeparator() + b)
                .ifPresent(errors -> {
                    throw new ConfigException("Found validation errors." + System.lineSeparator() + errors);
                });
    }

    /**
     * Build the configuration definition of the connector.
     * The settings are registered one by one on a typed local, in the same order as before,
     * so the result needs no downcast.
     *
     * @return the definition holding every published configuration option
     */
    private static AdlsGen2ConfigDef initConfig() {
        final AdlsGen2ConfigDef definition = new AdlsGen2ConfigDef();

        // Connection basics
        definition.define(ADLS_ENDPONT_CONFIG, ConfigDef.Type.STRING, ConfigDef.NO_DEFAULT_VALUE, ConfigDef.Importance.HIGH, ADLS_ENDPONT_DOC);
        definition.define(ADLS_CONTAINER_NAME_CONFIG, ConfigDef.Type.STRING, ConfigDef.NO_DEFAULT_VALUE, ConfigDef.Importance.HIGH, ADLS_CONTAINER_NAME_DOC);
        definition.define(ADLS_AUTHENTICATION_METHOD_CONFIG, ConfigDef.Type.STRING, ADLS_AUTHENTICATION_METHOD_DEFAULT, ADLS_AUTHENTICATION_METHOD_VALIDATOR, ConfigDef.Importance.HIGH, ADLS_AUTHENTICATION_METHOD_DOC);

        // ADLS Client Retry settings
        definition.define(ADLS_CLIENT_TIMEOUT_CONFIG, ConfigDef.Type.INT, ADLS_CLIENT_TIMEOUT_DEFAULT, ConfigDef.Importance.MEDIUM, ADLS_CLIENT_TIMEOUT_DOC);
        definition.define(ADLS_CLIENT_RETRY_COUNT_CONFIG, ConfigDef.Type.INT, ADLS_CLIENT_RETRY_COUNT_DEFAULT, ConfigDef.Importance.MEDIUM, ADLS_CLIENT_RETRY_COUNT_DOC);
        definition.define(ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_CONFIG, ConfigDef.Type.BOOLEAN, ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_DEFAULT, ConfigDef.Importance.MEDIUM, ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_DOC);
        definition.define(ADLS_CLIENT_RETRY_INTERVAL_CONFIG, ConfigDef.Type.LONG, ADLS_CLIENT_RETRY_INTERVAL_DEFAULT, ConfigDef.Importance.MEDIUM, ADLS_CLIENT_RETRY_INTERVAL_DOC);
        definition.define(ADLS_CLIENT_RETRY_INTERVAL_MAX_CONFIG, ConfigDef.Type.LONG, ADLS_CLIENT_RETRY_INTERVAL_MAX_DEFAULT, ConfigDef.Importance.MEDIUM, ADLS_CLIENT_RETRY_INTERVAL_MAX_DOC);

        // Account Key Authentication
        definition.define(ADLS_ACCOUNT_NAME_CONFIG, ConfigDef.Type.STRING, ADLS_ACCOUNT_NAME_DEFAULT, ConfigDef.Importance.HIGH, ADLS_ACCOUNT_NAME_DOC);
        definition.define(ADLS_ACCOUNT_KEY_CONFIG, ConfigDef.Type.PASSWORD, ADLS_ACCOUNT_KEY_DEFAULT, ConfigDef.Importance.HIGH, ADLS_ACCOUNT_KEY_DOC);

        // Shared Access Signature Authentication
        definition.define(ADLS_SAS_TOKEN_CONFIG, ConfigDef.Type.PASSWORD, ADLS_SAS_TOKEN_DEFAULT, ConfigDef.Importance.HIGH, ADLS_SAS_TOKEN_DOC);

        // Azure AD Client Secret Authentication
        definition.define(ADLS_TENANT_ID_CONFIG, ConfigDef.Type.STRING, ADLS_TENANT_ID_DEFAULT, ConfigDef.Importance.HIGH, ADLS_TENANT_ID_DOC);
        definition.define(ADLS_CLIENT_ID_CONFIG, ConfigDef.Type.STRING, ADLS_CLIENT_ID_DEFAULT, ConfigDef.Importance.HIGH, ADLS_CLIENT_ID_DOC);
        definition.define(ADLS_CLIENT_SECRET_CONFIG, ConfigDef.Type.PASSWORD, ADLS_CLIENT_SECRET_DEFAULT, ConfigDef.Importance.HIGH, ADLS_CLIENT_SECRET_DOC);

        // Directory settings
        definition.define(BASE_DIRECTORY_CONFIG, ConfigDef.Type.STRING, BASE_DIRECTORY_DEFAULT, BASE_DIRECTORY_VALIDATOR, ConfigDef.Importance.LOW, BASE_DIRECTORY_DOC);
        definition.define(TARGET_DIRECTORY_CONFIG, ConfigDef.Type.STRING, TARGET_DIRECTORY_DEFAULT, TARGET_DIRECTORY_VALIDATOR, ConfigDef.Importance.HIGH, TARGET_DIRECTORY_DOC);
        definition.define(STAGING_DIRECTORY_CONFIG, ConfigDef.Type.STRING, STAGING_DIRECTORY_DEFAULT, STAGING_DIRECTORY_VALIDATOR, ConfigDef.Importance.MEDIUM, STAGING_DIRECTORY_DOC);

        // Additional offset commit triggers
        definition.define(COMMIT_ROTATED_ONLY_CONFIG, ConfigDef.Type.BOOLEAN, COMMIT_ROTATED_ONLY_DEFAULT, ConfigDef.Importance.MEDIUM, COMMIT_ROTATED_ONLY_DOC);
        definition.define(COMMIT_RECORD_COUNT_CONFIG, ConfigDef.Type.INT, COMMIT_RECORD_COUNT_DEFAULT, ConfigDef.Importance.MEDIUM, COMMIT_RECORD_COUNT_DOC);

        // Rotation settings
        definition.define(ROTATION_INACTIVITY_CONFIG, ConfigDef.Type.LONG, ROTATION_INACTIVITY_DEFAULT, ConfigDef.Importance.MEDIUM, ROTATION_INACTIVITY_DOC);
        definition.define(ROTATION_RECORD_COUNT_CONFIG, ConfigDef.Type.INT, ROTATION_RECORD_COUNT_DEFAULT, ConfigDef.Importance.MEDIUM, ROTATION_RECORD_COUNT_DOC);
        definition.define(ROTATION_FILE_SIZE_CONFIG, ConfigDef.Type.LONG, ROTATION_FILE_SIZE_DEFAULT, ConfigDef.Importance.MEDIUM, ROTATION_FILE_SIZE_DOC);
        definition.define(ROTATION_TIME_SOURCE_CONFIG, ConfigDef.Type.STRING, ROTATION_TIME_SOURCE_DEFAULT, ROTATION_TIME_SOURCE_VALIDATOR, ConfigDef.Importance.MEDIUM, ROTATION_TIME_SOURCE_DOC);
        definition.define(ROTATION_TIME_ZONE_CONFIG, ConfigDef.Type.STRING, ROTATION_TIME_ZONE_DEFAULT, ROTATION_TIME_ZONE_VALIDATOR, ConfigDef.Importance.MEDIUM, ROTATION_TIME_ZONE_DOC);

        // Retry settings
        definition.define(RETRY_COUNT_CONFIG, ConfigDef.Type.INT, RETRY_COUNT_DEFAULT, ConfigDef.Importance.MEDIUM, RETRY_COUNT_DOC);
        definition.define(RETRY_EXPONENTIAL_INTERVAL_CONFIG, ConfigDef.Type.BOOLEAN, RETRY_EXPONENTIAL_INTERVAL_DEFAULT, ConfigDef.Importance.MEDIUM, RETRY_EXPONENTIAL_INTERVAL_DOC);
        definition.define(RETRY_INTERVAL_CONFIG, ConfigDef.Type.LONG, RETRY_INTERVAL_DEFAULT, ConfigDef.Importance.MEDIUM, RETRY_INTERVAL_DOC);
        definition.define(RETRY_INTERVAL_MAX_CONFIG, ConfigDef.Type.LONG, RETRY_INTERVAL_MAX_DEFAULT, ConfigDef.Importance.MEDIUM, RETRY_INTERVAL_MAX_DOC);

        // Compression settings
        definition.define(COMPRESSION_TYPE_CONFIG, ConfigDef.Type.STRING, COMPRESSION_TYPE_DEFAULT, COMPRESSION_TYPE_VALIDATOR, ConfigDef.Importance.MEDIUM, COMPRESSION_TYPE_DOC);

        // Avro ADLS Synchronisation settings
        definition.define(SYNC_INTERVAL_CONFIG, ConfigDef.Type.INT, SYNC_INTERVAL_DEFAULT, SYNC_INTERVAL_VALIDATOR, ConfigDef.Importance.MEDIUM, SYNC_INTERVAL_DOC);

        return definition;
    }

    /**
     * Get the configuration definition object.
     * Each call wraps the shared definition in a new {@code AdlsGen2ConfigDef} instance
     * instead of returning the shared instance itself.
     *
     * @return An instance of ConfigDef containing the configuration definitions to use
     */
    public static ConfigDef config() {
        return new AdlsGen2ConfigDef(CONFIG);
    }

    /**
     * Read the value configured under {@code adls.endpoint}.
     *
     * @return the Azure DataLake Storage endpoint
     */
    public String getEndpoint() {
        return getString(ADLS_ENDPONT_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.container.name}.
     *
     * @return the Azure DataLake Storage container name
     */
    public String getContainerName() {
        return getString(ADLS_CONTAINER_NAME_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.auth.method}.
     *
     * @return the authentication method to use to connect to Azure DataLake Storage
     */
    public String getAuthenticationMethod() {
        return getString(ADLS_AUTHENTICATION_METHOD_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.account.name}.
     *
     * @return the account name to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public String getAccountName() {
        return getString(ADLS_ACCOUNT_NAME_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.account.key}.
     *
     * @return the account key to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public Password getAccountKey() {
        return getPassword(ADLS_ACCOUNT_KEY_CONFIG);
    }

    /**
     * Read the value configured under {@code base.directory}, with leading and trailing
     * whitespace removed.
     *
     * @return the base directory path to use when storing data
     */
    public String getBaseDirectory() {
        return getString(BASE_DIRECTORY_CONFIG).trim();
    }

    /**
     * Read the value configured under {@code target.directory}, with leading and trailing
     * whitespace removed.
     *
     * @return the target directory pattern to use for storing the final version of a container file
     */
    public String getTargetDirectory() {
        return getString(TARGET_DIRECTORY_CONFIG).trim();
    }

    /**
     * Read the value configured under {@code staging.directory}, with leading and trailing
     * whitespace removed.
     *
     * @return the directory where the currently loaded container file should be stored before it's moved to the target directory
     */
    public String getStagingDirectory() {
        return getString(STAGING_DIRECTORY_CONFIG).trim();
    }

    /**
     * Read the value configured under {@code adls.sas.token}.
     *
     * @return the Shared Access Signature token to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public Password getSasToken() {
        return getPassword(ADLS_SAS_TOKEN_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.tenant.id}.
     *
     * @return the Azure AD Tenant ID to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public String getTenantId() {
        return getString(ADLS_TENANT_ID_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.id}.
     *
     * @return the Azure AD Client ID to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public String getClientId() {
        return getString(ADLS_CLIENT_ID_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.secret}.
     *
     * @return the Azure AD Client Secret to use to connect to Azure DataLake Storage; the setting defaults to null
     */
    public Password getClientSecret() {
        return getPassword(ADLS_CLIENT_SECRET_CONFIG);
    }

    /**
     * Read the value configured under {@code rotation.inactivity}.
     *
     * @return the number of milliseconds to wait before a staged file is marked as inactive and rotated to the target directory
     */
    public long getRotationInactiveTimeout() {
        return getLong(ROTATION_INACTIVITY_CONFIG);
    }

    /**
     * Read the value configured under {@code rotation.record.count}.
     *
     * @return the maximum number of records allowed in a staged file before it is rotated to the target directory
     */
    public int getRotationRecordLimit() {
        return getInt(ROTATION_RECORD_COUNT_CONFIG);
    }

    /**
     * Read the value configured under {@code rotation.filesize}.
     *
     * @return the maximum file size of a staged file before it is rotated to the target directory
     */
    public long getRotationFilesizeLimit() {
        return getLong(ROTATION_FILE_SIZE_CONFIG);
    }

    /**
     * Read the value configured under {@code rotation.time.source}.
     *
     * @return source of the timestamp to use when processing a record
     */
    public String getRotationTimeSource() {
        return getString(ROTATION_TIME_SOURCE_CONFIG);
    }

    /**
     * Convert the value configured under {@code rotation.time.zone} into a {@link ZoneId}.
     *
     * @return the timezone ZoneId to use when applying the target directory pattern
     */
    public ZoneId getRotationTimestampZone() {
        return ZoneId.of(getString(ROTATION_TIME_ZONE_CONFIG));
    }

    /**
     * Read the value configured under {@code commit.record.count}.
     *
     * @return maximum number of records processed by a task before requesting an offset commit
     */
    public int getCommitRecordLimit() {
        return getInt(COMMIT_RECORD_COUNT_CONFIG);
    }

    /**
     * Read the value configured under {@code commit.rotated.only}.
     *
     * @return true if only offsets of records in rotated files should be committed
     */
    public boolean commitRotatedFilesOnly() {
        return getBoolean(COMMIT_ROTATED_ONLY_CONFIG);
    }

    /**
     * Create a RetryHelper from the connector-level retry settings.
     * The helper receives, in this order, the values of {@code retry.maximum.tries},
     * {@code retry.interval} and {@code retry.maximum.interval}, followed by the interval type
     * selected through {@code retry.exponential}.
     *
     * @return a configured RetryHelper object which can be used to control the retry flow.
     */
    public RetryHelper getRetryHelper() {
        final Integer maximumTries = getInt(RETRY_COUNT_CONFIG);
        final Long interval = getLong(RETRY_INTERVAL_CONFIG);
        final Long maximumInterval = getLong(RETRY_INTERVAL_MAX_CONFIG);
        // Anything other than an explicit TRUE selects the fixed interval
        final boolean exponential = Boolean.TRUE.equals(getBoolean(RETRY_EXPONENTIAL_INTERVAL_CONFIG));

        return new RetryHelper(maximumTries, interval, maximumInterval,
                exponential ? EXPONENTIAL_INTERVAL : FIXED_INTERVAL);
    }

    /**
     * Read the value configured under {@code adls.client.timeout.seconds}.
     *
     * @return the timeout in seconds for a call to the Azure DataLake Storage webservice
     */
    public int getAdlsClientTimeout() {
        return getInt(ADLS_CLIENT_TIMEOUT_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.retry.maximum.tries}.
     *
     * @return The maximum number of retries before failing a call to the Azure DataLake Storage webservice
     */
    public int getAdlsClientRetryCount() {
        return getInt(ADLS_CLIENT_RETRY_COUNT_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.retry.exponential}.
     *
     * @return true if an exponential backoff policy should be used when retrying connections to the Azure DataLake Storage webservice
     */
    public boolean getAdlsClientUseExponential() {
        return getBoolean(ADLS_CLIENT_RETRY_EXPONENTIAL_INTERVAL_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.retry.interval}.
     *
     * @return the number of milliseconds to wait between retries when connecting to the Azure DataLake Storage webservice
     */
    public long getAdlsClientRetryInterval() {
        return getLong(ADLS_CLIENT_RETRY_INTERVAL_CONFIG);
    }

    /**
     * Read the value configured under {@code adls.client.retry.maximum.interval}.
     *
     * @return the maximum number of milliseconds to wait between retries when connecting to the Azure DataLake Storage webservice. This is only used when exponential retry is enabled for the Azure DataLake Storage client.
     */
    public long getAdlsClientMaximumRetryInterval() {
        return getLong(ADLS_CLIENT_RETRY_INTERVAL_MAX_CONFIG);
    }

    /**
     * Pick the Avro codec matching the {@code compression} setting.
     * The Snappy codec is chosen when the setting equals "snappy", compared without regard to
     * case; every other value yields the null codec.
     *
     * @return the Avro compression Codec to use when writing the container files.
     */
    public CodecFactory getCodecFactory() {
        final String compression = getString(COMPRESSION_TYPE_CONFIG);
        return COMPRESSION_TYPE_SNAPPY.equalsIgnoreCase(compression)
                ? CodecFactory.snappyCodec()
                : CodecFactory.nullCodec();
    }

    /**
     * Read the value configured under {@code sync.interval}.
     *
     * @return the amount of bytes in blocks before compressing and writing to the filesystem
     */
    public int getSyncInterval(){
        return getInt(SYNC_INTERVAL_CONFIG);
    }

    /**
     * Select the client provider that matches the configured authentication method.
     *
     * @return a new provider for account key, Shared Access Signature or client secret authentication
     * @throws AdlsGen2ConfigurationException if the configured method is none of the known methods
     */
    protected ServiceClientProvider getServiceClientProvider() {
        final String authenticationMethod = getAuthenticationMethod();
        switch (authenticationMethod) {
            case ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY:
                return new AccountKeyClientProvider(this);
            case ADLS_AUTHENTICATION_METHOD_SAS_TOKEN:
                return new SharedAccessSignatureClientProvider(this);
            case ADLS_AUTHENTICATION_METHOD_CLIENT_SECRET:
                return new ClientSecretClientProvider(this);
            default:
                // Should be unreachable, the validator only accepts the three methods above
                throw new AdlsGen2ConfigurationException("Could not determine a ServiceClientProvider for authentication method " + authenticationMethod);
        }
    }

    /**
     * These settings are used for testing with a local filesystem and are not part of the published configuration options.
     * They are read straight from the original properties by getStorageProvider().
     */
    private static final String STORAGE_PROVIDER_CONFIG = "storage.provider";
    // Provider name used when the property is absent
    private static final String STORAGE_PROVIDER_ADLS = "adls";
    // Provider name that selects the local filesystem
    private static final String STORAGE_PROVIDER_LOCAL = "local";
    // Property holding the directory used by the local provider; there is no default location
    private static final String LOCAL_STORAGE_PROVIDER_CONFIG = "storage.local.location";
    private static final String LOCAL_STORAGE_PROVIDER_DEFAULT = null;


    /**
     * Create the storage provider selected by the unpublished {@code storage.provider} property.
     * The value "local" yields a provider for the directory named by
     * {@code storage.local.location}; every other value, a missing property and a null value
     * all yield the Azure DataLake Storage provider for the configured container.
     *
     * @return a configured Storage Provider to write data to the remote file system.
     * @throws AdlsGen2ConfigurationException if the local provider is selected and the location
     *                                        is not set or is not an existing directory
     */
    public StorageProvider getStorageProvider() {
        final String providerName = (String) ConfigDef.parseType(STORAGE_PROVIDER_CONFIG,
                originals().getOrDefault(STORAGE_PROVIDER_CONFIG, STORAGE_PROVIDER_ADLS),
                ConfigDef.Type.STRING);

        // Constant-first equals: a property explicitly mapped to null selects ADLS instead of
        // failing with a NullPointerException
        if (STORAGE_PROVIDER_LOCAL.equals(providerName)) {
            final String location = (String) ConfigDef.parseType(LOCAL_STORAGE_PROVIDER_CONFIG,
                    originals().getOrDefault(LOCAL_STORAGE_PROVIDER_CONFIG, LOCAL_STORAGE_PROVIDER_DEFAULT),
                    ConfigDef.Type.STRING);
            if (location == null) {
                throw new AdlsGen2ConfigurationException("No location set in config " + LOCAL_STORAGE_PROVIDER_CONFIG);
            }
            final Path target = Paths.get(location);
            // isDirectory is already false for a path that does not exist
            if (!Files.isDirectory(target)) {
                throw new AdlsGen2ConfigurationException("Local storage location " + location +
                        " does not exist or is not a directory");
            }
            return new LocalStorageProvider(location);
        }

        // "adls" and any unrecognised provider name end up here
        final DataLakeServiceClient serviceClient = getServiceClientProvider().create();
        return new AdlsGen2StorageProvider(serviceClient.getFileSystemClient(getContainerName()));
    }

    /**
     * Validates directory settings: the value must be a String, must not start or end with a
     * slash and must not contain two or more slashes in a row. The checks run on the value with
     * leading and trailing whitespace removed.
     */
    static class PathValidator implements ConfigDef.Validator {
        // Compiled once for all validator instances
        private static final Pattern CONSECUTIVE_SLASHES = Pattern.compile("\\/{2,}");
        // When true an empty (or whitespace only) value is rejected
        private final boolean requirePath;

        /**
         * @param requirePath true if an empty value must be rejected
         */
        PathValidator(boolean requirePath) {
            this.requirePath = requirePath;
        }

        /**
         * Check a single directory setting.
         *
         * @param name  the name of the configuration being validated
         * @param value the configured value
         * @throws ConfigException if the value is null, not a String, empty while a path is
         *                         required, or breaks one of the slash rules
         */
        @Override
        public void ensureValid(String name, Object value) {
            if (value == null) {
                throw new ConfigException(name, null, "Value must be non-null");
            }
            if (!(value instanceof String)) {
                throw new ConfigException(name, value, "Value must be a String");
            }

            final String trimmedValue = ((String) value).trim();
            if (requirePath && trimmedValue.isEmpty()) {
                throw new ConfigException(name, value, "Value can not be empty");
            }

            if (trimmedValue.startsWith("/") || trimmedValue.endsWith("/")) {
                throw new ConfigException(name, value, "Value must not start or end with slashes (/)");
            }

            if (CONSECUTIVE_SLASHES.matcher(trimmedValue).find()) {
                throw new ConfigException(name, value, "Value must not contain consecutive slashes  (/)");
            }
        }
    }

    /**
     * A custom configuration definition to validate properties based on the values of other properties.
     * Used for validating the required properties for the authentication fields.
     * <p>
     * After the regular per-field validation has run, the selected authentication method is
     * inspected and an error message is attached to every credential field that the method
     * needs but that has no value in the supplied properties.
     */
    static class AdlsGen2ConfigDef extends ConfigDef {

        public AdlsGen2ConfigDef() {
            super();
        }

        public AdlsGen2ConfigDef(ConfigDef config) {
            super(config);
        }

        /**
         * Runs the standard validation and then adds cross-field checks for the authentication settings.
         *
         * @param props the raw connector properties
         * @return the validation results keyed by property name, including any
         *         "field is required" errors added for the selected authentication method
         */
        @Override
        public Map<String, ConfigValue> validateAll(Map<String, String> props) {
            final Map<String, ConfigValue> validated = super.validateAll(props);

            // Add Auth Validation
            final String authMethod = (String) parseType(ADLS_AUTHENTICATION_METHOD_CONFIG,
                    props.getOrDefault(ADLS_AUTHENTICATION_METHOD_CONFIG, ADLS_AUTHENTICATION_METHOD_DEFAULT),
                    Type.STRING);
            if (authMethod == null) {
                // The key is explicitly mapped to null; switching on null would throw a
                // NullPointerException, so leave reporting to the normal field validation.
                return validated;
            }

            switch (authMethod) {
                case ADLS_AUTHENTICATION_METHOD_ACCOUNT_KEY:
                    requireField(validated, props, ADLS_ACCOUNT_NAME_CONFIG, Type.STRING, authMethod);
                    requireField(validated, props, ADLS_ACCOUNT_KEY_CONFIG, Type.PASSWORD, authMethod);
                    break;
                case ADLS_AUTHENTICATION_METHOD_CLIENT_SECRET:
                    requireField(validated, props, ADLS_CLIENT_ID_CONFIG, Type.STRING, authMethod);
                    // The tenant id is read from its own property (previously the client id was read here).
                    requireField(validated, props, ADLS_TENANT_ID_CONFIG, Type.STRING, authMethod);
                    requireField(validated, props, ADLS_CLIENT_SECRET_CONFIG, Type.PASSWORD, authMethod);
                    break;
                case ADLS_AUTHENTICATION_METHOD_SAS_TOKEN:
                    requireField(validated, props, ADLS_SAS_TOKEN_CONFIG, Type.PASSWORD, authMethod);
                    break;
                default:
                    // Eat this error scenario as it's handled by the normal field validation
            }

            return validated;
        }

        /**
         * Adds a "field is required" error to {@code validated} when {@code fieldName} has no value.
         *
         * @param validated  the validation results to append the error to; an entry is created if absent
         * @param props      the raw connector properties
         * @param fieldName  the property that must be present
         * @param fieldType  the type used to parse the property value
         * @param authMethod the authentication method value that makes the property mandatory
         */
        private static void requireField(Map<String, ConfigValue> validated,
                                         Map<String, String> props,
                                         String fieldName,
                                         Type fieldType,
                                         String authMethod) {
            if (parseType(fieldName, props.get(fieldName), fieldType) != null) {
                return;
            }
            validated.computeIfAbsent(fieldName, ConfigValue::new)
                    .errorMessages()
                    .add(String.format("Field is required when using %s with value %s",
                            ADLS_AUTHENTICATION_METHOD_CONFIG, authMethod));
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy