/*
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import static com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions.GRPC_WRITE_DEFAULT;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.HTTP_TRANSPORT_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.PROXY_ADDRESS_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.PROXY_PASSWORD_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.PROXY_USERNAME_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.getConfigKeyPrefixes;
import static com.google.common.base.Strings.nullToEmpty;
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.GcsFileChecksumType;
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.GlobAlgorithm;
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.OutputStreamType;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions.ClientType;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions.MetricsSink;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions.Fadvise;
import com.google.cloud.hadoop.gcsio.PerformanceCachingGoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.authorization.AuthorizationHandler;
import com.google.cloud.hadoop.gcsio.cooplock.CooperativeLockingOptions;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions.PartFileCleanupType;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions.PipeType;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions.UploadType;
import com.google.cloud.hadoop.util.RedactedString;
import com.google.cloud.hadoop.util.RequesterPaysOptions;
import com.google.cloud.hadoop.util.RequesterPaysOptions.RequesterPaysMode;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.flogger.GoogleLogger;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
/** This class provides a configuration for the {@link GoogleHadoopFileSystem} implementations. */
public class GoogleHadoopFileSystemConfiguration {
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();
public static final String GCS_CONFIG_PREFIX = "fs.gs";
public static final List<String> CONFIG_KEY_PREFIXES =
ImmutableList.copyOf(getConfigKeyPrefixes(GCS_CONFIG_PREFIX));
// -----------------------------------------------------------------
// Configuration settings.
// -----------------------------------------------------------------
/** Configuration key for the Cloud Storage API endpoint root URL. */
public static final HadoopConfigurationProperty<String> GCS_ROOT_URL =
new HadoopConfigurationProperty<>(
"fs.gs.storage.root.url", GoogleCloudStorageOptions.STORAGE_ROOT_URL_DEFAULT);
/** Configuration key for the Cloud Storage API endpoint service path. */
public static final HadoopConfigurationProperty<String> GCS_SERVICE_PATH =
new HadoopConfigurationProperty<>(
"fs.gs.storage.service.path", GoogleCloudStorageOptions.STORAGE_SERVICE_PATH_DEFAULT);
/**
* Key for the permissions that we report a file or directory to have. Can be either an octal or a
* symbolic mode accepted by {@link FsPermission#FsPermission(String)}.
*
* <p>Default value for the permissions that we report a file or directory to have. Note: we do
* not really support file/dir permissions, but we need to report some permission value when Hadoop
* calls getFileStatus(). A MapReduce job fails if we report permissions more relaxed than the
* value below while this is the default file system.
*/
public static final HadoopConfigurationProperty<String> PERMISSIONS_TO_REPORT =
new HadoopConfigurationProperty<>("fs.gs.reported.permissions", "700");
/**
* Configuration key for default block size of a file.
*
* <p>Note that this is the size that is reported to Hadoop FS clients. It does not modify the
* actual block size of an underlying GCS object, because the GCS JSON API does not allow modifying
* or querying the value. Modifying this value allows one to control how many mappers are used to
* process a given file.
*/
public static final HadoopConfigurationProperty<Long> BLOCK_SIZE =
new HadoopConfigurationProperty<>("fs.gs.block.size", 64 * 1024 * 1024L);
/**
* Configuration key for enabling hierarchical namespace buckets.
*
* <p>If this is enabled, the rename folder operation on a hierarchical namespace enabled bucket
* will be performed by calling the rename API.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_HIERARCHICAL_NAMESPACE_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.hierarchical.namespace.folders.enable", false);
/** Configuration key for Delegation Token binding class. Default value: none */
public static final HadoopConfigurationProperty<String> DELEGATION_TOKEN_BINDING_CLASS =
new HadoopConfigurationProperty<>("fs.gs.delegation.token.binding");
/** Configuration key for GCS project ID. Default value: none */
public static final HadoopConfigurationProperty<String> GCS_PROJECT_ID =
new HadoopConfigurationProperty<>("fs.gs.project.id");
/** Configuration key for initial working directory of a GHFS instance. Default value: '/' */
public static final HadoopConfigurationProperty<String> GCS_WORKING_DIRECTORY =
new HadoopConfigurationProperty<>("fs.gs.working.dir", "/");
/**
* If true, recursive delete on a path that refers to a GCS bucket itself ('/' for any
* bucket-rooted GoogleHadoopFileSystem) or delete on that path when it's empty will result in
* fully deleting the GCS bucket. If false, any operation that normally would have deleted the
* bucket will be ignored instead. Setting to 'false' preserves the typical behavior of "rm -rf /"
* which translates to deleting everything inside of root, but without clobbering the filesystem
* authority corresponding to that root path in the process.
*/
public static final HadoopConfigurationProperty<Boolean> GCE_BUCKET_DELETE_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.bucket.delete.enable", false);
/** Configuration key for the GCS Requester Pays mode. Default value: "DISABLED" */
public static final HadoopConfigurationProperty<RequesterPaysMode> GCS_REQUESTER_PAYS_MODE =
new HadoopConfigurationProperty<>(
"fs.gs.requester.pays.mode", RequesterPaysOptions.REQUESTER_PAYS_MODE_DEFAULT);
/** Configuration key for GCS Requester Pays Project ID. Default value: none */
public static final HadoopConfigurationProperty<String> GCS_REQUESTER_PAYS_PROJECT_ID =
new HadoopConfigurationProperty<>("fs.gs.requester.pays.project.id");
/** Configuration key for GCS Requester Pays Buckets. Default value: none */
public static final HadoopConfigurationProperty<List<String>> GCS_REQUESTER_PAYS_BUCKETS =
new HadoopConfigurationProperty<>("fs.gs.requester.pays.buckets", ImmutableList.of());
/**
* Configuration key for which type of FileChecksum to return; if a particular file doesn't
* support the requested type, then getFileChecksum() will return null for that file.
*/
public static final HadoopConfigurationProperty<GcsFileChecksumType> GCS_FILE_CHECKSUM_TYPE =
new HadoopConfigurationProperty<>("fs.gs.checksum.type", GcsFileChecksumType.NONE);
/**
* Configuration key for using a local item cache to supplement GCS API "getFile" results. This
* provides faster access to recently queried data. Because the data is cached, modifications made
* outside of this instance may not be immediately reflected. The performance cache can be used in
* conjunction with other caching options.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_PERFORMANCE_CACHE_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.performance.cache.enable", false);
/**
* Configuration key for maximum number of milliseconds a GoogleCloudStorageItemInfo will remain
* "valid" in the performance cache before it's invalidated.
*/
public static final HadoopConfigurationProperty<Long> GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS =
new HadoopConfigurationProperty<>(
"fs.gs.performance.cache.max.entry.age.ms",
PerformanceCachingGoogleCloudStorageOptions.MAX_ENTRY_AGE_MILLIS_DEFAULT);
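// A minimal sketch enabling the performance cache with a shorter entry TTL; the
// 5-second value is illustrative only:
//
//   Configuration conf = new Configuration();
//   conf.setBoolean("fs.gs.performance.cache.enable", true);
//   conf.setLong("fs.gs.performance.cache.max.entry.age.ms", 5_000L);
//   // Repeated getFileStatus()/listStatus() calls within the TTL can be served from
//   // the cache, at the cost of possibly stale metadata.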
/**
* If true, executes GCS requests in {@code listStatus} and {@code getFileStatus} methods in
* parallel to reduce latency.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_STATUS_PARALLEL_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.status.parallel.enable", true);
/** Configuration key for enabling lazy initialization of GCS FS instance. */
public static final HadoopConfigurationProperty<Boolean> GCS_LAZY_INITIALIZATION_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.lazy.init.enable", false);
/**
* Configuration key for enabling automatic repair of implicit directories whenever detected
* inside delete and rename calls.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_REPAIR_IMPLICIT_DIRECTORIES_ENABLE =
new HadoopConfigurationProperty<>(
"fs.gs.implicit.dir.repair.enable",
GoogleCloudStorageOptions.AUTO_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT);
/**
* Configuration key for enabling a check to ensure that conflicting directories do not exist when
* creating files and conflicting files do not exist when creating directories.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_CREATE_ITEMS_CONFLICT_CHECK_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.create.items.conflict.check.enable", true);
/** Configuration key for customizing glob search algorithm. */
public static final HadoopConfigurationProperty<GlobAlgorithm> GCS_GLOB_ALGORITHM =
new HadoopConfigurationProperty<>("fs.gs.glob.algorithm", GlobAlgorithm.CONCURRENT);
/** Configuration key for marker file pattern. Default value: none */
public static final HadoopConfigurationProperty<String> GCS_MARKER_FILE_PATTERN =
new HadoopConfigurationProperty<>("fs.gs.marker.file.pattern");
/** Configuration key for the maximum number of GCS RPCs in a single batch request. */
public static final HadoopConfigurationProperty<Long> GCS_MAX_REQUESTS_PER_BATCH =
new HadoopConfigurationProperty<>("fs.gs.max.requests.per.batch", 15L);
/** Configuration key for the number of threads used to execute batch requests. */
public static final HadoopConfigurationProperty<Integer> GCS_BATCH_THREADS =
new HadoopConfigurationProperty<>("fs.gs.batch.threads", 15);
/**
* Configuration key for enabling the use of Rewrite requests for copy operations. A Rewrite
* request has the same effect as a Copy request, but it can handle moving large objects that may
* potentially time out a Copy request.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_COPY_WITH_REWRITE_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.copy.with.rewrite.enable", true);
/**
* Configuration key for specifying max number of bytes rewritten in a single rewrite request when
* fs.gs.copy.with.rewrite.enable is set to 'true'.
*/
public static final HadoopConfigurationProperty<Long> GCS_REWRITE_MAX_BYTES_PER_CALL =
new HadoopConfigurationProperty<>("fs.gs.rewrite.max.bytes.per.call", 512 * 1024 * 1024L);
/** Configuration key for the number of items to return per call to the list* GCS RPCs. */
public static final HadoopConfigurationProperty<Long> GCS_MAX_LIST_ITEMS_PER_CALL =
new HadoopConfigurationProperty<>("fs.gs.list.max.items.per.call", 5000L);
/**
* Configuration key for the max number of retries for failed HTTP requests to GCS. Note that the
* connector will retry *up to* the number of times specified, using a default ExponentialBackOff
* strategy.
*
* <p>Also note that this number only controls the number of retries in the low-level HTTP request
* implementation.
*/
public static final HadoopConfigurationProperty<Integer> GCS_HTTP_MAX_RETRY =
new HadoopConfigurationProperty<>("fs.gs.http.max.retry", 10);
/** Configuration key for the connect timeout (in milliseconds) for HTTP requests to GCS. */
public static final HadoopConfigurationProperty<Integer> GCS_HTTP_CONNECT_TIMEOUT =
new HadoopConfigurationProperty<>("fs.gs.http.connect-timeout", 20 * 1000);
/** Configuration key for the read timeout (in milliseconds) for HTTP requests to GCS. */
public static final HadoopConfigurationProperty<Integer> GCS_HTTP_READ_TIMEOUT =
new HadoopConfigurationProperty<>("fs.gs.http.read-timeout", 20 * 1000);
/** Configuration key for adding a suffix to the GHFS application name sent to GCS. */
public static final HadoopConfigurationProperty<String> GCS_APPLICATION_NAME_SUFFIX =
new HadoopConfigurationProperty<>("fs.gs.application.name.suffix", "");
/**
* Configuration key for modifying the maximum amount of time to wait for empty object creation.
*/
public static final HadoopConfigurationProperty<Integer> GCS_MAX_WAIT_MILLIS_EMPTY_OBJECT_CREATE =
new HadoopConfigurationProperty<>("fs.gs.max.wait.for.empty.object.creation.ms", 3_000);
/**
* Configuration key for which type of output stream to use; different options may have different
* degrees of support for advanced features like {@code hsync()} and different performance
* characteristics. Options:
*
* <p>BASIC: Stream is the closest analogue to a direct wrapper around the low-level HTTP stream
* into GCS.
*
* <p>SYNCABLE_COMPOSITE: Stream behaves similarly to BASIC when used with basic create/write/close
* patterns, but supports hsync() by creating discrete temporary GCS objects which are composed
* onto the destination object.
*
* <p>FLUSHABLE_COMPOSITE: Stream behaves similarly to SYNCABLE_COMPOSITE, except hflush() is also
* supported. It uses the same implementation as hsync().
*/
public static final HadoopConfigurationProperty<OutputStreamType> GCS_OUTPUT_STREAM_TYPE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.type", OutputStreamType.BASIC);
/** Configuration key for setting write buffer size. */
public static final HadoopConfigurationProperty<Integer> GCS_OUTPUT_STREAM_BUFFER_SIZE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.buffer.size", 8 * 1024 * 1024);
/** Configuration key for setting pipe buffer size. */
public static final HadoopConfigurationProperty<Integer> GCS_OUTPUT_STREAM_PIPE_BUFFER_SIZE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.pipe.buffer.size", 1024 * 1024);
/** Configuration key for setting pipe type. */
public static final HadoopConfigurationProperty<PipeType> GCS_OUTPUT_STREAM_PIPE_TYPE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.pipe.type", PipeType.IO_STREAM_PIPE);
/** Configuration key for setting GCS upload chunk size. */
// chunk size etc. Get the following value from GCSWC class in a better way. For now, we hard code
// it to a known good value.
public static final HadoopConfigurationProperty<Integer> GCS_OUTPUT_STREAM_UPLOAD_CHUNK_SIZE =
new HadoopConfigurationProperty<>(
"fs.gs.outputstream.upload.chunk.size", 64 * 1024 * 1024, "fs.gs.io.buffersize.write");
/** Configuration key for setting GCS upload cache size. */
public static final HadoopConfigurationProperty<Integer> GCS_OUTPUT_STREAM_UPLOAD_CACHE_SIZE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.upload.cache.size", 0);
/** Configuration key for enabling GCS direct upload. */
public static final HadoopConfigurationProperty<Boolean> GCS_OUTPUT_STREAM_DIRECT_UPLOAD_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.outputstream.direct.upload.enable", false);
/**
* Configuration key for the minimum time interval between consecutive sync/hsync/hflush calls.
*/
public static final HadoopConfigurationProperty<Integer> GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL_MS =
new HadoopConfigurationProperty<>("fs.gs.outputstream.sync.min.interval.ms", 0);
/**
* If {@code true}, on opening a file we will proactively perform a metadata {@code GET} to check
* whether the object exists, even though the underlying channel will not open a data stream until
* {@code read()} is actually called. This is necessary to technically match the expected behavior
* of Hadoop filesystems, but incurs an extra latency overhead on {@code open()}. If the calling
* code can handle late failures on not-found errors, or has independently already ensured that a
* file exists before calling {@code open()}, then you can set this to {@code false} for more
* efficient reads.
*
* <p>Note that this is known not to work with YARN {@code CommonNodeLabelsManager} and potentially
* other Hadoop components. That is why it's not recommended to set this property to {@code false}
* cluster-wide; instead, set it for a specific job/application that is compatible with it.
*/
public static final HadoopConfigurationProperty<Boolean>
GCS_INPUT_STREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE =
new HadoopConfigurationProperty<>(
"fs.gs.inputstream.fast.fail.on.not.found.enable", true);
/**
* If false, reading a file with GZIP content encoding (HTTP header "Content-Encoding: gzip") will
* result in failure (IOException is thrown).
*/
public static final HadoopConfigurationProperty<Boolean>
GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE =
new HadoopConfigurationProperty<>(
"fs.gs.inputstream.support.gzip.encoding.enable", false);
/**
* If forward seeks are within this many bytes of the current position, seeks are performed by
* reading and discarding bytes in-place rather than opening a new underlying stream.
*/
public static final HadoopConfigurationProperty<Long> GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT =
new HadoopConfigurationProperty<>("fs.gs.inputstream.inplace.seek.limit", 8 * 1024 * 1024L);
/** Tunes object read behavior to optimize HTTP GET requests for various use cases. */
public static final HadoopConfigurationProperty<Fadvise> GCS_INPUT_STREAM_FADVISE =
new HadoopConfigurationProperty<>("fs.gs.inputstream.fadvise", Fadvise.AUTO);
/**
* Minimum size in bytes of the HTTP Range header set in a GCS request when opening a new stream
* to read an object.
*/
public static final HadoopConfigurationProperty<Integer> GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE =
new HadoopConfigurationProperty<>(
"fs.gs.inputstream.min.range.request.size",
GoogleCloudStorageReadOptions.DEFAULT_MIN_RANGE_REQUEST_SIZE);
/** Configuration key for enabling use of the gRPC API for read/write. */
public static final HadoopConfigurationProperty<Boolean> GCS_GRPC_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.grpc.enable", false);
/** Configuration key for enabling checksum validation for the gRPC API. */
public static final HadoopConfigurationProperty<Boolean> GCS_GRPC_CHECKSUMS_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.grpc.checksums.enable", false);
/** Configuration key for the check interval (in milliseconds) for gRPC request timeouts to GCS. */
public static final HadoopConfigurationProperty<Long> GCS_GRPC_CHECK_INTERVAL_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.checkinterval.timeout.ms", 1_000L);
/**
* Configuration key for the connection timeout (in milliseconds) for gRPC read requests to GCS.
*/
public static final HadoopConfigurationProperty<Long> GCS_GRPC_READ_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.read.timeout.ms", 3600 * 1000L);
/** Configuration key for the message timeout (in milliseconds) for gRPC read requests to GCS. */
public static final HadoopConfigurationProperty<Long> GCS_GRPC_READ_MESSAGE_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.read.message.timeout.ms", 3 * 1_000L);
/**
* Configuration key for the connection timeout (in milliseconds) for gRPC metadata requests to
* GCS.
*/
public static final HadoopConfigurationProperty<Long> GCS_GRPC_READ_METADATA_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.read.metadata.timeout.ms", 60 * 1000L);
/** Configuration key for enabling the zero-copy deserializer for the gRPC API. */
public static final HadoopConfigurationProperty<Boolean> GCS_GRPC_READ_ZEROCOPY_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.grpc.read.zerocopy.enable", true);
/** Configuration key for the number of requests to be buffered for uploads to GCS. */
public static final HadoopConfigurationProperty<Long> GCS_GRPC_UPLOAD_BUFFERED_REQUESTS =
new HadoopConfigurationProperty<>("fs.gs.grpc.write.buffered.requests", 20L);
/** Configuration key for the connect timeout (in milliseconds) for gRPC write requests to GCS. */
public static final HadoopConfigurationProperty<Long> GCS_GRPC_WRITE_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.write.timeout.ms", 10 * 60 * 1000L);
/** Configuration key for the message timeout (in milliseconds) for gRPC write requests to GCS. */
public static final HadoopConfigurationProperty<Long> GCS_GRPC_WRITE_MESSAGE_TIMEOUT_MS =
new HadoopConfigurationProperty<>("fs.gs.grpc.write.message.timeout.ms", 3 * 1_000L);
/** Configuration key for enabling use of directpath gRPC API for read/write. */
public static final HadoopConfigurationProperty<Boolean> GCS_GRPC_DIRECTPATH_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.grpc.directpath.enable", true);
/** Configuration key for enabling use of traffic director gRPC API for read/write. */
public static final HadoopConfigurationProperty<Boolean> GCS_GRPC_TRAFFICDIRECTOR_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.grpc.trafficdirector.enable", true);
/**
* Configuration key for using cooperative locking to achieve isolation of directory mutation
* operations.
*/
public static final HadoopConfigurationProperty<Boolean> GCS_COOPERATIVE_LOCKING_ENABLE =
new HadoopConfigurationProperty<>("fs.gs.cooperative.locking.enable", false);
/** Configuration key for lock expiration when using cooperative locking. */
public static final HadoopConfigurationProperty<Long>
GCS_COOPERATIVE_LOCKING_EXPIRATION_TIMEOUT_MS =
new HadoopConfigurationProperty<>(
"fs.gs.cooperative.locking.expiration.timeout.ms",
CooperativeLockingOptions.LOCK_EXPIRATION_TIMEOUT_MS_DEFAULT);
/** Configuration key for maximum allowed concurrent operations when using cooperative locking. */
public static final HadoopConfigurationProperty<Integer>
GCS_COOPERATIVE_LOCKING_MAX_CONCURRENT_OPERATIONS =
new HadoopConfigurationProperty<>(
"fs.gs.cooperative.locking.max.concurrent.operations",
CooperativeLockingOptions.MAX_CONCURRENT_OPERATIONS_DEFAULT);
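// A minimal sketch enabling cooperative locking together with its tuning knobs;
// the values are illustrative only:
//
//   Configuration conf = new Configuration();
//   conf.setBoolean("fs.gs.cooperative.locking.enable", true);
//   conf.setLong("fs.gs.cooperative.locking.expiration.timeout.ms", 120_000L);
//   conf.setInt("fs.gs.cooperative.locking.max.concurrent.operations", 20);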
/** Configuration key for the headers for HTTP requests to GCS. */
public static final HadoopConfigurationProperty<Map<String, String>>