/*
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import static com.google.cloud.hadoop.util.RequesterPaysOptions.REQUESTER_PAYS_MODE_DEFAULT;
import com.google.api.client.auth.oauth2.Credential;
import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.FileInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageItemInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions.Fadvise;
import com.google.cloud.hadoop.gcsio.PathCodec;
import com.google.cloud.hadoop.gcsio.PerformanceCachingGoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.cloud.hadoop.util.AccessTokenProvider;
import com.google.cloud.hadoop.util.AccessTokenProviderClassFromConfigFactory;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions;
import com.google.cloud.hadoop.util.CredentialFactory;
import com.google.cloud.hadoop.util.CredentialFromAccessTokenProviderClassFactory;
import com.google.cloud.hadoop.util.EntriesCredentialConfiguration;
import com.google.cloud.hadoop.util.HadoopCredentialConfiguration;
import com.google.cloud.hadoop.util.HadoopVersionInfo;
import com.google.cloud.hadoop.util.HttpTransportFactory;
import com.google.cloud.hadoop.util.PropertyUtil;
import com.google.cloud.hadoop.util.RequesterPaysOptions;
import com.google.cloud.hadoop.util.RequesterPaysOptions.RequesterPaysMode;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.file.DirectoryNotEmptyException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobPattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class provides a Hadoop compatible File System on top of Google Cloud Storage (GCS).
*
* <p>It is implemented as a thin abstraction layer on top of GCS. The layer hides any specific
* characteristics of the underlying store and exposes FileSystem interface understood by the Hadoop
* engine.
*
* <p>Users interact with the files in the storage using fully qualified URIs. The file system
* exposed by this class is identified using the 'gs' scheme. For example, {@code
* gs://dir1/dir2/file1.txt}.
*
* <p>This implementation translates paths between hadoop Path and GCS URI with the convention that
* the Hadoop root directly corresponds to the GCS "root", e.g. gs:/. This is convenient for many
* reasons, such as data portability and close equivalence to gsutil paths, but imposes certain
* inherited constraints, such as files not being allowed in root (only 'directories' can be placed
* in root), and directory names inside root have a more limited set of allowed characters.
*
* <p>One of the main goals of this implementation is to maintain compatibility with the behavior
* of the HDFS implementation when accessed through the FileSystem interface. The HDFS
* implementation is not very consistent about the cases when it throws versus the cases when
* methods return false. We run GHFS tests and HDFS tests against the same test data and use that
* as a guide to decide whether to throw or to return false.
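*
* <p>Illustrative usage sketch (bucket and object names are hypothetical; assumes the connector
* is on the classpath and credentials are configured):
*
* <pre>{@code
* Configuration conf = new Configuration();
* Path file = new Path("gs://my-bucket/dir/file.txt");
* FileSystem fs = file.getFileSystem(conf);
* try (FSDataInputStream in = fs.open(file)) {
*   // read data ...
* }
* }</pre>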
*/
public abstract class GoogleHadoopFileSystemBase extends GoogleHadoopFileSystemBaseSpecific
implements FileSystemDescriptor {
/** Logger. */
public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystemBase.class);
/** Default value of replication factor. */
public static final short REPLICATION_FACTOR_DEFAULT = 3;
/** Splitter for list values stored in a single configuration value */
private static final Splitter CONFIGURATION_SPLITTER = Splitter.on(',');
// -----------------------------------------------------------------
// Configuration settings.
// -----------------------------------------------------------------
/**
* Key for the permissions that we report a file or directory to have. Can either be octal or
* symbolic mode accepted by {@link FsPermission#FsPermission(String)}
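*
* <p>Illustrative (hypothetical value; octal, or a symbolic mode such as "rwxr-xr-x"):
*
* <pre>{@code
* conf.set(PERMISSIONS_TO_REPORT_KEY, "777");
* }</pre>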
*/
public static final String PERMISSIONS_TO_REPORT_KEY = "fs.gs.reported.permissions";
/**
* Default value for the permissions that we report a file or directory to have. Note: We do not
* really support file/dir permissions but we need to report some permission value when Hadoop
* calls getFileStatus(). A MapReduce job fails if we report permissions more relaxed than the
* value below and this is the default File System.
*/
public static final String PERMISSIONS_TO_REPORT_DEFAULT = "700";
/** Configuration key for setting IO buffer size. */
// TODO(user): rename the following to indicate that it is read buffer size.
public static final String BUFFERSIZE_KEY = "fs.gs.io.buffersize";
/**
* Hadoop passes 4096 bytes as buffer size, which causes poor performance. Default value of {@link
* #BUFFERSIZE_KEY}.
*/
public static final int BUFFERSIZE_DEFAULT = 8 * 1024 * 1024;
/** Configuration key for setting write buffer size. */
public static final String WRITE_BUFFERSIZE_KEY = "fs.gs.io.buffersize.write";
/** Default value of {@link #WRITE_BUFFERSIZE_KEY}. */
// Chunk size, etc.: get the following value from the GCSWC class in a better way. For now, we
// hard-code it to a known good value.
public static final int WRITE_BUFFERSIZE_DEFAULT = 64 * 1024 * 1024;
/** Configuration key for default block size of a file. */
public static final String BLOCK_SIZE_KEY = "fs.gs.block.size";
/** Default value of {@link #BLOCK_SIZE_KEY}. */
public static final int BLOCK_SIZE_DEFAULT = 64 * 1024 * 1024;
/** Prefix to use for common authentication keys. */
public static final String AUTHENTICATION_PREFIX = "fs.gs";
/**
* Configuration key for enabling GCE service account authentication. This key is deprecated. See
* {@link HadoopCredentialConfiguration} for current key names.
*/
public static final String ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY =
"fs.gs.enable.service.account.auth";
/**
* Configuration key specifying the email address of the service-account with which to
* authenticate. Only required if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} is true AND we're
* using fs.gs.service.account.auth.keyfile to authenticate with a private keyfile. NB: Once GCE
* supports setting multiple service account email addresses for metadata auth, this key will also
* be used in the metadata auth flow. This key is deprecated. See {@link
* HadoopCredentialConfiguration} for current key names.
*/
public static final String SERVICE_ACCOUNT_AUTH_EMAIL_KEY = "fs.gs.service.account.auth.email";
/**
* Configuration key specifying local file containing a service-account private .p12 keyfile. Only
* used if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} is true; if provided, the keyfile will be
* used for service-account authentication. Otherwise, it is assumed that we are on a GCE VM with
* metadata-authentication for service-accounts enabled, and the metadata server will be used
* instead. Default value: none. This key is deprecated. See {@link HadoopCredentialConfiguration}
* for current key names.
*/
public static final String SERVICE_ACCOUNT_AUTH_KEYFILE_KEY =
"fs.gs.service.account.auth.keyfile";
/** Configuration key for GCS project ID. Default value: none */
public static final String GCS_PROJECT_ID_KEY = "fs.gs.project.id";
/** Configuration key for GCS Requester Pays mode. Default value: "DISABLED" */
public static final String GCS_REQUESTER_PAYS_MODE_KEY = "fs.gs.requester.pays.mode";
/** Configuration key for GCS Requester Pays Project ID. Default value: none */
public static final String GCS_REQUESTER_PAYS_PROJECT_ID_KEY = "fs.gs.requester.pays.project.id";
/** Configuration key for GCS Requester Pays Buckets. Default value: none */
public static final String GCS_REQUESTER_PAYS_BUCKETS_KEY = "fs.gs.requester.pays.buckets";
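// Illustrative requester-pays setup (hypothetical project/bucket names; the mode string is
// assumed to name a RequesterPaysOptions.RequesterPaysMode constant):
//   conf.set(GCS_REQUESTER_PAYS_MODE_KEY, "CUSTOM");
//   conf.set(GCS_REQUESTER_PAYS_PROJECT_ID_KEY, "my-billing-project");
//   conf.set(GCS_REQUESTER_PAYS_BUCKETS_KEY, "bucket-a,bucket-b");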
/**
* Configuration key for GCS client ID. Required if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY}
* == false. Default value: none. This key is deprecated. See {@link HadoopCredentialConfiguration}
* for current key names.
*/
public static final String GCS_CLIENT_ID_KEY = "fs.gs.client.id";
/**
* Configuration key for GCS client secret. Required if {@link
* #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} == false. Default value: none. This key is deprecated. See
* {@link HadoopCredentialConfiguration} for current key names.
*/
public static final String GCS_CLIENT_SECRET_KEY = "fs.gs.client.secret";
/**
* Configuration key for the system bucket name. It is a fallback for the rootBucket of
* GoogleHadoopFileSystem in gs:///path URIs. Default value: none. This key is deprecated. Always
* initialize the FileSystem with a bucket.
*/
public static final String GCS_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket";
/**
* Configuration key for flag to indicate whether system bucket should be created if it does not
* exist. This key is deprecated. See {@link #GCS_SYSTEM_BUCKET_KEY}.
*/
public static final String GCS_CREATE_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket.create";
/** Default value of {@link #GCS_CREATE_SYSTEM_BUCKET_KEY}. */
public static final boolean GCS_CREATE_SYSTEM_BUCKET_DEFAULT = true;
/** Configuration key for initial working directory of a GHFS instance. Default value: '/' */
public static final String GCS_WORKING_DIRECTORY_KEY = "fs.gs.working.dir";
/**
* Configuration key for setting 250GB upper limit on file size to gain higher write throughput.
*/
// TODO(user): remove it once blobstore supports high throughput without limiting size.
public static final String GCS_FILE_SIZE_LIMIT_250GB = "fs.gs.file.size.limit.250gb";
/** Default value of {@link #GCS_FILE_SIZE_LIMIT_250GB}. */
public static final boolean GCS_FILE_SIZE_LIMIT_250GB_DEFAULT = false;
/** Configuration key for marker file pattern. Default value: none */
public static final String GCS_MARKER_FILE_PATTERN_KEY = "fs.gs.marker.file.pattern";
/**
* Configuration key for using a local item cache to supplement GCS API "getFile" results. This
* provides faster access to recently queried data. Because the data is cached, modifications made
* outside of this instance may not be immediately reflected. The performance cache can be used in
* conjunction with other caching options.
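*
* <p>Illustrative sketch (values are examples):
*
* <pre>{@code
* conf.setBoolean(GCS_ENABLE_PERFORMANCE_CACHE_KEY, true);
* conf.setLong(GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY, 5_000);
* }</pre>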
*/
public static final String GCS_ENABLE_PERFORMANCE_CACHE_KEY = "fs.gs.performance.cache.enable";
/** Default value for {@link #GCS_ENABLE_PERFORMANCE_CACHE_KEY}. */
public static final boolean GCS_ENABLE_PERFORMANCE_CACHE_DEFAULT = false;
/**
* Configuration key for maximum number of milliseconds a GoogleCloudStorageItemInfo will remain
* "valid" in the performance cache before it's invalidated.
*/
public static final String GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY =
"fs.gs.performance.cache.max.entry.age.ms";
/** Default value for {@link #GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY}. */
public static final long GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_DEFAULT =
PerformanceCachingGoogleCloudStorageOptions.MAX_ENTRY_AGE_MILLIS_DEFAULT;
/** Configuration key for whether or not to enable list caching for the performance cache. */
public static final String GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_KEY =
"fs.gs.performance.cache.list.caching.enable";
/** Default value for {@link #GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_KEY}. */
public static final boolean GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_DEFAULT =
PerformanceCachingGoogleCloudStorageOptions.LIST_CACHING_ENABLED;
/**
* Configuration key for whether or not we should update timestamps for parent directories when we
* create new files in them.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY =
"fs.gs.parent.timestamp.update.enable";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY}. */
public static final boolean GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT = true;
/**
* Configuration key containing a comma-separated list of sub-strings that when matched will cause
* a particular directory to not have its modification timestamp updated. Includes take precedence
* over excludes.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.excludes";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY}. */
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT = "/";
/** Configuration key for the MR intermediate done dir. */
public static final String MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY =
"mapreduce.jobhistory.intermediate-done-dir";
/** Configuration key of the MR done directory. */
public static final String MR_JOB_HISTORY_DONE_DIR_KEY = "mapreduce.jobhistory.done-dir";
/**
* Configuration key containing a comma-separated list of sub-strings that when matched will cause
* a particular directory to have its modification timestamp updated. Includes take precedence
* over excludes.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.includes";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY}. */
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT =
String.format(
"${%s},${%s}", MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY, MR_JOB_HISTORY_DONE_DIR_KEY);
/**
* Configuration key for enabling automatic repair of implicit directories whenever detected
* inside listStatus and globStatus calls, or other methods which may indirectly call listStatus
* and/or globStatus.
*/
public static final String GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.repair.enable";
/** Default value for {@link #GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY}. */
public static final boolean GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT = true;
/** Configuration key for changing the path codec from legacy to 'uri path encoding'. */
public static final String PATH_CODEC_KEY = "fs.gs.path.encoding";
/** Use new URI_ENCODED_PATH_CODEC. */
public static final String PATH_CODEC_USE_URI_ENCODING = "uri-path";
/** Use LEGACY_PATH_CODEC. */
public static final String PATH_CODEC_USE_LEGACY_ENCODING = "legacy";
/** Use the default path codec. */
public static final String PATH_CODEC_DEFAULT = PATH_CODEC_USE_LEGACY_ENCODING;
/**
* Instance value of {@link #GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY} based on the initial
* Configuration.
*/
private boolean enableAutoRepairImplicitDirectories =
GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT;
/**
* Configuration key for enabling automatic inference of implicit directories. If set, we create
* and return in-memory directory objects on the fly when no backing object exists, but we know
* there are files with the same prefix. The ENABLE_REPAIR flag takes precedence over this flag:
* if both are set, the repair is attempted, and only if it fails does the setting of this flag
* kick in.
*/
public static final String GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.infer.enable";
/** Default value for {@link #GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY}. */
public static final boolean GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT = true;
/**
* Instance value of {@link #GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY} based on the initial
* Configuration.
*/
private boolean enableInferImplicitDirectories = GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT;
/**
* Configuration key for enabling the use of a large flat listing to pre-populate possible glob
* matches in a single API call before running the core globbing logic in-memory rather than
* sequentially and recursively performing API calls.
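*
* <p>Illustrative: a glob such as the following (hypothetical path) can then be answered from one
* flat listing instead of many recursive list calls:
*
* <pre>{@code
* FileStatus[] parts = fs.globStatus(new Path("gs://bucket/logs/2018-*/part-*"));
* }</pre>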
*/
public static final String GCS_ENABLE_FLAT_GLOB_KEY = "fs.gs.glob.flatlist.enable";
/** Default value for {@link #GCS_ENABLE_FLAT_GLOB_KEY}. */
public static final boolean GCS_ENABLE_FLAT_GLOB_DEFAULT = true;
/**
* Configuration key for enabling the use of marker files during file creation. When running
* non-MR applications that make use of the FileSystem, it is a good idea to enable marker files
* to better mimic HDFS overwrite and locking behavior.
*/
public static final String GCS_ENABLE_MARKER_FILE_CREATION_KEY =
"fs.gs.create.marker.files.enable";
/** Default value for {@link #GCS_ENABLE_MARKER_FILE_CREATION_KEY}. */
public static final boolean GCS_ENABLE_MARKER_FILE_CREATION_DEFAULT = false;
/**
* Configuration key for enabling the use of Rewrite requests for copy operations. Rewrite request
* has the same effect as Copy request, but it can handle moving large objects that may
* potentially timeout a Copy request.
*/
public static final String GCS_ENABLE_COPY_WITH_REWRITE_KEY = "fs.gs.copy.with.rewrite.enable";
/** Default value for {@link #GCS_ENABLE_COPY_WITH_REWRITE_KEY}. */
public static final boolean GCS_ENABLE_COPY_WITH_REWRITE_DEFAULT = false;
/** Configuration key for number of items to return per call to the list* GCS RPCs. */
public static final String GCS_MAX_LIST_ITEMS_PER_CALL = "fs.gs.list.max.items.per.call";
/** Default value for {@link #GCS_MAX_LIST_ITEMS_PER_CALL}. */
public static final long GCS_MAX_LIST_ITEMS_PER_CALL_DEFAULT = 1024;
/** Configuration key for a max number of GCS RPCs in batch request. */
public static final String GCS_MAX_REQUESTS_PER_BATCH = "fs.gs.max.requests.per.batch";
/** Default value for {@link #GCS_MAX_REQUESTS_PER_BATCH}. */
public static final long GCS_MAX_REQUESTS_PER_BATCH_DEFAULT =
GoogleCloudStorageOptions.MAX_REQUESTS_PER_BATCH_DEFAULT;
/**
* Configuration key for the max number of retries for failed HTTP request to GCS. Note that the
* connector will retry *up to* the number of times as specified, using a default
* ExponentialBackOff strategy.
*
* <p>Also, note that this number will only control the number of retries in the low level HTTP
* request implementation.
*/
public static final String GCS_HTTP_MAX_RETRY_KEY = "fs.gs.http.max.retry";
/** Default value for {@link #GCS_HTTP_MAX_RETRY_KEY}. */
public static final int GCS_HTTP_MAX_RETRY_DEFAULT = 10;
/** Configuration key for the connect timeout (in milliseconds) for HTTP requests to GCS. */
public static final String GCS_HTTP_CONNECT_TIMEOUT_KEY = "fs.gs.http.connect-timeout";
/** Default value for {@link #GCS_HTTP_CONNECT_TIMEOUT_KEY}. */
public static final int GCS_HTTP_CONNECT_TIMEOUT_DEFAULT = 20 * 1000;
/** Configuration key for the read timeout (in milliseconds) for HTTP requests to GCS. */
public static final String GCS_HTTP_READ_TIMEOUT_KEY = "fs.gs.http.read-timeout";
/** Default value for {@link #GCS_HTTP_READ_TIMEOUT_KEY}. */
public static final int GCS_HTTP_READ_TIMEOUT_DEFAULT = 20 * 1000;
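// Illustrative HTTP tuning sketch (values are examples, not recommendations):
//   conf.setInt(GCS_HTTP_MAX_RETRY_KEY, 10);
//   conf.setInt(GCS_HTTP_CONNECT_TIMEOUT_KEY, 20 * 1000);
//   conf.setInt(GCS_HTTP_READ_TIMEOUT_KEY, 20 * 1000);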
/**
* Configuration key for setting a proxy for the connector to use to connect to GCS. The proxy
* must be an HTTP proxy of the form "host:port".
*/
public static final String GCS_PROXY_ADDRESS_KEY =
EntriesCredentialConfiguration.PROXY_ADDRESS_KEY;
/** Default to no proxy. */
public static final String GCS_PROXY_ADDRESS_DEFAULT =
EntriesCredentialConfiguration.PROXY_ADDRESS_DEFAULT;
/**
* Configuration key for the name of HttpTransport class to use for connecting to GCS. Must be the
* name of an HttpTransportFactory.HttpTransportType (APACHE or JAVA_NET).
*/
public static final String GCS_HTTP_TRANSPORT_KEY =
EntriesCredentialConfiguration.HTTP_TRANSPORT_KEY;
/** Default to the default specified in HttpTransportFactory. */
public static final String GCS_HTTP_TRANSPORT_DEFAULT =
EntriesCredentialConfiguration.HTTP_TRANSPORT_DEFAULT;
/** Configuration key for adding a suffix to the GHFS application name sent to GCS. */
public static final String GCS_APPLICATION_NAME_SUFFIX_KEY = "fs.gs.application.name.suffix";
/** Default suffix to add to the application name. */
public static final String GCS_APPLICATION_NAME_SUFFIX_DEFAULT = "";
/**
* Configuration key for modifying the maximum amount of time to wait for empty object creation.
*/
public static final String GCS_MAX_WAIT_MILLIS_EMPTY_OBJECT_CREATE_KEY =
"fs.gs.max.wait.for.empty.object.creation.ms";
/** Default to 3 seconds. */
public static final int GCS_MAX_WAIT_MILLIS_EMPTY_OBJECT_CREATE_DEFAULT = 3_000;
/**
* Configuration key for which type of output stream to use; different options may have different
* degrees of support for advanced features like hsync() and different performance
* characteristics. Options:
*
* <p>BASIC: Stream is closest analogue to direct wrapper around low-level HTTP stream into GCS.
*
* <p>SYNCABLE_COMPOSITE: Stream behaves similarly to BASIC when used with basic
* create/write/close patterns, but supports hsync() by creating discrete temporary GCS objects
* which are composed onto the destination object. Has a hard upper limit of number of components
* which can be composed onto the destination object.
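*
* <p>Illustrative sketch (bucket name and payload are hypothetical):
*
* <pre>{@code
* conf.set(GCS_OUTPUTSTREAM_TYPE_KEY, "SYNCABLE_COMPOSITE");
* try (FSDataOutputStream out = fs.create(new Path("gs://bucket/app.log"))) {
*   out.write(payload);
*   out.hsync(); // persists bytes written so far by composing temporary objects
* }
* }</pre>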
*/
public static final String GCS_OUTPUTSTREAM_TYPE_KEY = "fs.gs.outputstream.type";
/** Default value for {@link #GCS_OUTPUTSTREAM_TYPE_KEY}. */
public static final String GCS_OUTPUTSTREAM_TYPE_DEFAULT = "BASIC";
/** Available types for use with {@link #GCS_OUTPUTSTREAM_TYPE_KEY}. */
public static enum OutputStreamType {
BASIC,
SYNCABLE_COMPOSITE
}
/**
* If true, the returned FSDataInputStream from the open(Path) method will hold an internal
* ByteBuffer of size fs.gs.io.buffersize which it pre-fills on each read, and can efficiently
* seek within the internal buffer. Otherwise, calls are delegated straight through to a lower
* level channel and the value of {@link #BUFFERSIZE_KEY} is passed through for the lower-level
* channel to interpret as it sees fit.
*/
public static final String GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY =
"fs.gs.inputstream.internalbuffer.enable";
/** Default value for {@link #GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY}. */
public static final boolean GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_DEFAULT = false;
/**
* If true, input streams will proactively check the "content-encoding" header of underlying
* objects during reads for special handling of cases where content-encoding causes the reported
* object sizes to not match the actual number of read bytes due to the content being decoded
* in-transit; such encoded objects also aren't suitable for splitting or resuming on failure, so
* the underlying channel will restart from byte 0 and discard the requisite number of bytes to
* seek to a desired position or resume in such cases. In general, content-encoded objects are
* *not* well-suited for FileSystem-style access, and will break most of the split computations in
* the Hadoop subsystem anyways. To avoid paying the cost of an extra metadata GET on every single
* opened channel in the usual case where no content-encoded objects are present, it may be
* desirable to set this to 'false'.
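*
* <p>Illustrative opt-out sketch:
*
* <pre>{@code
* conf.setBoolean(GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY, false);
* }</pre>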
*/
public static final String GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY =
"fs.gs.inputstream.support.content.encoding.enable";
/** Default value for {@link #GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY}. */
public static final boolean GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_DEFAULT = true;
/**
* If forward seeks are within this many bytes of the current position, seeks are performed by
* reading and discarding bytes in-place rather than opening a new underlying stream.
*/
public static final String GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY =
"fs.gs.inputstream.inplace.seek.limit";
/** Default value for {@link #GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY}. */
public static final long GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_DEFAULT = 8 * 1024 * 1024L;
/** Tunes reading objects behavior to optimize HTTP GET requests for various use cases. */
public static final String GCS_INPUTSTREAM_FADVISE_KEY = "fs.gs.inputstream.fadvise";
/** Default value for {@link #GCS_INPUTSTREAM_FADVISE_KEY}. */
public static final Fadvise GCS_INPUTSTREAM_FADVISE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_FADVISE;
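// Illustrative read-tuning sketch (values are examples; the fadvise string is assumed to name a
// GoogleCloudStorageReadOptions.Fadvise constant):
//   conf.set(GCS_INPUTSTREAM_FADVISE_KEY, "RANDOM");
//   conf.setLong(GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY, 8 * 1024 * 1024L);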
/**
* Minimum size in bytes of the HTTP Range header set in GCS request when opening new stream to
* read an object.
*/
public static final String GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_KEY =
"fs.gs.inputstream.min.range.request.size";
/** Default value for {@link #GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_KEY}. */
public static final int GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_MIN_RANGE_REQUEST_SIZE;
/**
* Size of the object footer that will be prefetched when read channel opened. Footer prefetching
* is disabled if this property is set to 0.
*/
public static final String GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_KEY =
"fs.gs.inputstream.footer.prefetch.size";
/** Default value for {@link #GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_KEY}. */
public static final int GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_FOOTER_PREFETCH_SIZE;
/**
* If true, recursive delete on a path that refers to a GCS bucket itself ('/' for any
* bucket-rooted GoogleHadoopFileSystem) or delete on that path when it's empty will result in
* fully deleting the GCS bucket. If false, any operation that normally would have deleted the
* bucket will be ignored instead. Setting to 'false' preserves the typical behavior of "rm -rf /"
* which translates to deleting everything inside of root, but without clobbering the filesystem
* authority corresponding to that root path in the process.
*/
public static final String GCE_BUCKET_DELETE_ENABLE_KEY = "fs.gs.bucket.delete.enable";
/** Default value for {@link #GCE_BUCKET_DELETE_ENABLE_KEY}. */
public static final boolean GCE_BUCKET_DELETE_ENABLE_DEFAULT = false;
/** Default PathFilter that accepts all paths. */
public static final PathFilter DEFAULT_FILTER =
new PathFilter() {
@Override
public boolean accept(Path path) {
return true;
}
};
/** A resource file containing GCS related build properties. */
public static final String PROPERTIES_FILE = "gcs.properties";
/** The key in the PROPERTIES_FILE that contains the version built. */
public static final String VERSION_PROPERTY = "gcs.connector.version";
/** The version returned when one cannot be found in properties. */
public static final String UNKNOWN_VERSION = "0.0.0";
/** Current version. */
public static final String VERSION;
/** Identifies this version of the GoogleHadoopFileSystemBase library. */
public static final String GHFS_ID;
static {
VERSION =
PropertyUtil.getPropertyOrDefault(
GoogleHadoopFileSystemBase.class, PROPERTIES_FILE, VERSION_PROPERTY, UNKNOWN_VERSION);
LOG.info("GHFS version: {}", VERSION);
GHFS_ID = String.format("GHFS/%s", VERSION);
}
/** Instance value of {@link #GCS_ENABLE_FLAT_GLOB_KEY} based on the initial Configuration. */
private boolean enableFlatGlob = GCS_ENABLE_FLAT_GLOB_DEFAULT;
/** The URI the File System is passed in initialize. */
protected URI initUri;
/**
* The retrieved configuration value for {@link #GCS_SYSTEM_BUCKET_KEY}. Used as a fallback for a
* root bucket, when required.
*/
@Deprecated protected String systemBucket;
/** Underlying GCS file system object. */
protected GoogleCloudStorageFileSystem gcsfs;
/**
* Current working directory; overridden in initialize() if {@link #GCS_WORKING_DIRECTORY_KEY} is
* set.
*/
private Path workingDirectory;
/** Buffer size to use instead of what Hadoop passed. */
private int bufferSizeOverride = BUFFERSIZE_DEFAULT;
/**
* Default block size. Note that this is the size that is reported to Hadoop FS clients. It does
* not modify the actual block size of an underlying GCS object, because GCS JSON API does not
* allow modifying or querying the value. Modifying this value allows one to control how many
* mappers are used to process a given file.
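*
* <p>Illustrative: at the 64 MiB default, a 1 GiB file is reported as 16 blocks (hence typically
* 16 input splits); e.g. {@code conf.setLong(BLOCK_SIZE_KEY, 128 * 1024 * 1024)} would halve that.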
*/
protected long defaultBlockSize = BLOCK_SIZE_DEFAULT;
/** The fixed reported permission of all files. */
private FsPermission reportedPermissions;
/** Map of counter values */
protected final ImmutableMap<Counter, AtomicLong> counters = createCounterMap();
protected ImmutableMap<Counter, AtomicLong> createCounterMap() {
EnumMap<Counter, AtomicLong> countersMap = new EnumMap<>(Counter.class);
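// Pre-populate a zeroed AtomicLong for every counter so lookups in the immutable map always succeed.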
for (Counter counter : ALL_COUNTERS) {
countersMap.put(counter, new AtomicLong());
}
return Maps.immutableEnumMap(countersMap);
}
/**
* Behavior of listStatus when a path is not found.
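*
* <p>Illustrative sketch (path is hypothetical):
*
* <pre>{@code
* // Hadoop1 returns null for a missing path; Hadoop2 throws FileNotFoundException.
* FileStatus[] result = ListStatusFileNotFoundBehavior.get().handle("gs://bucket/missing");
* }</pre>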
*/
protected enum ListStatusFileNotFoundBehavior {
Hadoop1 {
@Override
public FileStatus[] handle(String path) throws IOException {
return null;
}
},
Hadoop2 {
@Override
public FileStatus[] handle(String path) throws IOException {
throw new FileNotFoundException(String.format("Path '%s' does not exist.", path));
}
};
/**
* Perform version specific handling for a missing path.
* @param path The missing path
*/
public abstract FileStatus[] handle(String path) throws IOException;
/**
* Get the ListStatusFileNotFoundBehavior for the currently running Hadoop version.
*/
public static ListStatusFileNotFoundBehavior get() {
return get(HadoopVersionInfo.getInstance());
}
/**
* Get the ListStatusFileNotFoundBehavior for the given Hadoop version.
* @param hadoopVersionInfo The hadoop version.
*/
public static ListStatusFileNotFoundBehavior get(HadoopVersionInfo hadoopVersionInfo) {
if (hadoopVersionInfo.isGreaterThan(2, 0)
|| hadoopVersionInfo.isEqualTo(2, 0)
|| hadoopVersionInfo.isEqualTo(0, 23)) {
return Hadoop2;
}
return Hadoop1;
}
}
// Behavior when a path is not found in listStatus()
protected ListStatusFileNotFoundBehavior listStatusFileNotFoundBehavior =
ListStatusFileNotFoundBehavior.get();
@VisibleForTesting
protected void setListStatusFileNotFoundBehavior(ListStatusFileNotFoundBehavior behavior) {
this.listStatusFileNotFoundBehavior = behavior;
}
/**
* Defines names of counters we track for each operation.
*
* There are two types of counters:
* -- METHOD_NAME : Number of successful invocations of method METHOD.
* -- METHOD_NAME_TIME : Total inclusive time spent in method METHOD.
*/
public enum Counter {
APPEND,
APPEND_TIME,
CREATE,
CREATE_TIME,
DELETE,
DELETE_TIME,
GET_FILE_STATUS,
GET_FILE_STATUS_TIME,
INIT,
INIT_TIME,
INPUT_STREAM,
INPUT_STREAM_TIME,
LIST_STATUS,
LIST_STATUS_TIME,
MKDIRS,
MKDIRS_TIME,
OPEN,
OPEN_TIME,
OUTPUT_STREAM,
OUTPUT_STREAM_TIME,
READ1,
READ1_TIME,
READ,
READ_TIME,
READ_FROM_CHANNEL,
READ_FROM_CHANNEL_TIME,
READ_CLOSE,
READ_CLOSE_TIME,
READ_POS,
READ_POS_TIME,
RENAME,
RENAME_TIME,
SEEK,
SEEK_TIME,
SET_WD,
SET_WD_TIME,
WRITE1,
WRITE1_TIME,
WRITE,
WRITE_TIME,
WRITE_CLOSE,
WRITE_CLOSE_TIME,
}
/**
* Set of all counters.
*
* <p>It is used for performance optimization instead of `Counter.values`, because
* `Counter.values` returns new array on each invocation.
*/
private static final ImmutableSet<Counter> ALL_COUNTERS =
Sets.immutableEnumSet(EnumSet.allOf(Counter.class));
/**
* A predicate that processes individual directory paths and evaluates the conditions set in
* fs.gs.parent.timestamp.update.enable, fs.gs.parent.timestamp.update.substrings.includes and
* fs.gs.parent.timestamp.update.substrings.excludes to determine if a path should be ignored
* when running directory timestamp updates. If no match is found in either include or
* exclude and updates are enabled, the directory timestamp will be updated.
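*
* <p>Illustrative sketch (substrings are hypothetical):
*
* <pre>{@code
* ParentTimestampUpdateIncludePredicate predicate =
*     new ParentTimestampUpdateIncludePredicate(
*         true, // enableTimestampUpdates
*         ImmutableList.of("/done-dir"), // include substrings
*         ImmutableList.of("/")); // exclude substrings
* boolean update = predicate.shouldUpdateTimestamp(URI.create("gs://bucket/done-dir/job1"));
* }</pre>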
*/
public static class ParentTimestampUpdateIncludePredicate
implements GoogleCloudStorageFileSystemOptions.TimestampUpdatePredicate {
/**
* Create a new ParentTimestampUpdateIncludePredicate from the passed Hadoop configuration
* object.
*/
public static ParentTimestampUpdateIncludePredicate create(Configuration config) {
boolean enableDirectoryTimestampUpdating = config.getBoolean(
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
enableDirectoryTimestampUpdating);
String includedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY, includedParentPaths);
List<String> splitIncludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(includedParentPaths);
String excludedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY, excludedParentPaths);
List<String> splitExcludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(excludedParentPaths);
return new ParentTimestampUpdateIncludePredicate(
enableDirectoryTimestampUpdating,
splitIncludedParentPaths,
splitExcludedParentPaths);
}
// Include and exclude lists are intended to be small N and checked relatively
// infrequently. If that becomes not that case, consider Aho-Corasick or similar matching
// algorithms.
private final List<String> includeSubstrings;
private final List<String> excludeSubstrings;
private final boolean enableTimestampUpdates;
public ParentTimestampUpdateIncludePredicate(
boolean enableTimestampUpdates,
List<String> includeSubstrings,
List<String> excludeSubstrings) {
this.includeSubstrings = includeSubstrings;
this.excludeSubstrings = excludeSubstrings;
this.enableTimestampUpdates = enableTimestampUpdates;
}
/**
* Determine if updating directory timestamps should be ignored.
* @return True if the directory timestamp should not be updated. False to indicate it should
* be updated.
*/
@Override
public boolean shouldUpdateTimestamp(URI uri) {
if (!enableTimestampUpdates) {
LOG.debug("Timestamp updating disabled. Not updating uri {}", uri);
return false;
}
for (String include : includeSubstrings) {
if (uri.toString().contains(include)) {
LOG.debug("Path {} matched included path {}. Updating timestamps.", uri, include);
return true;
}
}
for (String exclude : excludeSubstrings) {
if (uri.toString().contains(exclude)) {
LOG.debug("Path {} matched excluded path {}. Not updating timestamps.", uri, exclude);
return false;
}
}
return true;
}
}
/**
* Constructs an instance of GoogleHadoopFileSystemBase; the internal
* GoogleCloudStorageFileSystem will be set up with config settings when initialize() is called.
*/
public GoogleHadoopFileSystemBase() {
}
/**
* Constructs an instance of GoogleHadoopFileSystemBase using the provided
* GoogleCloudStorageFileSystem; initialize() will not re-initialize it.
*/
public GoogleHadoopFileSystemBase(GoogleCloudStorageFileSystem gcsfs) {
Preconditions.checkArgument(gcsfs != null, "gcsfs must not be null");
this.gcsfs = gcsfs;
}
/**
* Returns an unqualified path without any leading slash, relative to the filesystem root,
* which serves as the home directory of the current user; see {@code getHomeDirectory} for
* a description of what the home directory means.
*/
protected abstract String getHomeDirectorySubpath();
/**
* Gets Hadoop path corresponding to the given GCS path.
*
* @param gcsPath Fully-qualified GCS path, of the form gs://<bucket>/<object>