/**
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import com.google.api.client.auth.oauth2.Credential;
import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.DirectoryListCache;
import com.google.cloud.hadoop.gcsio.FileInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions;
import com.google.cloud.hadoop.gcsio.PathCodec;
import com.google.cloud.hadoop.util.ConfigurationUtil;
import com.google.cloud.hadoop.util.CredentialFactory;
import com.google.cloud.hadoop.util.HadoopCredentialConfiguration;
import com.google.cloud.hadoop.util.HadoopVersionInfo;
import com.google.cloud.hadoop.util.HttpTransportFactory;
import com.google.cloud.hadoop.util.PropertyUtil;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.file.DirectoryNotEmptyException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobPattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class provides a Hadoop compatible File System on top of Google Cloud Storage (GCS).
*
* It is implemented as a thin abstraction layer on top of GCS.
* The layer hides any specific characteristics of the underlying store and exposes the
* FileSystem interface understood by the Hadoop engine.
*
* Users interact with the files in the storage using fully qualified URIs.
* The file system exposed by this class is identified using the 'gs' scheme.
* For example, {@code gs://dir1/dir2/file1.txt}.
*
* This implementation translates paths between Hadoop Path and GCS URI with the convention that
* the Hadoop root directly corresponds to the GCS "root", e.g. gs:/. This is convenient for many
* reasons, such as data portability and close equivalence to gsutil paths, but imposes certain
* inherited constraints, such as files not being allowed in root (only 'directories' can be
* placed in root) and directory names inside root having a more limited set of allowed
* characters.
*
* One of the main goals of this implementation is to maintain compatibility
* with behavior of HDFS implementation when accessed through FileSystem interface.
* The HDFS implementation is not very consistent about when it throws versus
* when methods return false. We run GHFS tests and HDFS tests against the
* same test data and use that as a guide to decide whether to throw or to
* return false.
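*
* <p>A minimal usage sketch (the project ID, bucket, and paths below are placeholders; the
* concrete subclass registered for the 'gs' scheme is assumed to be on the classpath):
* <pre>{@code
*   Configuration conf = new Configuration();
*   conf.set("fs.gs.project.id", "my-project");  // hypothetical project ID
*   FileSystem fs = FileSystem.get(URI.create("gs://my-bucket/"), conf);
*   FileStatus[] entries = fs.listStatus(new Path("gs://my-bucket/dir1/"));
* }</pre>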
*/
public abstract class GoogleHadoopFileSystemBase
extends FileSystem implements FileSystemDescriptor {
// Logger.
public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystemBase.class);
// Default value of replication factor.
public static final short REPLICATION_FACTOR_DEFAULT = 3;
// We report this value as a file's owner/group name.
private static final String USER_NAME = System.getProperty("user.name");
// Splitter for list values stored in a single configuration value
private static final Splitter CONFIGURATION_SPLITTER = Splitter.on(',');
// -----------------------------------------------------------------
// Configuration settings.
// Key for the permissions that we report a file or directory to have.
// Can either be octal or symbolic mode accepted by {@link FsPermission#FromString(String)}
public static final String PERMISSIONS_TO_REPORT_KEY = "fs.gs.reported.permissions";
// Default value for the permissions that we report a file or directory to have.
// Note:
// We do not really support file/dir permissions but we need to
// report some permission value when Hadoop calls getFileStatus().
// A MapReduce job fails if we report permissions more relaxed than
// the value below when this is the default file system.
public static final String PERMISSIONS_TO_REPORT_DEFAULT = "700";
// Configuration key for setting IO buffer size.
// TODO(user): rename the following to indicate that it is read buffer size.
public static final String BUFFERSIZE_KEY = "fs.gs.io.buffersize";
// Default value of fs.gs.io.buffersize.
// Note: Hadoop passes 4096 bytes as the buffer size, which causes poor performance.
public static final int BUFFERSIZE_DEFAULT = 8 * 1024 * 1024;
// Configuration key for setting write buffer size.
public static final String WRITE_BUFFERSIZE_KEY = "fs.gs.io.buffersize.write";
// Default value of fs.gs.io.buffersize.write.
// TODO(user): Get the following value (chunk size, etc.) from the GCSWC class in a better
// way. For now, we hard-code it to a known good value.
public static final int WRITE_BUFFERSIZE_DEFAULT = 64 * 1024 * 1024;
// Configuration key for default block size of a file.
public static final String BLOCK_SIZE_KEY = "fs.gs.block.size";
// Default value of fs.gs.block.size.
public static final int BLOCK_SIZE_DEFAULT = 64 * 1024 * 1024;
// Prefix to use for common authentication keys
public static final String AUTHENTICATION_PREFIX = "fs.gs";
// Configuration key for enabling GCE service account authentication.
// This key is deprecated. See HadoopCredentialConfiguration for current key names.
public static final String ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY =
"fs.gs.enable.service.account.auth";
// Configuration key specifying the email address of the service-account with which to
// authenticate. Only required if fs.gs.enable.service.account.auth is true AND we're using
// fs.gs.service.account.auth.keyfile to authenticate with a private keyfile.
// NB: Once GCE supports setting multiple service account email addresses for metadata auth,
// this key will also be used in the metadata auth flow.
// This key is deprecated. See HadoopCredentialConfiguration for current key names.
public static final String SERVICE_ACCOUNT_AUTH_EMAIL_KEY =
"fs.gs.service.account.auth.email";
// Configuration key specifying local file containing a service-account private .p12 keyfile.
// Only used if fs.gs.enable.service.account.auth is true; if provided, the keyfile will be used
// for service-account authentication. Otherwise, it is assumed that we are on a GCE VM with
// metadata-authentication for service-accounts enabled, and the metadata server will be used
// instead.
// Default value: none
// This key is deprecated. See HadoopCredentialConfiguration for current key names.
public static final String SERVICE_ACCOUNT_AUTH_KEYFILE_KEY =
"fs.gs.service.account.auth.keyfile";
// Configuration key for GCS project ID.
// Default value: none
public static final String GCS_PROJECT_ID_KEY = "fs.gs.project.id";
// Configuration key for GCS client ID.
// Required if fs.gs.enable.service.account.auth == false.
// Default value: none
// This key is deprecated. See HadoopCredentialConfiguration for current key names.
public static final String GCS_CLIENT_ID_KEY = "fs.gs.client.id";
// Configuration key for GCS client secret.
// Required if fs.gs.enable.service.account.auth == false.
// Default value: none
// This key is deprecated. See HadoopCredentialConfiguration for current key names.
public static final String GCS_CLIENT_SECRET_KEY = "fs.gs.client.secret";
// Configuration key for system bucket name. It is a fallback for the
// rootBucket of GoogleHadoopFileSystem in gs:///path URIs.
// Default value: none
// This key is deprecated. Always initialize the FileSystem with a bucket.
public static final String GCS_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket";
// Configuration key for flag to indicate whether system bucket should be created
// if it does not exist.
// This key is deprecated. See GCS_SYSTEM_BUCKET_KEY.
public static final String GCS_CREATE_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket.create";
// Default value of fs.gs.system.bucket.create.
public static final boolean GCS_CREATE_SYSTEM_BUCKET_DEFAULT = true;
// Configuration key for initial working directory of a GHFS instance.
// Default value: '/'
public static final String GCS_WORKING_DIRECTORY_KEY = "fs.gs.working.dir";
// Configuration key for setting 250GB upper limit on file size to gain higher write throughput.
public static final String GCS_FILE_SIZE_LIMIT_250GB = "fs.gs.file.size.limit.250gb";
// Default value of fs.gs.file.size.limit.250gb.
public static final boolean GCS_FILE_SIZE_LIMIT_250GB_DEFAULT = false;
// Configuration key for using a local metadata cache to supplement GCS API "list" results;
// this allows same-client create() to immediately be visible to a subsequent list() call.
public static final String GCS_ENABLE_METADATA_CACHE_KEY = "fs.gs.metadata.cache.enable";
// Default value for fs.gs.metadata.cache.enable.
public static final boolean GCS_ENABLE_METADATA_CACHE_DEFAULT = true;
// Configuration key for whether or not we should update timestamps for parent directories
// when we create new files in them.
public static final String GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY =
"fs.gs.parent.timestamp.update.enable";
// Default value for fs.gs.parent.timestamp.update.enable.
public static final boolean GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT = true;
// Configuration key for specifying which implementation of DirectoryListCache to use for
// supplementing GCS API "list" results. Supported implementations:
// IN_MEMORY: Enforces immediate consistency within same Java process.
// FILESYSTEM_BACKED: Enforces consistency across all cooperating processes pointed at the same
// local mirror directory, which may be an NFS directory for distributed coordination.
public static final String GCS_METADATA_CACHE_TYPE_KEY = "fs.gs.metadata.cache.type";
// Default value for fs.gs.metadata.cache.type.
public static final String GCS_METADATA_CACHE_TYPE_DEFAULT = "IN_MEMORY";
// Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies the local path to
// use as the base path for storing mirrored GCS metadata. Must be an absolute path, must be
// a directory, and must be fully readable/writable/executable by any user running processes
// which use the GCS connector.
public static final String GCS_METADATA_CACHE_DIRECTORY_KEY = "fs.gs.metadata.cache.directory";
// Default value for fs.gs.metadata.cache.directory.
public static final String GCS_METADATA_CACHE_DIRECTORY_DEFAULT =
"/tmp/gcs_connector_metadata_cache";
// Maximum number of milliseconds a cache entry will remain in the list-consistency cache, even
// as an id-only entry (no risk of stale GoogleCloudStorageItemInfo). In general, entries should
// be allowed to expire fully from the cache once reasonably certain the remote GCS API's
// list-index is up-to-date to save memory and computation when trying to supplement new results
// using the cache.
public static final String GCS_METADATA_CACHE_MAX_ENTRY_AGE_KEY =
"fs.gs.metadata.cache.max.age.entry.ms";
// Default value for fs.gs.metadata.cache.max.age.entry.ms.
public static final long GCS_METADATA_CACHE_MAX_ENTRY_AGE_DEFAULT =
DirectoryListCache.Config.MAX_ENTRY_AGE_MILLIS_DEFAULT;
// Maximum number of milliseconds a GoogleCloudStorageItemInfo will remain "valid" in the
// list-consistency cache, after which the next attempt to fetch the itemInfo will require
// fetching fresh info from a GoogleCloudStorage instance.
public static final String GCS_METADATA_CACHE_MAX_INFO_AGE_KEY =
"fs.gs.metadata.cache.max.age.info.ms";
// Default value for fs.gs.metadata.cache.max.age.info.ms.
public static final long GCS_METADATA_CACHE_MAX_INFO_AGE_DEFAULT =
DirectoryListCache.Config.MAX_INFO_AGE_MILLIS_DEFAULT;
// Configuration key containing a comma-separated list of sub-strings that when matched will
// cause a particular directory to not have its modification timestamp updated.
// Includes take precedence over excludes.
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.excludes";
// Default value for fs.gs.parent.timestamp.update.substrings.excludes.
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT =
"/";
// Configuration key for the MR intermediate done dir.
public static final String MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY =
"mapreduce.jobhistory.intermediate-done-dir";
// Configuration key of the MR done directory.
public static final String MR_JOB_HISTORY_DONE_DIR_KEY =
"mapreduce.jobhistory.done-dir";
// Configuration key containing a comma-separated list of sub-strings that when matched will
// cause a particular directory to have its modification timestamp updated.
// Includes take precedence over excludes.
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.includes";
// Default value for fs.gs.parent.timestamp.update.substrings.includes.
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT =
String.format(
"${%s},${%s}",
MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY,
MR_JOB_HISTORY_DONE_DIR_KEY);
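// For illustration: Hadoop's Configuration.get() expands the ${...} references above at
// read time, so with stock MapReduce settings the include list resolves to the cluster's
// intermediate-done and done directories (exact paths depend on the job-history settings):
//   String includes = conf.get(GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY,
//       GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT);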
// Configuration key for enabling automatic repair of implicit directories whenever detected
// inside listStatus and globStatus calls, or other methods which may indirectly call listStatus
// and/or globStatus.
public static final String GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.repair.enable";
// Default value for fs.gs.implicit.dir.repair.enable.
public static final boolean GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT = true;
// Configuration key for changing the path codec from legacy to 'uri path encoding'.
public static final String PATH_CODEC_KEY = "fs.gs.path.encoding";
// Use new URI_ENCODED_PATH_CODEC
public static final String PATH_CODEC_USE_URI_ENCODING = "uri-path";
// Use LEGACY_PATH_CODEC
public static final String PATH_CODEC_USE_LEGACY_ENCODING = "legacy";
// Use the default path codec.
public static final String PATH_CODEC_DEFAULT = PATH_CODEC_USE_LEGACY_ENCODING;
// Instance value of fs.gs.implicit.dir.repair.enable based on the initial Configuration.
private boolean enableAutoRepairImplicitDirectories =
GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT;
// Configuration key for enabling automatic inference of implicit directories.
// If set, we create and return in-memory directory objects on the fly when
// no backing object exists, but we know there are files with the same prefix.
// The ENABLE_REPAIR flag takes precedence over this flag: if both are set,
// the repair is attempted, and only if it fails does the setting of this
// flag kick in.
public static final String GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.infer.enable";
// Default value for fs.gs.implicit.dir.infer.enable.
public static final boolean GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT = true;
// Instance value of fs.gs.implicit.dir.infer.enable
// based on the initial Configuration.
private boolean enableInferImplicitDirectories =
GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT;
// Configuration key for enabling the use of a large flat listing to pre-populate possible
// glob matches in a single API call before running the core globbing logic in-memory rather
// than sequentially and recursively performing API calls.
public static final String GCS_ENABLE_FLAT_GLOB_KEY = "fs.gs.glob.flatlist.enable";
// Default value for fs.gs.glob.flatlist.enable.
public static final boolean GCS_ENABLE_FLAT_GLOB_DEFAULT = true;
// Configuration key for enabling the use of marker files during file creation. When running
// non-MR applications that make use of the FileSystem, it is a good idea to enable marker
// files to better mimic HDFS overwrite and locking behavior.
public static final String GCS_ENABLE_MARKER_FILE_CREATION_KEY =
"fs.gs.create.marker.files.enable";
// Default value for fs.gs.create.marker.files.enable
public static final boolean GCS_ENABLE_MARKER_FILE_CREATION_DEFAULT = false;
// Configuration key for setting a proxy for the connector to use to connect to GCS.
// The proxy must be an HTTP proxy of the form "host:port".
public static final String GCS_PROXY_ADDRESS_KEY = "fs.gs.proxy.address";
// Default to no proxy.
public static final String GCS_PROXY_ADDRESS_DEFAULT = null;
// Configuration key for the name of HttpTransport class to use for connecting to GCS.
// Must be the name of an HttpTransportFactory.HttpTransportType (APACHE or JAVA_NET).
public static final String GCS_HTTP_TRANSPORT_KEY = "fs.gs.http.transport.type";
// Default to the default specified in HttpTransportFactory.
public static final String GCS_HTTP_TRANSPORT_DEFAULT = null;
// Configuration key for adding a suffix to the GHFS application name sent to GCS.
public static final String GCS_APPLICATION_NAME_SUFFIX_KEY = "fs.gs.application.name.suffix";
// Default suffix to add to the application name.
public static final String GCS_APPLICATION_NAME_SUFFIX_DEFAULT = "";
// Configuration key for which type of output stream to use; different options may have different
// degrees of support for advanced features like hsync() and different performance
// characteristics. Options:
// BASIC: Stream is the closest analogue to a direct wrapper around a low-level HTTP stream
// into GCS.
// SYNCABLE_COMPOSITE: Stream behaves similarly to BASIC when used with basic create/write/close
// patterns, but supports hsync() by creating discrete temporary GCS objects which are
// composed onto the destination object. Has a hard upper limit of number of components
// which can be composed onto the destination object.
public static final String GCS_OUTPUTSTREAM_TYPE_KEY = "fs.gs.outputstream.type";
// Default value for fs.gs.outputstream.type.
public static final String GCS_OUTPUTSTREAM_TYPE_DEFAULT = "BASIC";
/**
* Available types for use with fs.gs.outputstream.type.
*/
public static enum OutputStreamType {
BASIC,
SYNCABLE_COMPOSITE
}
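// For illustration (a sketch): a client that needs hsync() support could opt in with
//   conf.set(GCS_OUTPUTSTREAM_TYPE_KEY, OutputStreamType.SYNCABLE_COMPOSITE.name());
// before creating the FileSystem; BASIC remains the default.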
// If true, the returned FSDataInputStream from the open(Path) method will hold an internal
// ByteBuffer of size fs.gs.io.buffersize which it pre-fills on each read, and can efficiently
// seek within the internal buffer. Otherwise, calls are delegated straight through to a lower
// level channel and the value of fs.gs.io.buffersize is passed through for the lower-level
// channel to interpret as it sees fit.
public static final String GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY =
"fs.gs.inputstream.internalbuffer.enable";
// Default value for fs.gs.inputstream.internalbuffer.enable.
public static final boolean GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_DEFAULT = false;
// If true, input streams will proactively check the "content-encoding" header of underlying
// objects during reads for special handling of cases where content-encoding causes the
// reported object sizes to not match the actual number of read bytes due to the content
// being decoded in-transit; such encoded objects also aren't suitable for splitting or
// resuming on failure, so the underlying channel will restart from byte 0 and discard the
// requisite number of bytes to seek to a desired position or resume in such cases. In
// general, content-encoded objects are *not* well-suited for FileSystem-style access, and
// will break most of the split computations in the Hadoop subsystem anyways. To avoid
// paying the cost of an extra metadata GET on every single opened channel in the usual case
// where no content-encoded objects are present, it may be desirable to set this to 'false'.
public static final String GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY =
"fs.gs.inputstream.support.content.encoding.enable";
// Default value for fs.gs.inputstream.support.content.encoding.enable.
public static final boolean GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_DEFAULT = true;
// If true, on opening a file we will proactively perform a metadata GET to check whether
// the object exists, even though the underlying channel will not open a data stream
// until read() is actually called so that streams can seek to nonzero file positions
// without incurring an extra stream creation. This is necessary to technically match the
// expected behavior of Hadoop filesystems, but incurs extra latency overhead on open().
// If the calling code can handle late failures on not-found errors, or has independently
// already ensured that a file exists before calling open(), then set this to false for
// more efficient reads.
public static final String GCS_INPUTSTREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE_KEY =
"fs.gs.inputstream.fast.fail.on.not.found.enable";
// Default value for fs.gs.inputstream.fast.fail.on.not.found.enable.
public static final boolean GCS_INPUTSTREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE_DEFAULT = true;
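// For example (a sketch): a client that has already verified existence via getFileStatus()
// could set conf.setBoolean(GCS_INPUTSTREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE_KEY, false) to
// skip the extra metadata GET on each open().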
// If forward seeks are within this many bytes of the current position, seeks are performed
// by reading and discarding bytes in-place rather than opening a new underlying stream.
public static final String GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY =
"fs.gs.inputstream.inplace.seek.limit";
// Default value for fs.gs.inputstream.inplace.seek.limit.
public static final long GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_DEFAULT = 8 * 1024 * 1024L;
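// Worked example: with the default 8 MiB limit, a forward seek from position 0 to
// 4 * 1024 * 1024 is served by reading and discarding 4 MiB on the open stream, whereas a
// seek to 16 * 1024 * 1024 exceeds the limit and a new underlying stream is opened at the
// target offset.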
// Default PathFilter that accepts all paths.
public static final PathFilter DEFAULT_FILTER = new PathFilter() {
@Override
public boolean accept(Path path) {
return true;
}
};
// A resource file containing GCS related build properties.
public static final String PROPERTIES_FILE = "gcs.properties";
// The key in the PROPERTIES_FILE that contains the version built.
public static final String VERSION_PROPERTY = "gcs.connector.version";
// The version returned when one cannot be found in properties.
public static final String UNKNOWN_VERSION = "0.0.0";
// Current version.
public static final String VERSION;
// Identifies this version of the GoogleHadoopFileSystemBase library.
public static final String GHFS_ID;
static {
VERSION = PropertyUtil.getPropertyOrDefault(
GoogleHadoopFileSystemBase.class, PROPERTIES_FILE, VERSION_PROPERTY, UNKNOWN_VERSION);
LOG.info("GHFS version: {}", VERSION);
GHFS_ID = String.format("GHFS/%s", VERSION);
}
// Instance value of fs.gs.glob.flatlist.enable based on the initial Configuration.
private boolean enableFlatGlob = GCS_ENABLE_FLAT_GLOB_DEFAULT;
// The URI that this FileSystem was passed in initialize().
protected URI initUri;
// The retrieved configuration value for fs.gs.system.bucket.
// Used as a fallback for a root bucket, when required.
@Deprecated
protected String systemBucket;
// Underlying GCS file system object.
protected GoogleCloudStorageFileSystem gcsfs;
// Current working directory; overridden in initialize() if fs.gs.working.dir is set.
private Path workingDirectory;
// Buffer size to use instead of what Hadoop passed.
private int bufferSizeOverride = BUFFERSIZE_DEFAULT;
// Default block size.
// Note that this is the size that is reported to Hadoop FS clients.
// It does not modify the actual block size of an underlying GCS object,
// because GCS JSON API does not allow modifying or querying the value.
// Modifying this value allows one to control how many mappers are used
// to process a given file.
protected long defaultBlockSize = BLOCK_SIZE_DEFAULT;
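// Illustrative arithmetic: with the default 64 MB reported block size, a 1 GiB object is
// reported to Hadoop as 16 blocks, so an input format creating one split per block would
// schedule roughly 16 mappers; halving fs.gs.block.size roughly doubles that count.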
// The fixed reported permission of all files.
private FsPermission reportedPermissions;
// Map of counter values.
protected final ImmutableMap<Counter, AtomicLong> counters = createCounterMap();
protected ImmutableMap<Counter, AtomicLong> createCounterMap() {
ImmutableMap.Builder<Counter, AtomicLong> builder = ImmutableMap.builder();
for (Counter counter : Counter.values()) {
builder.put(counter, new AtomicLong());
}
return builder.build();
}
/**
* Behavior of listStatus when a path is not found.
*/
protected enum ListStatusFileNotFoundBehavior {
Hadoop1 {
@Override
public FileStatus[] handle(String path) throws IOException {
return null;
}
},
Hadoop2 {
@Override
public FileStatus[] handle(String path) throws IOException {
throw new FileNotFoundException(String.format("Path '%s' does not exist.", path));
}
};
/**
* Perform version specific handling for a missing path.
* @param path The missing path
*/
public abstract FileStatus[] handle(String path) throws IOException;
/**
* Get the ListStatusFileNotFoundBehavior for the currently running Hadoop version.
*/
public static ListStatusFileNotFoundBehavior get() {
return get(HadoopVersionInfo.getInstance());
}
/**
* Get the ListStatusFileNotFoundBehavior for the given Hadoop version.
* @param hadoopVersionInfo The hadoop version.
*/
public static ListStatusFileNotFoundBehavior get(HadoopVersionInfo hadoopVersionInfo) {
if (hadoopVersionInfo.isGreaterThan(2, 0) || hadoopVersionInfo.isEqualTo(0, 23)) {
return Hadoop2;
}
return Hadoop1;
}
}
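// For example (the path is a placeholder): under this policy, listStatus on a missing path
// such as gs://bucket/no-such-dir returns null on Hadoop 1.x but throws
// FileNotFoundException on Hadoop 2.x (and 0.23), mirroring the corresponding HDFS behavior.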
// Behavior when a path is not found in listStatus()
protected ListStatusFileNotFoundBehavior listStatusFileNotFoundBehavior =
ListStatusFileNotFoundBehavior.get();
@VisibleForTesting
protected void setListStatusFileNotFoundBehavior(ListStatusFileNotFoundBehavior behavior) {
this.listStatusFileNotFoundBehavior = behavior;
}
/**
* Defines names of counters we track for each operation.
*
* There are two types of counters:
* -- METHOD_NAME : Number of successful invocations of method METHOD.
* -- METHOD_NAME_TIME : Total inclusive time spent in method METHOD.
*/
public enum Counter {
APPEND,
APPEND_TIME,
CREATE,
CREATE_TIME,
DELETE,
DELETE_TIME,
GET_FILE_STATUS,
GET_FILE_STATUS_TIME,
INIT,
INIT_TIME,
INPUT_STREAM,
INPUT_STREAM_TIME,
LIST_STATUS,
LIST_STATUS_TIME,
MKDIRS,
MKDIRS_TIME,
OPEN,
OPEN_TIME,
OUTPUT_STREAM,
OUTPUT_STREAM_TIME,
READ1,
READ1_TIME,
READ,
READ_TIME,
READ_FROM_CHANNEL,
READ_FROM_CHANNEL_TIME,
READ_CLOSE,
READ_CLOSE_TIME,
READ_POS,
READ_POS_TIME,
RENAME,
RENAME_TIME,
SEEK,
SEEK_TIME,
SET_WD,
SET_WD_TIME,
WRITE1,
WRITE1_TIME,
WRITE,
WRITE_TIME,
WRITE_CLOSE,
WRITE_CLOSE_TIME,
}
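// For example: a successful mkdirs() call is expected to increment Counter.MKDIRS by one and
// add the call's elapsed time to Counter.MKDIRS_TIME; the exact time unit is defined at the
// increment sites rather than here.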
/**
* A predicate that processes individual directory paths and evaluates the conditions set in
* fs.gs.parent.timestamp.update.enable, fs.gs.parent.timestamp.update.substrings.includes, and
* fs.gs.parent.timestamp.update.substrings.excludes to determine if a path should be ignored
* when running directory timestamp updates. If no match is found in either include or
* exclude and updates are enabled, the directory timestamp will be updated.
*/
public static class ParentTimestampUpdateIncludePredicate implements Predicate<String> {
/**
* Create a new ParentTimestampUpdateIncludePredicate from the passed Hadoop configuration
* object.
*/
public static Predicate create(Configuration config) {
boolean enableDirectoryTimestampUpdating = config.getBoolean(
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
enableDirectoryTimestampUpdating);
String includedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY, includedParentPaths);
List<String> splitIncludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(includedParentPaths);
String excludedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY, excludedParentPaths);
List<String> splitExcludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(excludedParentPaths);
return new ParentTimestampUpdateIncludePredicate(
enableDirectoryTimestampUpdating, splitIncludedParentPaths, splitExcludedParentPaths);
}
// Include and exclude lists are intended to be small N and checked relatively
// infrequently. If that ceases to be the case, consider Aho-Corasick or similar matching
// algorithms.
private final List<String> includeSubstrings;
private final List<String> excludeSubstrings;
private final boolean enableTimestampUpdates;
public ParentTimestampUpdateIncludePredicate(
boolean enableTimestampUpdates,
List<String> includeSubstrings,
List<String> excludeSubstrings) {
this.includeSubstrings = includeSubstrings;
this.excludeSubstrings = excludeSubstrings;
this.enableTimestampUpdates = enableTimestampUpdates;
}
/**
* Determine whether a directory's timestamp should be updated.
* @return True if the directory timestamp should be updated; false if it should be left
* unchanged.
*/
@Override
public boolean apply(String path) {
if (!enableTimestampUpdates) {
LOG.debug("Timestamp updating disabled. Not updating path {}", path);
return false;
}
for (String include : includeSubstrings) {
if (path.contains(include)) {
LOG.debug(
"Path {} matched included path {}. Updating timestamps.", path, include);
return true;
}
}
for (String exclude : excludeSubstrings) {
if (path.contains(exclude)) {
LOG.debug(
"Path {} matched excluded path {}. Not updating timestamps.", path, exclude);
return false;
}
}
return true;
}
}
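// Worked example (values are placeholders): given includeSubstrings = ["/done"] and
// excludeSubstrings = ["/"], apply("gs://bucket/logs/done") returns true (the include match
// is checked first and wins), while apply("gs://bucket/logs/other") returns false (the
// exclude "/" matches); a path matching neither list would return true.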
/**
* Constructs an instance of GoogleHadoopFileSystemBase; the internal
* GoogleCloudStorageFileSystem will be set up with config settings when initialize() is called.
*/
public GoogleHadoopFileSystemBase() {
}
/**
* Constructs an instance of GoogleHadoopFileSystemBase using the provided
* GoogleCloudStorageFileSystem; initialize() will not re-initialize it.
*/
public GoogleHadoopFileSystemBase(GoogleCloudStorageFileSystem gcsfs) {
Preconditions.checkArgument(gcsfs != null, "gcsfs must not be null");
this.gcsfs = gcsfs;
}
/**
* Returns an unqualified path without any leading slash, relative to the filesystem root,
* which serves as the home directory of the current user; see {@code getHomeDirectory} for
* a description of what the home directory means.
*/
protected abstract String getHomeDirectorySubpath();
/**
* Gets the Hadoop path corresponding to the given GCS path.
*
* @param gcsPath Fully-qualified GCS path, of the form gs://<bucket>/<object>