/*
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import static com.google.cloud.hadoop.util.RequesterPaysOptions.REQUESTER_PAYS_MODE_DEFAULT;
import com.google.api.client.auth.oauth2.Credential;
import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.FileInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageItemInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions.Fadvise;
import com.google.cloud.hadoop.gcsio.PathCodec;
import com.google.cloud.hadoop.gcsio.PerformanceCachingGoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.cloud.hadoop.util.AccessTokenProvider;
import com.google.cloud.hadoop.util.AccessTokenProviderClassFromConfigFactory;
import com.google.cloud.hadoop.util.AsyncWriteChannelOptions;
import com.google.cloud.hadoop.util.CredentialFactory;
import com.google.cloud.hadoop.util.CredentialFromAccessTokenProviderClassFactory;
import com.google.cloud.hadoop.util.EntriesCredentialConfiguration;
import com.google.cloud.hadoop.util.HadoopCredentialConfiguration;
import com.google.cloud.hadoop.util.HadoopVersionInfo;
import com.google.cloud.hadoop.util.HttpTransportFactory;
import com.google.cloud.hadoop.util.PropertyUtil;
import com.google.cloud.hadoop.util.RequesterPaysOptions;
import com.google.cloud.hadoop.util.RequesterPaysOptions.RequesterPaysMode;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.file.DirectoryNotEmptyException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobPattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class provides a Hadoop compatible File System on top of Google Cloud Storage (GCS).
*
* <p>It is implemented as a thin abstraction layer on top of GCS. The layer hides any specific
* characteristics of the underlying store and exposes FileSystem interface understood by the Hadoop
* engine.
*
* <p>Users interact with the files in the storage using fully qualified URIs. The file system
* exposed by this class is identified using the 'gs' scheme. For example, {@code
* gs://dir1/dir2/file1.txt}.
*
* <p>This implementation translates paths between hadoop Path and GCS URI with the convention that
* the Hadoop root directly corresponds to the GCS "root", e.g. gs:/. This is convenient for many
* reasons, such as data portability and close equivalence to gsutil paths, but imposes certain
* inherited constraints, such as files not being allowed in root (only 'directories' can be placed
* in root), and directory names inside root have a more limited set of allowed characters.
*
* <p>One of the main goals of this implementation is to maintain compatibility with the behavior
* of the HDFS implementation when accessed through the FileSystem interface. The HDFS
* implementation is not very consistent about the cases when it throws versus the cases when
* methods return false. We run GHFS tests and HDFS tests against the same test data and use that
* as a guide to decide whether to throw or to return false.
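*
* <p>Illustrative usage sketch (bucket and object names are hypothetical; assumes the connector
* is on the classpath and credentials are configured):
*
* <pre>{@code
* Configuration conf = new Configuration();
* Path file = new Path("gs://my-bucket/dir/file.txt");
* FileSystem fs = file.getFileSystem(conf);
* try (FSDataInputStream in = fs.open(file)) {
*   // read data ...
* }
* }</pre>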
*/
public abstract class GoogleHadoopFileSystemBase extends GoogleHadoopFileSystemBaseSpecific
implements FileSystemDescriptor {
/** Logger. */
public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystemBase.class);
/** Default value of replication factor. */
public static final short REPLICATION_FACTOR_DEFAULT = 3;
/** Splitter for list values stored in a single configuration value */
private static final Splitter CONFIGURATION_SPLITTER = Splitter.on(',');
// -----------------------------------------------------------------
// Configuration settings.
// -----------------------------------------------------------------
/**
* Key for the permissions that we report a file or directory to have. Can either be octal or
* symbolic mode accepted by {@link FsPermission#FsPermission(String)}
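*
* <p>Illustrative (hypothetical value; octal, or a symbolic mode such as "rwxr-xr-x"):
*
* <pre>{@code
* conf.set(PERMISSIONS_TO_REPORT_KEY, "777");
* }</pre>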
*/
public static final String PERMISSIONS_TO_REPORT_KEY = "fs.gs.reported.permissions";
/**
* Default value for the permissions that we report a file or directory to have. Note: We do not
* really support file/dir permissions but we need to report some permission value when Hadoop
* calls getFileStatus(). A MapReduce job fails if we report permissions more relaxed than the
* value below and this is the default File System.
*/
public static final String PERMISSIONS_TO_REPORT_DEFAULT = "700";
/** Configuration key for setting IO buffer size. */
// TODO(user): rename the following to indicate that it is read buffer size.
public static final String BUFFERSIZE_KEY = "fs.gs.io.buffersize";
/**
* Hadoop passes 4096 bytes as buffer size, which causes poor performance. Default value of {@link
* #BUFFERSIZE_KEY}.
*/
public static final int BUFFERSIZE_DEFAULT = 8 * 1024 * 1024;
/** Configuration key for setting write buffer size. */
public static final String WRITE_BUFFERSIZE_KEY = "fs.gs.io.buffersize.write";
/** Default value of {@link #WRITE_BUFFERSIZE_KEY}. */
// Chunk size, etc.: get the following value from the GCSWC class in a better way. For now, we
// hard-code it to a known good value.
public static final int WRITE_BUFFERSIZE_DEFAULT = 64 * 1024 * 1024;
/** Configuration key for default block size of a file. */
public static final String BLOCK_SIZE_KEY = "fs.gs.block.size";
/** Default value of {@link #BLOCK_SIZE_KEY}. */
public static final int BLOCK_SIZE_DEFAULT = 64 * 1024 * 1024;
/** Prefix to use for common authentication keys. */
public static final String AUTHENTICATION_PREFIX = "fs.gs";
/**
* Configuration key for enabling GCE service account authentication. This key is deprecated. See
* {@link HadoopCredentialConfiguration} for current key names.
*/
public static final String ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY =
"fs.gs.enable.service.account.auth";
/**
* Configuration key specifying the email address of the service-account with which to
* authenticate. Only required if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} is true AND we're
* using fs.gs.service.account.auth.keyfile to authenticate with a private keyfile. NB: Once GCE
* supports setting multiple service account email addresses for metadata auth, this key will also
* be used in the metadata auth flow. This key is deprecated. See {@link
* HadoopCredentialConfiguration} for current key names.
*/
public static final String SERVICE_ACCOUNT_AUTH_EMAIL_KEY = "fs.gs.service.account.auth.email";
/**
* Configuration key specifying local file containing a service-account private .p12 keyfile. Only
* used if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} is true; if provided, the keyfile will be
* used for service-account authentication. Otherwise, it is assumed that we are on a GCE VM with
* metadata-authentication for service-accounts enabled, and the metadata server will be used
* instead. Default value: none. This key is deprecated. See {@link HadoopCredentialConfiguration}
* for current key names.
*/
public static final String SERVICE_ACCOUNT_AUTH_KEYFILE_KEY =
"fs.gs.service.account.auth.keyfile";
/** Configuration key for GCS project ID. Default value: none */
public static final String GCS_PROJECT_ID_KEY = "fs.gs.project.id";
/** Configuration key for GCS Requester Pays mode. Default value: "DISABLED" */
public static final String GCS_REQUESTER_PAYS_MODE_KEY = "fs.gs.requester.pays.mode";
/** Configuration key for GCS Requester Pays Project ID. Default value: none */
public static final String GCS_REQUESTER_PAYS_PROJECT_ID_KEY = "fs.gs.requester.pays.project.id";
/** Configuration key for GCS Requester Pays Buckets. Default value: none */
public static final String GCS_REQUESTER_PAYS_BUCKETS_KEY = "fs.gs.requester.pays.buckets";
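// Illustrative requester-pays setup (hypothetical project/bucket names; the mode string is
// assumed to name a RequesterPaysOptions.RequesterPaysMode constant):
//   conf.set(GCS_REQUESTER_PAYS_MODE_KEY, "CUSTOM");
//   conf.set(GCS_REQUESTER_PAYS_PROJECT_ID_KEY, "my-billing-project");
//   conf.set(GCS_REQUESTER_PAYS_BUCKETS_KEY, "bucket-a,bucket-b");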
/**
* Configuration key for GCS client ID. Required if {@link #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY}
* == false. Default value: none. This key is deprecated. See {@link HadoopCredentialConfiguration}
* for current key names.
*/
public static final String GCS_CLIENT_ID_KEY = "fs.gs.client.id";
/**
* Configuration key for GCS client secret. Required if {@link
* #ENABLE_GCE_SERVICE_ACCOUNT_AUTH_KEY} == false. Default value: none. This key is deprecated. See
* {@link HadoopCredentialConfiguration} for current key names.
*/
public static final String GCS_CLIENT_SECRET_KEY = "fs.gs.client.secret";
/**
* Configuration key for the system bucket name. It is a fallback for the rootBucket of
* GoogleHadoopFileSystem in gs:///path URIs. Default value: none. This key is deprecated. Always
* initialize the FileSystem with a bucket.
*/
public static final String GCS_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket";
/**
* Configuration key for flag to indicate whether system bucket should be created if it does not
* exist. This key is deprecated. See {@link #GCS_SYSTEM_BUCKET_KEY}.
*/
public static final String GCS_CREATE_SYSTEM_BUCKET_KEY = "fs.gs.system.bucket.create";
/** Default value of {@link #GCS_CREATE_SYSTEM_BUCKET_KEY}. */
public static final boolean GCS_CREATE_SYSTEM_BUCKET_DEFAULT = true;
/** Configuration key for initial working directory of a GHFS instance. Default value: '/' */
public static final String GCS_WORKING_DIRECTORY_KEY = "fs.gs.working.dir";
/**
* Configuration key for setting 250GB upper limit on file size to gain higher write throughput.
*/
// TODO(user): remove it once blobstore supports high throughput without limiting size.
public static final String GCS_FILE_SIZE_LIMIT_250GB = "fs.gs.file.size.limit.250gb";
/** Default value of {@link #GCS_FILE_SIZE_LIMIT_250GB}. */
public static final boolean GCS_FILE_SIZE_LIMIT_250GB_DEFAULT = false;
/** Configuration key for marker file pattern. Default value: none */
public static final String GCS_MARKER_FILE_PATTERN_KEY = "fs.gs.marker.file.pattern";
/**
* Configuration key for using a local item cache to supplement GCS API "getFile" results. This
* provides faster access to recently queried data. Because the data is cached, modifications made
* outside of this instance may not be immediately reflected. The performance cache can be used in
* conjunction with other caching options.
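*
* <p>Illustrative sketch (values are examples):
*
* <pre>{@code
* conf.setBoolean(GCS_ENABLE_PERFORMANCE_CACHE_KEY, true);
* conf.setLong(GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY, 5_000);
* }</pre>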
*/
public static final String GCS_ENABLE_PERFORMANCE_CACHE_KEY = "fs.gs.performance.cache.enable";
/** Default value for {@link #GCS_ENABLE_PERFORMANCE_CACHE_KEY}. */
public static final boolean GCS_ENABLE_PERFORMANCE_CACHE_DEFAULT = false;
/**
* Configuration key for maximum number of milliseconds a GoogleCloudStorageItemInfo will remain
* "valid" in the performance cache before it's invalidated.
*/
public static final String GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY =
"fs.gs.performance.cache.max.entry.age.ms";
/** Default value for {@link #GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_KEY}. */
public static final long GCS_PERFORMANCE_CACHE_MAX_ENTRY_AGE_MILLIS_DEFAULT =
PerformanceCachingGoogleCloudStorageOptions.MAX_ENTRY_AGE_MILLIS_DEFAULT;
/** Configuration key for whether or not to enable list caching for the performance cache. */
public static final String GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_KEY =
"fs.gs.performance.cache.list.caching.enable";
/** Default value for {@link #GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_KEY}. */
public static final boolean GCS_PERFORMANCE_CACHE_LIST_CACHING_ENABLE_DEFAULT =
PerformanceCachingGoogleCloudStorageOptions.LIST_CACHING_ENABLED;
/**
* Configuration key for whether or not we should update timestamps for parent directories when we
* create new files in them.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY =
"fs.gs.parent.timestamp.update.enable";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY}. */
public static final boolean GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT = true;
/**
* Configuration key containing a comma-separated list of sub-strings that when matched will cause
* a particular directory to not have its modification timestamp updated. Includes take precedence
* over excludes.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.excludes";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY}. */
public static final String GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT = "/";
/** Configuration key for the MR intermediate done dir. */
public static final String MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY =
"mapreduce.jobhistory.intermediate-done-dir";
/** Configuration key of the MR done directory. */
public static final String MR_JOB_HISTORY_DONE_DIR_KEY = "mapreduce.jobhistory.done-dir";
/**
* Configuration key containing a comma-separated list of sub-strings that when matched will cause
* a particular directory to have its modification timestamp updated. Includes take precedence
* over excludes.
*/
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY =
"fs.gs.parent.timestamp.update.substrings.includes";
/** Default value for {@link #GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY}. */
public static final String GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT =
String.format(
"${%s},${%s}", MR_JOB_HISTORY_INTERMEDIATE_DONE_DIR_KEY, MR_JOB_HISTORY_DONE_DIR_KEY);
/**
* Configuration key for enabling automatic repair of implicit directories whenever detected
* inside listStatus and globStatus calls, or other methods which may indirectly call listStatus
* and/or globStatus.
*/
public static final String GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.repair.enable";
/** Default value for {@link #GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY}. */
public static final boolean GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT = true;
/** Configuration key for changing the path codec from legacy to 'uri path encoding'. */
public static final String PATH_CODEC_KEY = "fs.gs.path.encoding";
/** Use new URI_ENCODED_PATH_CODEC. */
public static final String PATH_CODEC_USE_URI_ENCODING = "uri-path";
/** Use LEGACY_PATH_CODEC. */
public static final String PATH_CODEC_USE_LEGACY_ENCODING = "legacy";
/** Use the default path codec. */
public static final String PATH_CODEC_DEFAULT = PATH_CODEC_USE_LEGACY_ENCODING;
/**
* Instance value of {@link #GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_KEY} based on the initial
* Configuration.
*/
private boolean enableAutoRepairImplicitDirectories =
GCS_ENABLE_REPAIR_IMPLICIT_DIRECTORIES_DEFAULT;
/**
* Configuration key for enabling automatic inference of implicit directories. If set, we create
* and return in-memory directory objects on the fly when no backing object exists, but we know
* there are files with the same prefix. The ENABLE_REPAIR flag takes precedence over this flag:
* if both are set, the repair is attempted, and only if it fails does the setting of this flag
* kick in.
*/
public static final String GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY =
"fs.gs.implicit.dir.infer.enable";
/** Default value for {@link #GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY}. */
public static final boolean GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT = true;
/**
* Instance value of {@link #GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_KEY} based on the initial
* Configuration.
*/
private boolean enableInferImplicitDirectories = GCS_ENABLE_INFER_IMPLICIT_DIRECTORIES_DEFAULT;
/**
* Configuration key for enabling the use of a large flat listing to pre-populate possible glob
* matches in a single API call before running the core globbing logic in-memory rather than
* sequentially and recursively performing API calls.
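*
* <p>Illustrative: a glob such as the following (hypothetical path) can then be answered from one
* flat listing instead of many recursive list calls:
*
* <pre>{@code
* FileStatus[] parts = fs.globStatus(new Path("gs://bucket/logs/2018-*/part-*"));
* }</pre>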
*/
public static final String GCS_ENABLE_FLAT_GLOB_KEY = "fs.gs.glob.flatlist.enable";
/** Default value for {@link #GCS_ENABLE_FLAT_GLOB_KEY}. */
public static final boolean GCS_ENABLE_FLAT_GLOB_DEFAULT = true;
/**
* Configuration key for enabling the use of marker files during file creation. When running
* non-MR applications that make use of the FileSystem, it is a good idea to enable marker files
* to better mimic HDFS overwrite and locking behavior.
*/
public static final String GCS_ENABLE_MARKER_FILE_CREATION_KEY =
"fs.gs.create.marker.files.enable";
/** Default value for {@link #GCS_ENABLE_MARKER_FILE_CREATION_KEY}. */
public static final boolean GCS_ENABLE_MARKER_FILE_CREATION_DEFAULT = false;
/**
* Configuration key for enabling the use of Rewrite requests for copy operations. Rewrite request
* has the same effect as Copy request, but it can handle moving large objects that may
* potentially timeout a Copy request.
*/
public static final String GCS_ENABLE_COPY_WITH_REWRITE_KEY = "fs.gs.copy.with.rewrite.enable";
/** Default value for {@link #GCS_ENABLE_COPY_WITH_REWRITE_KEY}. */
public static final boolean GCS_ENABLE_COPY_WITH_REWRITE_DEFAULT = false;
/** Configuration key for number of items to return per call to the list* GCS RPCs. */
public static final String GCS_MAX_LIST_ITEMS_PER_CALL = "fs.gs.list.max.items.per.call";
/** Default value for {@link #GCS_MAX_LIST_ITEMS_PER_CALL}. */
public static final long GCS_MAX_LIST_ITEMS_PER_CALL_DEFAULT = 1024;
/** Configuration key for a max number of GCS RPCs in batch request. */
public static final String GCS_MAX_REQUESTS_PER_BATCH = "fs.gs.max.requests.per.batch";
/** Default value for {@link #GCS_MAX_REQUESTS_PER_BATCH}. */
public static final long GCS_MAX_REQUESTS_PER_BATCH_DEFAULT =
GoogleCloudStorageOptions.MAX_REQUESTS_PER_BATCH_DEFAULT;
/**
* Configuration key for the max number of retries for failed HTTP request to GCS. Note that the
* connector will retry *up to* the number of times as specified, using a default
* ExponentialBackOff strategy.
*
* <p>Also, note that this number will only control the number of retries in the low level HTTP
* request implementation.
*/
public static final String GCS_HTTP_MAX_RETRY_KEY = "fs.gs.http.max.retry";
/** Default value for {@link #GCS_HTTP_MAX_RETRY_KEY}. */
public static final int GCS_HTTP_MAX_RETRY_DEFAULT = 10;
/** Configuration key for the connect timeout (in milliseconds) for HTTP requests to GCS. */
public static final String GCS_HTTP_CONNECT_TIMEOUT_KEY = "fs.gs.http.connect-timeout";
/** Default value for {@link #GCS_HTTP_CONNECT_TIMEOUT_KEY}. */
public static final int GCS_HTTP_CONNECT_TIMEOUT_DEFAULT = 20 * 1000;
/** Configuration key for the read timeout (in milliseconds) for HTTP requests to GCS. */
public static final String GCS_HTTP_READ_TIMEOUT_KEY = "fs.gs.http.read-timeout";
/** Default value for {@link #GCS_HTTP_READ_TIMEOUT_KEY}. */
public static final int GCS_HTTP_READ_TIMEOUT_DEFAULT = 20 * 1000;
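// Illustrative HTTP tuning sketch (values are examples, not recommendations):
//   conf.setInt(GCS_HTTP_MAX_RETRY_KEY, 10);
//   conf.setInt(GCS_HTTP_CONNECT_TIMEOUT_KEY, 20 * 1000);
//   conf.setInt(GCS_HTTP_READ_TIMEOUT_KEY, 20 * 1000);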
/**
* Configuration key for setting a proxy for the connector to use to connect to GCS. The proxy
* must be an HTTP proxy of the form "host:port".
*/
public static final String GCS_PROXY_ADDRESS_KEY =
EntriesCredentialConfiguration.PROXY_ADDRESS_KEY;
/** Default to no proxy. */
public static final String GCS_PROXY_ADDRESS_DEFAULT =
EntriesCredentialConfiguration.PROXY_ADDRESS_DEFAULT;
/**
* Configuration key for the name of HttpTransport class to use for connecting to GCS. Must be the
* name of an HttpTransportFactory.HttpTransportType (APACHE or JAVA_NET).
*/
public static final String GCS_HTTP_TRANSPORT_KEY =
EntriesCredentialConfiguration.HTTP_TRANSPORT_KEY;
/** Default to the default specified in HttpTransportFactory. */
public static final String GCS_HTTP_TRANSPORT_DEFAULT =
EntriesCredentialConfiguration.HTTP_TRANSPORT_DEFAULT;
/** Configuration key for adding a suffix to the GHFS application name sent to GCS. */
public static final String GCS_APPLICATION_NAME_SUFFIX_KEY = "fs.gs.application.name.suffix";
/** Default suffix to add to the application name. */
public static final String GCS_APPLICATION_NAME_SUFFIX_DEFAULT = "";
/**
* Configuration key for modifying the maximum amount of time to wait for empty object creation.
*/
public static final String GCS_MAX_WAIT_MILLIS_EMPTY_OBJECT_CREATE_KEY =
"fs.gs.max.wait.for.empty.object.creation.ms";
/** Default to 3 seconds. */
public static final int GCS_MAX_WAIT_MILLIS_EMPTY_OBJECT_CREATE_DEFAULT = 3_000;
/**
* Configuration key for which type of output stream to use; different options may have different
* degrees of support for advanced features like hsync() and different performance
* characteristics. Options:
*
* <p>BASIC: Stream is closest analogue to direct wrapper around low-level HTTP stream into GCS.
*
* <p>SYNCABLE_COMPOSITE: Stream behaves similarly to BASIC when used with basic
* create/write/close patterns, but supports hsync() by creating discrete temporary GCS objects
* which are composed onto the destination object. Has a hard upper limit of number of components
* which can be composed onto the destination object.
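*
* <p>Illustrative sketch (bucket name and payload are hypothetical):
*
* <pre>{@code
* conf.set(GCS_OUTPUTSTREAM_TYPE_KEY, "SYNCABLE_COMPOSITE");
* try (FSDataOutputStream out = fs.create(new Path("gs://bucket/app.log"))) {
*   out.write(payload);
*   out.hsync(); // persists bytes written so far by composing temporary objects
* }
* }</pre>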
*/
public static final String GCS_OUTPUTSTREAM_TYPE_KEY = "fs.gs.outputstream.type";
/** Default value for {@link #GCS_OUTPUTSTREAM_TYPE_KEY}. */
public static final String GCS_OUTPUTSTREAM_TYPE_DEFAULT = "BASIC";
/** Available types for use with {@link #GCS_OUTPUTSTREAM_TYPE_KEY}. */
public static enum OutputStreamType {
BASIC,
SYNCABLE_COMPOSITE
}
/**
* If true, the returned FSDataInputStream from the open(Path) method will hold an internal
* ByteBuffer of size fs.gs.io.buffersize which it pre-fills on each read, and can efficiently
* seek within the internal buffer. Otherwise, calls are delegated straight through to a lower
* level channel and the value of {@link #BUFFERSIZE_KEY} is passed through for the lower-level
* channel to interpret as it sees fit.
*/
public static final String GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY =
"fs.gs.inputstream.internalbuffer.enable";
/** Default value for {@link #GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY}. */
public static final boolean GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_DEFAULT = false;
/**
* If true, input streams will proactively check the "content-encoding" header of underlying
* objects during reads for special handling of cases where content-encoding causes the reported
* object sizes to not match the actual number of read bytes due to the content being decoded
* in-transit; such encoded objects also aren't suitable for splitting or resuming on failure, so
* the underlying channel will restart from byte 0 and discard the requisite number of bytes to
* seek to a desired position or resume in such cases. In general, content-encoded objects are
* *not* well-suited for FileSystem-style access, and will break most of the split computations in
* the Hadoop subsystem anyways. To avoid paying the cost of an extra metadata GET on every single
* opened channel in the usual case where no content-encoded objects are present, it may be
* desirable to set this to 'false'.
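*
* <p>Illustrative opt-out sketch:
*
* <pre>{@code
* conf.setBoolean(GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY, false);
* }</pre>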
*/
public static final String GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY =
"fs.gs.inputstream.support.content.encoding.enable";
/** Default value for {@link #GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY}. */
public static final boolean GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_DEFAULT = true;
/**
* If forward seeks are within this many bytes of the current position, seeks are performed by
* reading and discarding bytes in-place rather than opening a new underlying stream.
*/
public static final String GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY =
"fs.gs.inputstream.inplace.seek.limit";
/** Default value for {@link #GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY}. */
public static final long GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_DEFAULT = 8 * 1024 * 1024L;
/** Tunes reading objects behavior to optimize HTTP GET requests for various use cases. */
public static final String GCS_INPUTSTREAM_FADVISE_KEY = "fs.gs.inputstream.fadvise";
/** Default value for {@link #GCS_INPUTSTREAM_FADVISE_KEY}. */
public static final Fadvise GCS_INPUTSTREAM_FADVISE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_FADVISE;
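// Illustrative read-tuning sketch (values are examples; the fadvise string is assumed to name a
// GoogleCloudStorageReadOptions.Fadvise constant):
//   conf.set(GCS_INPUTSTREAM_FADVISE_KEY, "RANDOM");
//   conf.setLong(GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY, 8 * 1024 * 1024L);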
/**
* Minimum size in bytes of the HTTP Range header set in GCS request when opening new stream to
* read an object.
*/
public static final String GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_KEY =
"fs.gs.inputstream.min.range.request.size";
/** Default value for {@link #GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_KEY}. */
public static final int GCS_INPUTSTREAM_MIN_RANGE_REQUEST_SIZE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_MIN_RANGE_REQUEST_SIZE;
/**
* Size of the object footer that will be prefetched when read channel opened. Footer prefetching
* is disabled if this property is set to 0.
*/
public static final String GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_KEY =
"fs.gs.inputstream.footer.prefetch.size";
/** Default value for {@link #GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_KEY}. */
public static final int GCS_INPUTSTREAM_FOOTER_PREFETCH_SIZE_DEFAULT =
GoogleCloudStorageReadOptions.DEFAULT_FOOTER_PREFETCH_SIZE;
/**
* If true, recursive delete on a path that refers to a GCS bucket itself ('/' for any
* bucket-rooted GoogleHadoopFileSystem) or delete on that path when it's empty will result in
* fully deleting the GCS bucket. If false, any operation that normally would have deleted the
* bucket will be ignored instead. Setting to 'false' preserves the typical behavior of "rm -rf /"
* which translates to deleting everything inside of root, but without clobbering the filesystem
* authority corresponding to that root path in the process.
*/
public static final String GCE_BUCKET_DELETE_ENABLE_KEY = "fs.gs.bucket.delete.enable";
/** Default value for {@link #GCE_BUCKET_DELETE_ENABLE_KEY}. */
public static final boolean GCE_BUCKET_DELETE_ENABLE_DEFAULT = false;
/** Default PathFilter that accepts all paths. */
public static final PathFilter DEFAULT_FILTER =
new PathFilter() {
@Override
public boolean accept(Path path) {
return true;
}
};
/** A resource file containing GCS related build properties. */
public static final String PROPERTIES_FILE = "gcs.properties";
/** The key in the PROPERTIES_FILE that contains the version built. */
public static final String VERSION_PROPERTY = "gcs.connector.version";
/** The version returned when one cannot be found in properties. */
public static final String UNKNOWN_VERSION = "0.0.0";
/** Current version. */
public static final String VERSION;
/** Identifies this version of the GoogleHadoopFileSystemBase library. */
public static final String GHFS_ID;
static {
VERSION =
PropertyUtil.getPropertyOrDefault(
GoogleHadoopFileSystemBase.class, PROPERTIES_FILE, VERSION_PROPERTY, UNKNOWN_VERSION);
LOG.info("GHFS version: {}", VERSION);
GHFS_ID = String.format("GHFS/%s", VERSION);
}
/** Instance value of {@link #GCS_ENABLE_FLAT_GLOB_KEY} based on the initial Configuration. */
private boolean enableFlatGlob = GCS_ENABLE_FLAT_GLOB_DEFAULT;
/** The URI the File System is passed in initialize. */
protected URI initUri;
/**
* The retrieved configuration value for {@link #GCS_SYSTEM_BUCKET_KEY}. Used as a fallback for a
* root bucket, when required.
*/
@Deprecated protected String systemBucket;
/** Underlying GCS file system object. */
protected GoogleCloudStorageFileSystem gcsfs;
/**
* Current working directory; overridden in initialize() if {@link #GCS_WORKING_DIRECTORY_KEY} is
* set.
*/
private Path workingDirectory;
/** Buffer size to use instead of what Hadoop passed. */
private int bufferSizeOverride = BUFFERSIZE_DEFAULT;
/**
* Default block size. Note that this is the size that is reported to Hadoop FS clients. It does
* not modify the actual block size of an underlying GCS object, because GCS JSON API does not
* allow modifying or querying the value. Modifying this value allows one to control how many
* mappers are used to process a given file.
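*
* <p>Illustrative: at the 64 MiB default, a 1 GiB file is reported as 16 blocks (hence typically
* 16 input splits); e.g. {@code conf.setLong(BLOCK_SIZE_KEY, 128 * 1024 * 1024)} would halve that.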
*/
protected long defaultBlockSize = BLOCK_SIZE_DEFAULT;
/** The fixed reported permission of all files. */
private FsPermission reportedPermissions;
/** Map of counter values */
protected final ImmutableMap<Counter, AtomicLong> counters = createCounterMap();
protected ImmutableMap<Counter, AtomicLong> createCounterMap() {
EnumMap<Counter, AtomicLong> countersMap = new EnumMap<>(Counter.class);
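// Pre-populate a zeroed AtomicLong for every counter so lookups in the immutable map always succeed.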
for (Counter counter : ALL_COUNTERS) {
countersMap.put(counter, new AtomicLong());
}
return Maps.immutableEnumMap(countersMap);
}
/**
* Behavior of listStatus when a path is not found.
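*
* <p>Illustrative sketch (path is hypothetical):
*
* <pre>{@code
* // Hadoop1 returns null for a missing path; Hadoop2 throws FileNotFoundException.
* FileStatus[] result = ListStatusFileNotFoundBehavior.get().handle("gs://bucket/missing");
* }</pre>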
*/
protected enum ListStatusFileNotFoundBehavior {
Hadoop1 {
@Override
public FileStatus[] handle(String path) throws IOException {
return null;
}
},
Hadoop2 {
@Override
public FileStatus[] handle(String path) throws IOException {
throw new FileNotFoundException(String.format("Path '%s' does not exist.", path));
}
};
/**
* Perform version specific handling for a missing path.
* @param path The missing path
*/
public abstract FileStatus[] handle(String path) throws IOException;
/**
* Get the ListStatusFileNotFoundBehavior for the currently running Hadoop version.
*/
public static ListStatusFileNotFoundBehavior get() {
return get(HadoopVersionInfo.getInstance());
}
/**
* Get the ListStatusFileNotFoundBehavior for the given Hadoop version.
* @param hadoopVersionInfo The hadoop version.
*/
public static ListStatusFileNotFoundBehavior get(HadoopVersionInfo hadoopVersionInfo) {
if (hadoopVersionInfo.isGreaterThan(2, 0)
|| hadoopVersionInfo.isEqualTo(2, 0)
|| hadoopVersionInfo.isEqualTo(0, 23)) {
return Hadoop2;
}
return Hadoop1;
}
}
// Behavior when a path is not found in listStatus()
protected ListStatusFileNotFoundBehavior listStatusFileNotFoundBehavior =
ListStatusFileNotFoundBehavior.get();
@VisibleForTesting
protected void setListStatusFileNotFoundBehavior(ListStatusFileNotFoundBehavior behavior) {
this.listStatusFileNotFoundBehavior = behavior;
}
/**
* Defines names of counters we track for each operation.
*
* There are two types of counters:
* -- METHOD_NAME : Number of successful invocations of method METHOD.
* -- METHOD_NAME_TIME : Total inclusive time spent in method METHOD.
*/
public enum Counter {
APPEND,
APPEND_TIME,
CREATE,
CREATE_TIME,
DELETE,
DELETE_TIME,
GET_FILE_STATUS,
GET_FILE_STATUS_TIME,
INIT,
INIT_TIME,
INPUT_STREAM,
INPUT_STREAM_TIME,
LIST_STATUS,
LIST_STATUS_TIME,
MKDIRS,
MKDIRS_TIME,
OPEN,
OPEN_TIME,
OUTPUT_STREAM,
OUTPUT_STREAM_TIME,
READ1,
READ1_TIME,
READ,
READ_TIME,
READ_FROM_CHANNEL,
READ_FROM_CHANNEL_TIME,
READ_CLOSE,
READ_CLOSE_TIME,
READ_POS,
READ_POS_TIME,
RENAME,
RENAME_TIME,
SEEK,
SEEK_TIME,
SET_WD,
SET_WD_TIME,
WRITE1,
WRITE1_TIME,
WRITE,
WRITE_TIME,
WRITE_CLOSE,
WRITE_CLOSE_TIME,
}
/**
* Set of all counters.
*
* <p>It is used for performance optimization instead of `Counter.values`, because
* `Counter.values` returns new array on each invocation.
*/
private static final ImmutableSet<Counter> ALL_COUNTERS =
Sets.immutableEnumSet(EnumSet.allOf(Counter.class));
/**
* A predicate that processes individual directory paths and evaluates the conditions set in
* fs.gs.parent.timestamp.update.enable, fs.gs.parent.timestamp.update.substrings.includes and
* fs.gs.parent.timestamp.update.substrings.excludes to determine if a path should be ignored
* when running directory timestamp updates. If no match is found in either include or
* exclude and updates are enabled, the directory timestamp will be updated.
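*
* <p>Illustrative sketch (substrings are hypothetical):
*
* <pre>{@code
* ParentTimestampUpdateIncludePredicate predicate =
*     new ParentTimestampUpdateIncludePredicate(
*         true, // enableTimestampUpdates
*         ImmutableList.of("/done-dir"), // include substrings
*         ImmutableList.of("/")); // exclude substrings
* boolean update = predicate.shouldUpdateTimestamp(URI.create("gs://bucket/done-dir/job1"));
* }</pre>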
*/
public static class ParentTimestampUpdateIncludePredicate
implements GoogleCloudStorageFileSystemOptions.TimestampUpdatePredicate {
/**
* Create a new ParentTimestampUpdateIncludePredicate from the passed Hadoop configuration
* object.
*/
public static ParentTimestampUpdateIncludePredicate create(Configuration config) {
boolean enableDirectoryTimestampUpdating = config.getBoolean(
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_ENABLE_KEY,
enableDirectoryTimestampUpdating);
String includedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_INCLUDES_KEY, includedParentPaths);
List<String> splitIncludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(includedParentPaths);
String excludedParentPaths = config.get(
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY,
GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_DEFAULT);
LOG.debug("{} = {}", GCS_PARENT_TIMESTAMP_UPDATE_EXCLUDES_KEY, excludedParentPaths);
List<String> splitExcludedParentPaths =
CONFIGURATION_SPLITTER.splitToList(excludedParentPaths);
return new ParentTimestampUpdateIncludePredicate(
enableDirectoryTimestampUpdating,
splitIncludedParentPaths,
splitExcludedParentPaths);
}
// Include and exclude lists are intended to be small N and checked relatively
// infrequently. If that becomes not that case, consider Aho-Corasick or similar matching
// algorithms.
private final List<String> includeSubstrings;
private final List<String> excludeSubstrings;
private final boolean enableTimestampUpdates;
public ParentTimestampUpdateIncludePredicate(
boolean enableTimestampUpdates,
List<String> includeSubstrings,
List<String> excludeSubstrings) {
this.includeSubstrings = includeSubstrings;
this.excludeSubstrings = excludeSubstrings;
this.enableTimestampUpdates = enableTimestampUpdates;
}
/**
* Determine if updating directory timestamps should be ignored.
* @return True if the directory timestamp should not be updated. False to indicate it should
* be updated.
*/
@Override
public boolean shouldUpdateTimestamp(URI uri) {
if (!enableTimestampUpdates) {
LOG.debug("Timestamp updating disabled. Not updating uri {}", uri);
return false;
}
for (String include : includeSubstrings) {
if (uri.toString().contains(include)) {
LOG.debug("Path {} matched included path {}. Updating timestamps.", uri, include);
return true;
}
}
for (String exclude : excludeSubstrings) {
if (uri.toString().contains(exclude)) {
LOG.debug("Path {} matched excluded path {}. Not updating timestamps.", uri, exclude);
return false;
}
}
return true;
}
}
/**
* Constructs an instance of GoogleHadoopFileSystemBase; the internal
* GoogleCloudStorageFileSystem will be set up with config settings when initialize() is called.
*/
public GoogleHadoopFileSystemBase() {
}
/**
* Constructs an instance of GoogleHadoopFileSystemBase using the provided
* GoogleCloudStorageFileSystem; initialize() will not re-initialize it.
*/
public GoogleHadoopFileSystemBase(GoogleCloudStorageFileSystem gcsfs) {
Preconditions.checkArgument(gcsfs != null, "gcsfs must not be null");
this.gcsfs = gcsfs;
}
/**
* Returns an unqualified path without any leading slash, relative to the filesystem root,
* which serves as the home directory of the current user; see {@code getHomeDirectory} for
* a description of what the home directory means.
*/
protected abstract String getHomeDirectorySubpath();
/**
* Gets Hadoop path corresponding to the given GCS path.
*
* @param gcsPath Fully-qualified GCS path, of the form gs://<bucket>/<object>