com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase

An implementation of org.apache.hadoop.fs.FileSystem targeting Google Cloud Storage.
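As a rough usage sketch (not part of the connector source), the snippet below shows how a Hadoop client might read objects through the 'gs' scheme once the connector jar is on the classpath; the bucket and object names are placeholders, and the concrete FileSystem resolved at runtime is the connector's subclass of GoogleHadoopFileSystemBase.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GcsConnectorUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Resolves to the GCS connector's FileSystem implementation when it is
    // registered for the 'gs' scheme; "example-bucket" is a placeholder.
    FileSystem fs = FileSystem.get(URI.create("gs://example-bucket/"), conf);

    // List a directory and read a file through the ordinary Hadoop FileSystem API.
    for (FileStatus status : fs.listStatus(new Path("gs://example-bucket/dir"))) {
      System.out.println(status.getPath() + " " + status.getLen());
    }
    try (FSDataInputStream in = fs.open(new Path("gs://example-bucket/dir/file.txt"))) {
      byte[] buf = new byte[4096];
      int n = in.read(buf);
      System.out.println("read " + n + " bytes");
    }
  }
}

The class source follows.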
/*
 * Copyright 2013 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.hadoop.fs.gcs;

import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.OutputStreamType.FLUSHABLE_COMPOSITE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.BLOCK_SIZE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.CONFIG_KEY_PREFIXES;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.DELEGATION_TOKEN_BINDING_CLASS;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_CONFIG_PREFIX;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_FILE_CHECKSUM_TYPE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_GLOB_ALGORITHM;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_LAZY_INITIALIZATION_ENABLE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_OPERATION_TRACE_LOG_ENABLE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL_MS;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_OUTPUT_STREAM_TYPE;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.GCS_WORKING_DIRECTORY;
import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.PERMISSIONS_TO_REPORT;
import static com.google.cloud.hadoop.gcsio.CreateFileOptions.DEFAULT_OVERWRITE;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.GROUP_IMPERSONATION_SERVICE_ACCOUNT_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.IMPERSONATION_SERVICE_ACCOUNT_SUFFIX;
import static com.google.cloud.hadoop.util.HadoopCredentialConfiguration.USER_IMPERSONATION_SERVICE_ACCOUNT_SUFFIX;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.flogger.LazyArgs.lazy;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.concurrent.Executors.newFixedThreadPool;

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.http.HttpTransport;
import com.google.cloud.hadoop.fs.gcs.auth.GcsDelegationTokens;
import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.CreateObjectOptions;
import com.google.cloud.hadoop.gcsio.FileInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorage;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorage.ListPage;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageItemInfo;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions;
import com.google.cloud.hadoop.gcsio.ListFileOptions;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.cloud.hadoop.gcsio.UpdatableItemInfo;
import com.google.cloud.hadoop.gcsio.UriPaths;
import com.google.cloud.hadoop.util.AccessTokenProvider;
import com.google.cloud.hadoop.util.AccessTokenProvider.AccessTokenType;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.cloud.hadoop.util.CredentialFactory;
import com.google.cloud.hadoop.util.CredentialFactory.CredentialHttpRetryInitializer;
import com.google.cloud.hadoop.util.CredentialFromAccessTokenProviderClassFactory;
import com.google.cloud.hadoop.util.GoogleCloudStorageEventBus;
import com.google.cloud.hadoop.util.GoogleCredentialWithIamAccessToken;
import com.google.cloud.hadoop.util.HadoopCredentialConfiguration;
import com.google.cloud.hadoop.util.HttpTransportFactory;
import com.google.cloud.hadoop.util.ITraceFactory;
import com.google.cloud.hadoop.util.PropertyUtil;
import com.google.cloud.hadoop.util.TraceFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Ascii;
import com.google.common.base.Suppliers;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.flogger.GoogleLogger;
import com.google.common.io.BaseEncoding;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.file.DirectoryNotEmptyException;
import java.security.GeneralSecurityException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobPattern;
import org.apache.hadoop.fs.GlobalStorageStatistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.StorageStatistics;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.ProviderUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;

/**
 * This class provides a Hadoop compatible File System on top of Google Cloud Storage (GCS).
 *
 * <p>It is implemented as a thin abstraction layer on top of GCS. The layer hides any specific
 * characteristics of the underlying store and exposes FileSystem interface understood by the
 * Hadoop engine.
 *
 * <p>Users interact with the files in the storage using fully qualified URIs. The file system
 * exposed by this class is identified using the 'gs' scheme. For example, {@code
 * gs://dir1/dir2/file1.txt}.
 *
 * <p>This implementation translates paths between hadoop Path and GCS URI with the convention
 * that the Hadoop root directly corresponds to the GCS "root", e.g. gs:/. This is convenient for
 * many reasons, such as data portability and close equivalence to gsutil paths, but imposes
 * certain inherited constraints, such as files not being allowed in root (only 'directories' can
 * be placed in root), and directory names inside root have a more limited set of allowed
 * characters.
 *
 * <p>One of the main goals of this implementation is to maintain compatibility with behavior of
 * HDFS implementation when accessed through FileSystem interface. HDFS implementation is not very
 * consistent about the cases when it throws versus the cases when methods return false. We run
 * GHFS tests and HDFS tests against the same test data and use that as a guide to decide whether
 * to throw or to return false.
 */
public abstract class GoogleHadoopFileSystemBase extends FileSystem
    implements FileSystemDescriptor {

  private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

  static final String SCHEME = GoogleCloudStorageFileSystem.SCHEME;

  // Request only object fields that are used in Hadoop FileStatus:
  // https://cloud.google.com/storage/docs/json_api/v1/objects#resource-representations
  private static final String OBJECT_FIELDS = "bucket,name,size,updated";

  private static final ListFileOptions LIST_OPTIONS =
      ListFileOptions.DEFAULT.toBuilder().setFields(OBJECT_FIELDS).build();

  /**
   * Available types for use with {@link
   * GoogleHadoopFileSystemConfiguration#GCS_OUTPUT_STREAM_TYPE}.
   */
  public enum OutputStreamType {
    BASIC,
    FLUSHABLE_COMPOSITE,
    SYNCABLE_COMPOSITE
  }

  /**
   * Available GCS checksum types for use with {@link
   * GoogleHadoopFileSystemConfiguration#GCS_FILE_CHECKSUM_TYPE}.
   */
  public enum GcsFileChecksumType {
    NONE(null, 0),
    CRC32C("COMPOSITE-CRC32C", 4),
    MD5("MD5", 16);

    private final String algorithmName;
    private final int byteLength;

    GcsFileChecksumType(String algorithmName, int byteLength) {
      this.algorithmName = algorithmName;
      this.byteLength = byteLength;
    }

    public String getAlgorithmName() {
      return algorithmName;
    }

    public int getByteLength() {
      return byteLength;
    }
  }

  /**
   * Available GCS glob algorithms for use with {@link
   * GoogleHadoopFileSystemConfiguration#GCS_GLOB_ALGORITHM}.
   */
  public enum GlobAlgorithm {
    CONCURRENT,
    DEFAULT,
    FLAT
  }

  /** Default value of replication factor. */
  public static final short REPLICATION_FACTOR_DEFAULT = 3;

  /** Default PathFilter that accepts all paths. */
  public static final PathFilter DEFAULT_FILTER = path -> true;

  /** A resource file containing GCS related build properties. */
  public static final String PROPERTIES_FILE = "gcs.properties";

  /** The key in the PROPERTIES_FILE that contains the version built. */
  public static final String VERSION_PROPERTY = "gcs.connector.version";

  /** The version returned when one cannot be found in properties. */
  public static final String UNKNOWN_VERSION = "0.0.0";

  /** Current version. */
  public static final String VERSION;

  /** Identifies this version of the GoogleHadoopFileSystemBase library. */
  public static final String GHFS_ID;

  static {
    VERSION =
        PropertyUtil.getPropertyOrDefault(
            GoogleHadoopFileSystemBase.class, PROPERTIES_FILE, VERSION_PROPERTY, UNKNOWN_VERSION);
    logger.atFine().log("GHFS version: %s", VERSION);
    GHFS_ID = String.format("GHFS/%s", VERSION);
  }

  private static final String XATTR_KEY_PREFIX = "GHFS_XATTR_";

  // Use empty array as null value because GCS API already uses null value to remove metadata key
  private static final byte[] XATTR_NULL_VALUE = new byte[0];

  private static final ThreadFactory DAEMON_THREAD_FACTORY =
      new ThreadFactoryBuilder().setNameFormat("ghfs-thread-%d").setDaemon(true).build();

  @VisibleForTesting GlobAlgorithm globAlgorithm = GCS_GLOB_ALGORITHM.getDefault();

  private GcsFileChecksumType checksumType = GCS_FILE_CHECKSUM_TYPE.getDefault();

  /** The URI the File System is passed in initialize.
*/ protected URI initUri; /** Delegation token support */ protected GcsDelegationTokens delegationTokens = null; /** Underlying GCS file system object. */ private Supplier gcsFsSupplier; private boolean gcsFsInitialized = false; /** * Current working directory; overridden in initialize() if {@link * GoogleHadoopFileSystemConfiguration#GCS_WORKING_DIRECTORY} is set. */ private Path workingDirectory; /** * Default block size. Note that this is the size that is reported to Hadoop FS clients. It does * not modify the actual block size of an underlying GCS object, because GCS JSON API does not * allow modifying or querying the value. Modifying this value allows one to control how many * mappers are used to process a given file. */ protected long defaultBlockSize = BLOCK_SIZE.getDefault(); /** The fixed reported permission of all files. */ private FsPermission reportedPermissions; private ITraceFactory traceFactory = TraceFactory.get(/* isEnabled */ false); ITraceFactory getTraceFactory() { return this.traceFactory; } private final GhfsStorageStatistics storageStatistics; /** * GCS {@link FileChecksum} which takes constructor parameters to define the return values of the * various abstract methods of {@link FileChecksum}. */ private static class GcsFileChecksum extends FileChecksum { private final GcsFileChecksumType checksumType; private final byte[] bytes; public GcsFileChecksum(GcsFileChecksumType checksumType, byte[] bytes) { this.checksumType = checksumType; this.bytes = bytes; checkState( bytes == null || bytes.length == checksumType.getByteLength(), "Checksum value length (%s) should be equal to the algorithm byte length (%s)", checksumType.getByteLength(), bytes.length); } @Override public String getAlgorithmName() { return checksumType.getAlgorithmName(); } @Override public int getLength() { return checksumType.getByteLength(); } @Override public byte[] getBytes() { return bytes; } @Override public void readFields(DataInput in) throws IOException { in.readFully(bytes); } @Override public void write(DataOutput out) throws IOException { out.write(bytes); } @Override public String toString() { return String.format( "%s: %s", getAlgorithmName(), bytes == null ? null : BaseEncoding.base16().encode(bytes)); } } /** * Constructs an instance of GoogleHadoopFileSystemBase; the internal {@link * GoogleCloudStorageFileSystem} will be set up with config settings when initialize() is called. */ public GoogleHadoopFileSystemBase() { // Inserts in to GlobalStorageStatistics. Spark Plugin for e.g. can query this and register to // Spark metrics system. StorageStatistics globalStats = GlobalStorageStatistics.INSTANCE.put( GhfsStorageStatistics.NAME, () -> new GhfsStorageStatistics()); if (GhfsStorageStatistics.class.isAssignableFrom(globalStats.getClass())) { storageStatistics = (GhfsStorageStatistics) globalStats; } else { logger.atWarning().log( "Encountered an error while registering to GlobalStorageStatistics. Some of the GCS connector metrics will not be reported to metrics sinks. globalStatsClassLoader=<%s>; classLoader=<%s>", globalStats.getClass().getClassLoader(), GhfsStorageStatistics.class.getClassLoader()); storageStatistics = GhfsStorageStatistics.DUMMY_INSTANCE; } GoogleCloudStorageEventBus.register(storageStatistics); } /** * Constructs an instance of {@link GoogleHadoopFileSystemBase} using the provided * GoogleCloudStorageFileSystem; initialize() will not re-initialize it. */ // TODO(b/120887495): This @VisibleForTesting annotation was being ignored by prod code. 
// Please check that removing it is correct, and remove this comment along with it. // @VisibleForTesting GoogleHadoopFileSystemBase(GoogleCloudStorageFileSystem gcsFs) { this(); checkNotNull(gcsFs, "gcsFs must not be null"); setGcsFs(gcsFs); } private void setGcsFs(GoogleCloudStorageFileSystem gcsFs) { this.gcsFsSupplier = Suppliers.ofInstance(gcsFs); this.gcsFsInitialized = true; } /** * Returns an unqualified path without any leading slash, relative to the filesystem root, which * serves as the home directory of the current user; see {@code getHomeDirectory} for a * description of what the home directory means. */ protected abstract String getHomeDirectorySubpath(); /** * Gets Hadoop path corresponding to the given GCS path. * * @param gcsPath Fully-qualified GCS path, of the form gs://bucket/object-path. */ public abstract Path getHadoopPath(URI gcsPath); /** * Gets GCS path corresponding to the given Hadoop path, which can be relative or absolute, and * can have either {@code gs://} or {@code gs:/} forms. * * @param hadoopPath Hadoop path. */ public abstract URI getGcsPath(Path hadoopPath); /** Gets the default value of working directory. */ public abstract Path getDefaultWorkingDirectory(); // ================================================================= // Methods implementing FileSystemDescriptor interface; these define the way // paths are translated between Hadoop and GCS. // ================================================================= @Override public abstract Path getFileSystemRoot(); @Override public abstract String getScheme(); /** * Overridden to make root its own parent. This is POSIX compliant, but more importantly guards * against poor directory accounting in the PathData class of Hadoop 2's FsShell. */ @Override public Path makeQualified(final Path path) { Path qualifiedPath = super.makeQualified(path); URI uri = qualifiedPath.toUri(); checkState( "".equals(uri.getPath()) || qualifiedPath.isAbsolute(), "Path '%s' must be fully qualified.", qualifiedPath); // Strip initial '..'s to make root is its own parent. StringBuilder sb = new StringBuilder(uri.getPath()); while (sb.indexOf("/../") == 0) { // Leave a preceding slash, so path is still absolute. sb.delete(0, 3); } String strippedPath = sb.toString(); // Allow a Path of gs://someBucket to map to gs://someBucket/ if (strippedPath.equals("/..") || strippedPath.equals("")) { strippedPath = "/"; } Path result = new Path(uri.getScheme(), uri.getAuthority(), strippedPath); logger.atFiner().log("makeQualified(path: %s): %s", path, result); return result; } @Override protected void checkPath(Path path) { URI uri = path.toUri(); String scheme = uri.getScheme(); // Only check that the scheme matches. The authority and path will be // validated later. if (scheme == null || scheme.equalsIgnoreCase(getScheme())) { return; } GoogleCloudStorageEventBus.postOnException(); String msg = String.format( "Wrong FS scheme: %s, in path: %s, expected scheme: %s", scheme, path, getScheme()); throw new IllegalArgumentException(msg); } /** * Initializes this file system instance. * *

Note: The path passed to this method could be path of any file/directory. It does not matter * because the only thing we check is whether it uses 'gs' scheme. The rest is ignored. * * @param path URI of a file/directory within this file system. * @param config Hadoop configuration. */ @Override public void initialize(URI path, Configuration config) throws IOException { logger.atFiner().log("initialize(path: %s, config: %s)", path, config); checkArgument(path != null, "path must not be null"); checkArgument(config != null, "config must not be null"); checkArgument(path.getScheme() != null, "scheme of path must not be null"); checkArgument(path.getScheme().equals(getScheme()), "URI scheme not supported: %s", path); config = ProviderUtils.excludeIncompatibleCredentialProviders(config, GoogleHadoopFileSystem.class); super.initialize(path, config); initUri = path; // Set this configuration as the default config for this instance; configure() // will perform some file-system-specific adjustments, but the original should // be sufficient (and is required) for the delegation token binding initialization. setConf(config); this.traceFactory = TraceFactory.get(GCS_OPERATION_TRACE_LOG_ENABLE.get(config, config::getBoolean)); // Initialize the delegation token support, if it is configured initializeDelegationTokenSupport(config, path); configure(config); } /** * Initialize the delegation token support for this filesystem. * * @param config The filesystem configuration * @param path The filesystem path * @throws IOException */ private void initializeDelegationTokenSupport(Configuration config, URI path) throws IOException { logger.atFiner().log("initializeDelegationTokenSupport(config: %s, path: %s)", config, path); // Load delegation token binding, if support is configured if (isNullOrEmpty(DELEGATION_TOKEN_BINDING_CLASS.get(config, config::get))) { return; } GcsDelegationTokens dts = new GcsDelegationTokens(); Text service = new Text(getScheme() + "://" + path.getAuthority()); dts.bindToFileSystem(this, service); dts.init(config); dts.start(); delegationTokens = dts; if (delegationTokens.isBoundToDT()) { GoogleCloudStorageEventBus.postOnException(); logger.atFine().log( "initializeDelegationTokenSupport(config: %s, path: %s): using existing delegation token", config, path); } } private void stopDelegationTokens() { if (delegationTokens != null) { try { delegationTokens.close(); } catch (IOException e) { GoogleCloudStorageEventBus.postOnException(); logger.atSevere().withCause(e).log("Failed to stop delegation tokens support"); } } } /** Returns a URI of the root of this FileSystem. */ @Override public URI getUri() { return getFileSystemRoot().toUri(); } /** The default port is listed as -1 as an indication that ports are not used. 
*/ @Override protected int getDefaultPort() { int result = -1; logger.atFiner().log("getDefaultPort(): %d", result); return result; } public boolean hasPathCapability(Path path, String capability) throws IOException { switch (validatePathCapabilityArgs(path, capability)) { // TODO: remove string literals in favor of Constants in CommonPathCapabilities.java // from Hadoop 3 when Hadoop 2 is no longer supported case "fs.capability.paths.append": case "fs.capability.paths.concat": return true; default: return false; } } private static String validatePathCapabilityArgs(Path path, String capability) { checkNotNull(path); checkArgument(!isNullOrEmpty(capability), "capability parameter is empty string"); return Ascii.toLowerCase(capability); } /** * Opens the given file for reading. * * @param hadoopPath File to open. * @param bufferSize Size of buffer to use for IO. * @return A readable stream. * @throws FileNotFoundException if the given path does not exist. * @throws IOException if an error occurs. */ @Override public FSDataInputStream open(Path hadoopPath, int bufferSize) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_OPEN, hadoopPath, this.traceFactory, () -> { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); logger.atFiner().log( "open(hadoopPath: %s, bufferSize: %d [ignored])", hadoopPath, bufferSize); URI gcsPath = getGcsPath(hadoopPath); GoogleCloudStorageReadOptions readChannelOptions = getGcsFs().getOptions().getCloudStorageOptions().getReadChannelOptions(); GoogleHadoopFSInputStream in = new GoogleHadoopFSInputStream(this, gcsPath, readChannelOptions, statistics); return new FSDataInputStream(in); }); } @FunctionalInterface public interface InvocationRaisingIOE { /** * Apply the operation. * * @throws IOException Any IO failure */ R apply() throws IOException; } /** * Opens the given file for writing. * *

Note: This function overrides the given bufferSize value with a higher number unless further * overridden using configuration parameter {@code fs.gs.outputstream.buffer.size}. * * @param hadoopPath The file to open. * @param permission Permissions to set on the new file. Ignored. * @param overwrite If a file with this name already exists, then if true, the file will be * overwritten, and if false an error will be thrown. * @param bufferSize The size of the buffer to use. * @param replication Required block replication for the file. Ignored. * @param blockSize The block-size to be used for the new file. Ignored. * @param progress Progress is reported through this. Ignored. * @return A writable stream. * @throws IOException if an error occurs. * @see #setPermission(Path, FsPermission) */ @Override public FSDataOutputStream create( Path hadoopPath, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_CREATE, hadoopPath, traceFactory, () -> { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkArgument(replication > 0, "replication must be a positive integer: %s", replication); checkArgument(blockSize > 0, "blockSize must be a positive integer: %s", blockSize); checkOpen(); logger.atFiner().log( "create(hadoopPath: %s, overwrite: %b, bufferSize: %d [ignored])", hadoopPath, overwrite, bufferSize); URI gcsPath = getGcsPath(hadoopPath); OutputStreamType type = GCS_OUTPUT_STREAM_TYPE.get(getConf(), getConf()::getEnum); OutputStream out; switch (type) { case BASIC: out = new GoogleHadoopOutputStream( this, gcsPath, statistics, CreateFileOptions.builder().setOverwriteExisting(overwrite).build()); break; case FLUSHABLE_COMPOSITE: SyncableOutputStreamOptions flushableOutputStreamOptions = SyncableOutputStreamOptions.builder() .setMinSyncInterval( Duration.ofMillis( GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL_MS.get( getConf(), getConf()::getInt))) .setSyncOnFlushEnabled(true) .build(); out = new GoogleHadoopSyncableOutputStream( this, gcsPath, statistics, CreateFileOptions.builder().setOverwriteExisting(overwrite).build(), flushableOutputStreamOptions); break; case SYNCABLE_COMPOSITE: SyncableOutputStreamOptions syncableOutputStreamOptions = SyncableOutputStreamOptions.builder() .setMinSyncInterval( Duration.ofMillis( GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL_MS.get( getConf(), getConf()::getInt))) .build(); out = new GoogleHadoopSyncableOutputStream( this, gcsPath, statistics, CreateFileOptions.builder().setOverwriteExisting(overwrite).build(), syncableOutputStreamOptions); break; default: GoogleCloudStorageEventBus.postOnException(); throw new IOException( String.format( "Unsupported output stream type given for key '%s': '%s'", GCS_OUTPUT_STREAM_TYPE.getKey(), type)); } storageStatistics.filesCreated(); return new FSDataOutputStream(out, /* stats= */ null); }); } /** {@inheritDoc} */ @Override public FSDataOutputStream createNonRecursive( Path hadoopPath, FsPermission permission, EnumSet flags, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_CREATE_NON_RECURSIVE, hadoopPath, traceFactory, () -> { URI gcsPath = getGcsPath(checkNotNull(hadoopPath, "hadoopPath must not be null")); URI parentGcsPath = UriPaths.getParentPath(gcsPath); if (!getGcsFs().getFileInfo(parentGcsPath).exists()) { 
GoogleCloudStorageEventBus.postOnException(); throw new FileNotFoundException( String.format( "Can not create '%s' file, because parent folder does not exist: %s", gcsPath, parentGcsPath)); } return create( hadoopPath, permission, flags.contains(org.apache.hadoop.fs.CreateFlag.OVERWRITE), bufferSize, replication, blockSize, progress); }); } /** * Appends to an existing file (optional operation). Not supported. * * @param hadoopPath The existing file to be appended. * @param bufferSize The size of the buffer to be used. * @param progress For reporting progress if it is not null. * @return A writable stream. * @throws IOException if an error occurs. */ @Override public FSDataOutputStream append(Path hadoopPath, int bufferSize, Progressable progress) throws IOException { checkArgument(hadoopPath != null, "hadoopPath must not be null"); logger.atFiner().log( "append(hadoopPath: %s, bufferSize: %d [ignored])", hadoopPath, bufferSize); URI filePath = getGcsPath(hadoopPath); SyncableOutputStreamOptions syncableOutputStreamOptions = SyncableOutputStreamOptions.builder() .setAppendEnabled(true) .setMinSyncInterval( Duration.ofMillis( GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL_MS.get(getConf(), getConf()::getInt))) .setSyncOnFlushEnabled( GCS_OUTPUT_STREAM_TYPE.get(getConf(), getConf()::getEnum) == FLUSHABLE_COMPOSITE) .build(); return new FSDataOutputStream( new GoogleHadoopSyncableOutputStream( this, filePath, statistics, DEFAULT_OVERWRITE, syncableOutputStreamOptions), statistics); } /** * Concat existing files into one file. * * @param tgt the path to the target destination. * @param srcs the paths to the sources to use for the concatenation. * @throws IOException IO failure */ @Override public void concat(Path tgt, Path[] srcs) throws IOException { logger.atFiner().log("concat(tgt: %s, srcs: %s)", tgt, lazy(() -> Arrays.toString(srcs))); checkArgument(srcs.length > 0, "srcs must have at least one source"); URI tgtPath = getGcsPath(tgt); List srcPaths = Arrays.stream(srcs).map(this::getGcsPath).collect(toImmutableList()); checkArgument(!srcPaths.contains(tgtPath), "target must not be contained in sources"); List> partitions = Lists.partition(srcPaths, GoogleCloudStorage.MAX_COMPOSE_OBJECTS - 1); logger.atFiner().log("concat(tgt: %s, %d partitions: %s)", tgt, partitions.size(), partitions); for (List partition : partitions) { // We need to include the target in the list of sources to compose since // the GCS FS compose operation will overwrite the target, whereas the Hadoop // concat operation appends to the target. List sources = Lists.newArrayList(tgtPath); sources.addAll(partition); getGcsFs().compose(sources, tgtPath, CreateObjectOptions.CONTENT_TYPE_DEFAULT); } } /** * Renames src to dst. Src must not be equal to the filesystem root. * * @param src Source path. * @param dst Destination path. * @return true if successful, or false if the old name does not exist or if the new name already * belongs to the namespace. * @throws IOException if an error occurs. 
*/ @Override public boolean rename(Path src, Path dst) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_RENAME, String.format("rename(%s -> %s)", src, dst), this.traceFactory, () -> { checkArgument(src != null, "src must not be null"); checkArgument(dst != null, "dst must not be null"); // Even though the underlying GCSFS will also throw an IAE if src is root, since our // filesystem // root happens to equal the global root, we want to explicitly check it here since // derived // classes may not have filesystem roots equal to the global root. if (src.makeQualified(this).equals(getFileSystemRoot())) { logger.atFiner().log("rename(src: %s, dst: %s): false [src is a root]", src, dst); return false; } try { renameInternal(src, dst); } catch (IOException e) { GoogleCloudStorageEventBus.postOnException(); if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { throw e; } logger.atFiner().withCause(e).log("rename(src: %s, dst: %s): false [failed]", src, dst); return false; } return true; }); } /** * Renames src to dst. * * @param src Source path. * @param dst Destination path. * @throws IOException if an error occurs. */ void renameInternal(Path src, Path dst) throws IOException { checkArgument(src != null, "src must not be null"); checkArgument(dst != null, "dst must not be null"); checkOpen(); URI srcPath = getGcsPath(src); URI dstPath = getGcsPath(dst); getGcsFs().rename(srcPath, dstPath); logger.atFiner().log("rename(src: %s, dst: %s): true", src, dst); } /** * Deletes the given file or directory. * * @param hadoopPath The path to delete. * @param recursive If path is a directory and set to true, the directory is deleted, else throws * an exception. In case of a file, the recursive parameter is ignored. * @return true if delete is successful else false. * @throws IOException if an error occurs. */ @Override public boolean delete(Path hadoopPath, boolean recursive) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_DELETE, hadoopPath, traceFactory, () -> { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); URI gcsPath = getGcsPath(hadoopPath); try { getGcsFs().delete(gcsPath, recursive); } catch (DirectoryNotEmptyException e) { GoogleCloudStorageEventBus.postOnException(); throw e; } catch (IOException e) { if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { GoogleCloudStorageEventBus.postOnException(); throw e; } logger.atFiner().withCause(e).log( "delete(hadoopPath: %s, recursive: %b): false [failed]", hadoopPath, recursive); return false; } logger.atFiner().log( "delete(hadoopPath: %s, recursive: %b): true", hadoopPath, recursive); return true; }); } /** * Lists file status. If the given path points to a directory then the status of children is * returned, otherwise the status of the given file is returned. * * @param hadoopPath Given path. * @return File status list or null if path does not exist. * @throws IOException if an error occurs. 
*/ @Override public FileStatus[] listStatus(Path hadoopPath) throws IOException { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); logger.atFiner().log("listStatus(hadoopPath: %s)", hadoopPath); URI gcsPath = getGcsPath(hadoopPath); List status; try { List fileInfos = getGcsFs().listFileInfo(gcsPath, LIST_OPTIONS); status = new ArrayList<>(fileInfos.size()); String userName = getUgiUserName(); for (FileInfo fileInfo : fileInfos) { status.add(getFileStatus(fileInfo, userName)); } } catch (FileNotFoundException fnfe) { GoogleCloudStorageEventBus.postOnException(); throw (FileNotFoundException) new FileNotFoundException( String.format( "listStatus(hadoopPath: %s): '%s' does not exist.", hadoopPath, gcsPath)) .initCause(fnfe); } return status.toArray(new FileStatus[0]); } /** * Sets the current working directory to the given path. * * @param hadoopPath New working directory. */ @Override public void setWorkingDirectory(Path hadoopPath) { checkArgument(hadoopPath != null, "hadoopPath must not be null"); URI gcsPath = UriPaths.toDirectory(getGcsPath(hadoopPath)); Path newPath = getHadoopPath(gcsPath); // Ideally we should check (as we did earlier) if the given path really points to an existing // directory. However, it takes considerable amount of time for that check which hurts perf. // Given that HDFS code does not do such checks either, we choose to not do them in favor of // better performance. workingDirectory = newPath; logger.atFiner().log("setWorkingDirectory(hadoopPath: %s): %s", hadoopPath, workingDirectory); } /** * Gets the current working directory. * * @return The current working directory. */ @Override public Path getWorkingDirectory() { logger.atFiner().log("getWorkingDirectory(): %s", workingDirectory); return workingDirectory; } /** * Makes the given path and all non-existent parents directories. Has the semantics of Unix 'mkdir * -p'. * * @param hadoopPath Given path. * @param permission Permissions to set on the given directory. * @return true on success, false otherwise. * @throws IOException if an error occurs. */ @Override public boolean mkdirs(Path hadoopPath, FsPermission permission) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_MKDIRS, hadoopPath, traceFactory, () -> { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); URI gcsPath = getGcsPath(hadoopPath); try { getGcsFs().mkdirs(gcsPath); } catch (java.nio.file.FileAlreadyExistsException faee) { GoogleCloudStorageEventBus.postOnException(); // Need to convert to the Hadoop flavor of FileAlreadyExistsException. throw (FileAlreadyExistsException) new FileAlreadyExistsException( String.format( "mkdirs(hadoopPath: %s, permission: %s): failed", hadoopPath, permission)) .initCause(faee); } logger.atFiner().log( "mkdirs(hadoopPath: %s, permission: %s): true", hadoopPath, permission); return true; }); } /** Gets the default replication factor. */ @Override public short getDefaultReplication() { return REPLICATION_FACTOR_DEFAULT; } /** * Gets status of the given path item. * * @param hadoopPath The path we want information about. * @return A FileStatus object for the given path. * @throws FileNotFoundException when the path does not exist; * @throws IOException on other errors. 
*/ @Override public FileStatus getFileStatus(Path hadoopPath) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_GET_FILE_STATUS, hadoopPath, traceFactory, () -> { checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); URI gcsPath = getGcsPath(hadoopPath); FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath); if (!fileInfo.exists()) { GoogleCloudStorageEventBus.postOnException(); throw new FileNotFoundException( String.format( "%s not found: %s", fileInfo.isDirectory() ? "Directory" : "File", hadoopPath)); } String userName = getUgiUserName(); return getFileStatus(fileInfo, userName); }); } /** Gets FileStatus corresponding to the given FileInfo value. */ private FileStatus getFileStatus(FileInfo fileInfo, String userName) { // GCS does not provide modification time. It only provides creation time. // It works for objects because they are immutable once created. FileStatus status = new FileStatus( fileInfo.getSize(), fileInfo.isDirectory(), REPLICATION_FACTOR_DEFAULT, defaultBlockSize, /* modificationTime= */ fileInfo.getModificationTime(), /* accessTime= */ fileInfo.getModificationTime(), reportedPermissions, /* owner= */ userName, /* group= */ userName, getHadoopPath(fileInfo.getPath())); logger.atFiner().log( "getFileStatus(path: %s, userName: %s): %s", fileInfo.getPath(), userName, lazy(() -> fileStatusToString(status))); return status; } /** * Determines based on suitability of {@code fixedPath} whether to use flat globbing logic where * we use a single large listing during globStatus to then perform the core globbing logic * in-memory. */ @VisibleForTesting boolean couldUseFlatGlob(Path fixedPath) { // Only works for filesystems where the base Hadoop Path scheme matches the underlying URI // scheme for GCS. if (!getUri().getScheme().equals(SCHEME)) { logger.atFine().log( "Flat glob is on, but doesn't work for scheme '%s', using default behavior.", getUri().getScheme()); return false; } // The full pattern should have a wildcard, otherwise there's no point doing the flat glob. GlobPattern fullPattern = new GlobPattern(fixedPath.toString()); if (!fullPattern.hasWildcard()) { logger.atFine().log( "Flat glob is on, but Path '%s' has no wildcard, using default behavior.", fixedPath); return false; } // To use a flat glob, there must be an authority defined. if (isNullOrEmpty(fixedPath.toUri().getAuthority())) { logger.atFine().log( "Flat glob is on, but Path '%s' has a empty authority, using default behavior.", fixedPath); return false; } // And the authority must not contain a wildcard. GlobPattern authorityPattern = new GlobPattern(fixedPath.toUri().getAuthority()); if (authorityPattern.hasWildcard()) { logger.atFine().log( "Flat glob is on, but Path '%s' has a wildcard authority, using default behavior.", fixedPath); return false; } return true; } @VisibleForTesting String trimToPrefixWithoutGlob(String path) { char[] wildcardChars = "*?{[".toCharArray(); int trimIndex = path.length(); // Find the first occurrence of any one of the wildcard characters, or just path.length() // if none are found. for (char wildcard : wildcardChars) { int wildcardIndex = path.indexOf(wildcard); if (wildcardIndex >= 0 && wildcardIndex < trimIndex) { trimIndex = wildcardIndex; } } return path.substring(0, trimIndex); } /** * Returns an array of FileStatus objects whose path names match pathPattern. * *

Return null if pathPattern has no glob and the path does not exist. Return an empty array if * pathPattern has a glob and no path matches it. * * @param pathPattern A regular expression specifying the path pattern. * @return An array of FileStatus objects. * @throws IOException if an error occurs. */ @Override public FileStatus[] globStatus(Path pathPattern) throws IOException { return globStatus(pathPattern, DEFAULT_FILTER); } /** * Returns an array of FileStatus objects whose path names match pathPattern and is accepted by * the user-supplied path filter. Results are sorted by their path names. * *

Return null if pathPattern has no glob and the path does not exist. Return an empty array if * pathPattern has a glob and no path matches it. * * @param pathPattern A regular expression specifying the path pattern. * @param filter A user-supplied path filter. * @return An array of FileStatus objects. * @throws IOException if an error occurs. */ @Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException { return GhfsStorageStatistics.trackDuration( storageStatistics, GhfsStatistic.INVOCATION_GLOB_STATUS, String.format("path=%s; pattern=%s", pathPattern, filter), traceFactory, () -> { checkOpen(); logger.atFiner().log("globStatus(pathPattern: %s, filter: %s)", pathPattern, filter); // URI does not handle glob expressions nicely, for the purpose of // fully-qualifying a path we can URI-encode them. // Using toString() to avoid Path(URI) constructor. Path encodedPath = new Path(pathPattern.toUri().toString()); // We convert pathPattern to GCS path and then to Hadoop path to ensure that it ends up in // the correct format. See note in getHadoopPath for more information. Path encodedFixedPath = getHadoopPath(getGcsPath(encodedPath)); // Decode URI-encoded path back into a glob path. Path fixedPath = new Path(URI.create(encodedFixedPath.toString())); logger.atFiner().log("fixed path pattern: %s => %s", pathPattern, fixedPath); if (globAlgorithm == GlobAlgorithm.CONCURRENT && couldUseFlatGlob(fixedPath)) { return concurrentGlobInternal(fixedPath, filter); } if (globAlgorithm == GlobAlgorithm.FLAT && couldUseFlatGlob(fixedPath)) { return flatGlobInternal(fixedPath, filter); } return super.globStatus(fixedPath, filter); }); } /** * Use 2 glob algorithms that return the same result but one of them could be significantly faster * than another one depending on directory layout. */ private FileStatus[] concurrentGlobInternal(Path fixedPath, PathFilter filter) throws IOException { ExecutorService globExecutor = newFixedThreadPool(2, DAEMON_THREAD_FACTORY); try { return globExecutor.invokeAny( ImmutableList.of( () -> flatGlobInternal(fixedPath, filter), () -> super.globStatus(fixedPath, filter))); } catch (InterruptedException e) { GoogleCloudStorageEventBus.postOnException(); Thread.currentThread().interrupt(); throw new IOException(String.format("Concurrent glob execution failed: %s", e), e); } catch (ExecutionException e) { GoogleCloudStorageEventBus.postOnException(); throw new IOException(String.format("Concurrent glob execution failed: %s", e.getCause()), e); } finally { globExecutor.shutdownNow(); } } private FileStatus[] flatGlobInternal(Path fixedPath, PathFilter filter) throws IOException { String pathString = fixedPath.toString(); String prefixString = trimToPrefixWithoutGlob(pathString); Path prefixPath = new Path(prefixString); URI prefixUri = getGcsPath(prefixPath); if (prefixString.endsWith("/") && !prefixPath.toString().endsWith("/")) { // Path strips a trailing slash unless it's the 'root' path. We want to keep the trailing // slash so that we don't wastefully list sibling files which may match the directory-name // as a strict prefix but would've been omitted due to not containing the '/' at the end. prefixUri = UriPaths.toDirectory(prefixUri); } // Get everything matching the non-glob prefix. 
logger.atFiner().log("Listing everything with '%s' prefix", prefixUri); List matchedStatuses = null; String pageToken = null; do { ListPage infoPage = getGcsFs().listFileInfoForPrefixPage(prefixUri, LIST_OPTIONS, pageToken); Collection statusPage = toFileStatusesWithImplicitDirectories(infoPage.getItems()); // TODO: refactor to use GlobPattern and PathFilter directly without helper FS FileSystem helperFileSystem = InMemoryGlobberFileSystem.createInstance(getConf(), getWorkingDirectory(), statusPage); FileStatus[] matchedStatusPage = helperFileSystem.globStatus(fixedPath, filter); if (matchedStatusPage != null) { Collections.addAll( (matchedStatuses == null ? matchedStatuses = new ArrayList<>() : matchedStatuses), matchedStatusPage); } pageToken = infoPage.getNextPageToken(); } while (pageToken != null); if (matchedStatuses == null || matchedStatuses.isEmpty()) { return matchedStatuses == null ? null : new FileStatus[0]; } matchedStatuses.sort( ((Comparator) Comparator.naturalOrder()) // Place duplicate implicit directories after real directory .thenComparingInt((FileStatus f) -> isImplicitDirectory(f) ? 1 : 0)); // Remove duplicate file statuses that could be in the matchedStatuses // because of pagination and implicit directories List filteredStatuses = new ArrayList<>(matchedStatuses.size()); FileStatus lastAdded = null; for (FileStatus fileStatus : matchedStatuses) { if (lastAdded == null || lastAdded.compareTo(fileStatus) != 0) { filteredStatuses.add(fileStatus); lastAdded = fileStatus; } } return filteredStatuses.toArray(new FileStatus[0]); } private static boolean isImplicitDirectory(FileStatus curr) { // Modification time of 0 indicates implicit directory. return curr.isDir() && curr.getModificationTime() == 0; } /** Helper method that converts {@link FileInfo} collection to {@link FileStatus} collection. */ private Collection toFileStatusesWithImplicitDirectories( Collection fileInfos) throws IOException { List fileStatuses = new ArrayList<>(fileInfos.size()); Set filePaths = Sets.newHashSetWithExpectedSize(fileInfos.size()); String userName = getUgiUserName(); for (FileInfo fileInfo : fileInfos) { filePaths.add(fileInfo.getPath()); fileStatuses.add(getFileStatus(fileInfo, userName)); } // The flow for populating this doesn't bother to populate metadata entries for parent // directories but we know the parent directories are expected to exist, so we'll just // populate the missing entries explicitly here. Necessary for getFileStatus(parentOfInfo) // to work when using an instance of this class. for (FileInfo fileInfo : fileInfos) { URI parentPath = UriPaths.getParentPath(fileInfo.getPath()); while (parentPath != null && !parentPath.equals(GoogleCloudStorageFileSystem.GCS_ROOT)) { if (!filePaths.contains(parentPath)) { logger.atFiner().log("Adding fake entry for missing parent path '%s'", parentPath); StorageResourceId id = StorageResourceId.fromUriPath(parentPath, true); GoogleCloudStorageItemInfo fakeItemInfo = GoogleCloudStorageItemInfo.createInferredDirectory(id); FileInfo fakeFileInfo = FileInfo.fromItemInfo(fakeItemInfo); filePaths.add(parentPath); fileStatuses.add(getFileStatus(fakeFileInfo, userName)); } parentPath = UriPaths.getParentPath(parentPath); } } return fileStatuses; } /** Helper method to get the UGI short user name */ private static String getUgiUserName() throws IOException { UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); return ugi.getShortUserName(); } /** * Returns home directory of the current user. * *

Note: This directory is only used for Hadoop purposes. It is not the same as a user's OS * home directory. */ @Override public Path getHomeDirectory() { Path result = new Path(getFileSystemRoot(), getHomeDirectorySubpath()); logger.atFiner().log("getHomeDirectory(): %s", result); return result; } /** * Converts the given FileStatus to its string representation. * * @param stat FileStatus to convert. * @return String representation of the given FileStatus. */ private static String fileStatusToString(FileStatus stat) { assert stat != null; return String.format( "path: %s, isDir: %s, len: %d, owner: %s", stat.getPath().toString(), stat.isDir(), stat.getLen(), stat.getOwner()); } /** * {@inheritDoc} * *

Returns the service if delegation tokens are configured, otherwise, null. */ @Override public String getCanonicalServiceName() { String service = null; if (delegationTokens != null) { service = delegationTokens.getService().toString(); } logger.atFiner().log("getCanonicalServiceName(): %s", service); return service; } /** Gets GCS FS instance. */ public GoogleCloudStorageFileSystem getGcsFs() { return gcsFsSupplier.get(); } /** * Loads an {@link AccessTokenProvider} implementation. If the user provided an * AbstractDelegationTokenBinding we get the AccessTokenProvider, otherwise if a class name is * provided (See {@link HadoopCredentialConfiguration#ACCESS_TOKEN_PROVIDER_IMPL_SUFFIX} then we * use it, otherwise it's null. */ private AccessTokenProvider getAccessTokenProvider(Configuration config) throws IOException { // Check if delegation token support is configured AccessTokenProvider accessTokenProvider = delegationTokens != null // If so, use the delegation token to acquire the Google credentials ? delegationTokens.getAccessTokenProvider() // If delegation token support is not configured, check if a // custom AccessTokenProvider implementation is configured : HadoopCredentialConfiguration.getAccessTokenProvider( config, ImmutableList.of(GCS_CONFIG_PREFIX)); if (accessTokenProvider != null) { if (accessTokenProvider.getAccessTokenType() == AccessTokenType.DOWNSCOPED) { checkArgument( HadoopCredentialConfiguration.ENABLE_NULL_CREDENTIAL_SUFFIX .withPrefixes( HadoopCredentialConfiguration.getConfigKeyPrefixes(GCS_CONFIG_PREFIX)) .get(config, config::getBoolean) && !HadoopCredentialConfiguration.ENABLE_SERVICE_ACCOUNTS_SUFFIX .withPrefixes( HadoopCredentialConfiguration.getConfigKeyPrefixes(GCS_CONFIG_PREFIX)) .get(config, config::getBoolean), "When using DOWNSCOPED access token, `fs.gs.auth.null.enabled` should" + " be set to true and `fs.gs.auth.service.account.enable` should be set to false"); } accessTokenProvider.setConf(config); } return accessTokenProvider; } /** * Retrieve user's Credential. If user implemented {@link AccessTokenProvider} and provided the * class name (See {@link HadoopCredentialConfiguration#ACCESS_TOKEN_PROVIDER_IMPL_SUFFIX} then * build a credential with access token provided by this provider; Otherwise obtain credential * through {@link HadoopCredentialConfiguration#getCredentialFactory(Configuration, String...)}. */ private Credential getCredential( Configuration config, GoogleCloudStorageFileSystemOptions gcsFsOptions, AccessTokenProvider accessTokenProvider) throws IOException, GeneralSecurityException { Credential credential; if (accessTokenProvider == null) { // If delegation token support is not configured, check if a // custom AccessTokenProvider implementation is configured, and attempt // to acquire the Google credentials using it credential = CredentialFromAccessTokenProviderClassFactory.credential( config, ImmutableList.of(GCS_CONFIG_PREFIX), CredentialFactory.DEFAULT_SCOPES); if (credential == null) { // Finally, if no credentials have been acquired at this point, employ // the default mechanism. 
credential = HadoopCredentialConfiguration.getCredentialFactory(config, GCS_CONFIG_PREFIX) .getCredential(CredentialFactory.DEFAULT_SCOPES); } } else { switch (accessTokenProvider.getAccessTokenType()) { case GENERIC: // check if an AccessTokenProvider is configured // if so, try to get the credentials through the access token provider credential = CredentialFromAccessTokenProviderClassFactory.credential( accessTokenProvider, CredentialFactory.DEFAULT_SCOPES); break; case DOWNSCOPED: // If the AccessTokenType is set to DOWNSCOPED`, Credential will be generated // when GCS requests are created. credential = null; break; default: GoogleCloudStorageEventBus.postOnException(); throw new IllegalStateException( String.format( "Unknown AccessTokenType: %s", accessTokenProvider.getAccessTokenType())); } } // If impersonation service account exists, then use current credential to request access token // for the impersonating service account. return getImpersonatedCredential(config, gcsFsOptions, credential).orElse(credential); } /** * Generate a {@link Credential} from the internal access token provider based on the service * account to impersonate. */ private static Optional getImpersonatedCredential( Configuration config, GoogleCloudStorageFileSystemOptions gcsFsOptions, Credential credential) throws IOException { Map userImpersonationServiceAccounts = USER_IMPERSONATION_SERVICE_ACCOUNT_SUFFIX .withPrefixes(CONFIG_KEY_PREFIXES) .getPropsWithPrefix(config); Map groupImpersonationServiceAccounts = GROUP_IMPERSONATION_SERVICE_ACCOUNT_SUFFIX .withPrefixes(CONFIG_KEY_PREFIXES) .getPropsWithPrefix(config); String impersonationServiceAccount = IMPERSONATION_SERVICE_ACCOUNT_SUFFIX .withPrefixes(CONFIG_KEY_PREFIXES) .get(config, config::get); // Exit early if impersonation is not configured if (userImpersonationServiceAccounts.isEmpty() && groupImpersonationServiceAccounts.isEmpty() && isNullOrEmpty(impersonationServiceAccount)) { return Optional.empty(); } UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); Optional serviceAccountToImpersonate = Stream.of( () -> getServiceAccountToImpersonateForUserGroup( userImpersonationServiceAccounts, ImmutableList.of(currentUser.getShortUserName())), () -> getServiceAccountToImpersonateForUserGroup( groupImpersonationServiceAccounts, ImmutableList.copyOf(currentUser.getGroupNames())), (Supplier>) () -> Optional.ofNullable(impersonationServiceAccount)) .map(Supplier::get) .filter(Optional::isPresent) .map(Optional::get) .filter(sa -> !isNullOrEmpty(sa)) .findFirst(); if (serviceAccountToImpersonate.isPresent()) { GoogleCloudStorageOptions options = gcsFsOptions.getCloudStorageOptions(); HttpTransport httpTransport = HttpTransportFactory.createHttpTransport( options.getTransportType(), options.getProxyAddress(), options.getProxyUsername(), options.getProxyPassword(), Duration.ofMillis(options.getHttpRequestReadTimeout())); GoogleCredential impersonatedCredential = new GoogleCredentialWithIamAccessToken( httpTransport, new CredentialHttpRetryInitializer(credential), serviceAccountToImpersonate.get(), CredentialFactory.DEFAULT_SCOPES); logger.atFine().log( "Impersonating '%s' service account for '%s' user", serviceAccountToImpersonate.get(), currentUser); return Optional.of(impersonatedCredential.createScoped(CredentialFactory.DEFAULT_SCOPES)); } return Optional.empty(); } private static Optional getServiceAccountToImpersonateForUserGroup( Map serviceAccountMapping, List userGroups) { return serviceAccountMapping.entrySet().stream() .filter(e -> 
userGroups.contains(e.getKey())) .map(Map.Entry::getValue) .findFirst(); } /** * Configures GHFS using the supplied configuration. * * @param config Hadoop configuration object. */ private synchronized void configure(Configuration config) throws IOException { logger.atFiner().log("GHFS_ID=%s: configure(config: %s)", GHFS_ID, config); // Set this configuration as the default config for this instance. setConf(config); globAlgorithm = GCS_GLOB_ALGORITHM.get(config, config::getEnum); checksumType = GCS_FILE_CHECKSUM_TYPE.get(config, config::getEnum); defaultBlockSize = BLOCK_SIZE.get(config, config::getLong); reportedPermissions = new FsPermission(PERMISSIONS_TO_REPORT.get(config, config::get)); if (gcsFsSupplier == null) { if (GCS_LAZY_INITIALIZATION_ENABLE.get(config, config::getBoolean)) { gcsFsSupplier = Suppliers.memoize( () -> { try { GoogleCloudStorageFileSystem gcsFs = createGcsFs(config); configureBuckets(gcsFs); configureWorkingDirectory(config); gcsFsInitialized = true; return gcsFs; } catch (IOException e) { GoogleCloudStorageEventBus.postOnException(); throw new RuntimeException("Failed to create GCS FS", e); } }); } else { setGcsFs(createGcsFs(config)); configureBuckets(getGcsFs()); configureWorkingDirectory(config); } } else { configureBuckets(getGcsFs()); configureWorkingDirectory(config); } } private GoogleCloudStorageFileSystem createGcsFs(Configuration config) throws IOException { GoogleCloudStorageFileSystemOptions gcsFsOptions = GoogleHadoopFileSystemConfiguration.getGcsFsOptionsBuilder(config).build(); AccessTokenProvider accessTokenProvider = getAccessTokenProvider(config); Credential credential; try { credential = getCredential(config, gcsFsOptions, accessTokenProvider); } catch (GeneralSecurityException e) { GoogleCloudStorageEventBus.postOnException(); throw new RuntimeException(e); } return new GoogleCloudStorageFileSystem( credential, accessTokenProvider != null && accessTokenProvider.getAccessTokenType() == AccessTokenType.DOWNSCOPED ? accessBoundaries -> accessTokenProvider.getAccessToken(accessBoundaries).getToken() : null, gcsFsOptions); } /** * Validates and possibly creates buckets needed by subclass. * * @param gcsFs {@link GoogleCloudStorageFileSystem} to configure buckets * @throws IOException if bucket name is invalid or cannot be found. */ @VisibleForTesting protected abstract void configureBuckets(GoogleCloudStorageFileSystem gcsFs) throws IOException; private void configureWorkingDirectory(Configuration config) { // Set initial working directory to root so that any configured value gets resolved // against file system root. workingDirectory = getFileSystemRoot(); Path newWorkingDirectory; String configWorkingDirectory = GCS_WORKING_DIRECTORY.get(config, config::get); if (isNullOrEmpty(configWorkingDirectory)) { newWorkingDirectory = getDefaultWorkingDirectory(); logger.atWarning().log( "No working directory configured, using default: '%s'", newWorkingDirectory); } else { newWorkingDirectory = new Path(configWorkingDirectory); } // Use the public method to ensure proper behavior of normalizing and resolving the new // working directory relative to the initial filesystem-root directory. setWorkingDirectory(newWorkingDirectory); logger.atFiner().log( "Configured working directory: %s = %s", GCS_WORKING_DIRECTORY.getKey(), getWorkingDirectory()); } /** Assert that the FileSystem has been initialized and not close()d. 
  /** Assert that the FileSystem has been initialized and not close()d. */
  private void checkOpen() throws IOException {
    if (isClosed()) {
      GoogleCloudStorageEventBus.postOnException();
      throw new IOException("GoogleHadoopFileSystem has been closed or not initialized.");
    }
  }

  private boolean isClosed() {
    return gcsFsSupplier == null || gcsFsSupplier.get() == null;
  }

  // =================================================================
  // Overridden functions for debug tracing. The following functions
  // do not change functionality. They just log parameters and call base
  // class' function.
  // =================================================================

  @Override
  public boolean deleteOnExit(Path f) throws IOException {
    checkOpen();
    boolean result = super.deleteOnExit(f);
    logger.atFiner().log("deleteOnExit(path: %s): %b", f, result);
    return result;
  }

  @Override
  protected void processDeleteOnExit() {
    logger.atFiner().log("processDeleteOnExit()");
    super.processDeleteOnExit();
  }

  @Override
  public ContentSummary getContentSummary(Path f) throws IOException {
    ContentSummary result = super.getContentSummary(f);
    logger.atFiner().log("getContentSummary(path: %s): %s", f, result);
    return result;
  }

  @Override
  public Token<?> getDelegationToken(String renewer) throws IOException {
    Token<?> result = null;
    if (delegationTokens != null) {
      result = delegationTokens.getBoundOrNewDT(renewer);
    }
    logger.atFiner().log("getDelegationToken(renewer: %s): %s", renewer, result);
    return result;
  }

  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst)
      throws IOException {
    logger.atFiner().log(
        "copyFromLocalFile(delSrc: %b, overwrite: %b, %d srcs, dst: %s)",
        delSrc, overwrite, srcs.length, dst);
    super.copyFromLocalFile(delSrc, overwrite, srcs, dst);
  }

  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
      throws IOException {
    logger.atFiner().log(
        "copyFromLocalFile(delSrc: %b, overwrite: %b, src: %s, dst: %s)",
        delSrc, overwrite, src, dst);
    super.copyFromLocalFile(delSrc, overwrite, src, dst);
  }

  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
    logger.atFiner().log("copyToLocalFile(delSrc: %b, src: %s, dst: %s)", delSrc, src, dst);
    super.copyToLocalFile(delSrc, src, dst);
  }

  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
    Path result = super.startLocalOutput(fsOutputFile, tmpLocalFile);
    logger.atFiner().log(
        "startLocalOutput(fsOutputFile: %s, tmpLocalFile: %s): %s",
        fsOutputFile, tmpLocalFile, result);
    return result;
  }

  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
    logger.atFiner().log(
        "completeLocalOutput(fsOutputFile: %s, tmpLocalFile: %s)", fsOutputFile, tmpLocalFile);
    super.completeLocalOutput(fsOutputFile, tmpLocalFile);
  }
  @Override
  public void close() throws IOException {
    logger.atFiner().log("close()");
    super.close();

    // NB: We must *first* have the superclass close() before we close the underlying
    // gcsFsSupplier since the superclass may decide to perform various heavyweight cleanup
    // operations (such as deleteOnExit).
    if (gcsFsSupplier != null) {
      if (gcsFsInitialized) {
        getGcsFs().close();
      }
      gcsFsSupplier = null;
    }

    stopDelegationTokens();
  }

  @Override
  public long getUsed() throws IOException {
    long result = super.getUsed();
    logger.atFiner().log("getUsed(): %s", result);
    return result;
  }

  @Override
  public long getDefaultBlockSize() {
    long result = defaultBlockSize;
    logger.atFiner().log("getDefaultBlockSize(): %d", result);
    return result;
  }

  @Override
  public FileChecksum getFileChecksum(Path hadoopPath) throws IOException {
    storageStatistics.getFileCheckSum();
    checkArgument(hadoopPath != null, "hadoopPath must not be null");
    checkOpen();

    URI gcsPath = getGcsPath(hadoopPath);
    final FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath);
    if (!fileInfo.exists()) {
      GoogleCloudStorageEventBus.postOnException();
      throw new FileNotFoundException(
          String.format(
              "%s not found: %s", fileInfo.isDirectory() ? "Directory" : "File", hadoopPath));
    }
    FileChecksum checksum = getFileChecksum(checksumType, fileInfo);
    logger.atFiner().log(
        "getFileChecksum(hadoopPath: %s [gcsPath: %s]): %s", hadoopPath, gcsPath, checksum);
    return checksum;
  }

  private static FileChecksum getFileChecksum(GcsFileChecksumType type, FileInfo fileInfo)
      throws IOException {
    switch (type) {
      case NONE:
        return null;
      case CRC32C:
        return new GcsFileChecksum(type, fileInfo.getCrc32cChecksum());
      case MD5:
        return new GcsFileChecksum(type, fileInfo.getMd5Checksum());
    }
    GoogleCloudStorageEventBus.postOnException();
    throw new IOException("Unrecognized GcsFileChecksumType: " + type);
  }

  @Override
  public void setVerifyChecksum(boolean verifyChecksum) {
    logger.atFiner().log("setVerifyChecksum(verifyChecksum: %s)", verifyChecksum);
    super.setVerifyChecksum(verifyChecksum);
  }

  @Override
  public void setPermission(Path p, FsPermission permission) throws IOException {
    logger.atFiner().log("setPermission(path: %s, permission: %s)", p, permission);
    super.setPermission(p, permission);
  }

  @Override
  public void setOwner(Path p, String username, String groupname) throws IOException {
    logger.atFiner().log("setOwner(path: %s, username: %s, groupname: %s)", p, username, groupname);
    super.setOwner(p, username, groupname);
  }

  @Override
  public void setTimes(Path p, long mtime, long atime) throws IOException {
    logger.atFiner().log("setTimes(path: %s, mtime: %d, atime: %d)", p, mtime, atime);
    super.setTimes(p, mtime, atime);
  }
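  // Editor's note (not part of the original source): the XAttr methods below persist Hadoop
  // extended attributes as custom metadata on the backing GCS object; keys are namespaced with
  // XATTR_KEY_PREFIX and the prefix is stripped again when attributes are listed or read back.
  // A minimal, hypothetical usage sketch through the public Hadoop FileSystem API (the path and
  // attribute name are illustrative, not taken from this file):
  //
  //   Path p = new Path("gs://my-bucket/data/part-00000");
  //   fs.setXAttr(p, "user.owner-team", "analytics".getBytes(UTF_8),
  //       EnumSet.of(XAttrSetFlag.CREATE));
  //   byte[] team = fs.getXAttr(p, "user.owner-team");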
  /** {@inheritDoc} */
  @Override
  public byte[] getXAttr(Path path, String name) throws IOException {
    return GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_XATTR_GET_NAMED,
        path,
        traceFactory,
        () -> {
          checkNotNull(path, "path should not be null");
          checkNotNull(name, "name should not be null");

          Map<String, byte[]> attributes =
              getGcsFs().getFileInfo(getGcsPath(path)).getAttributes();
          String xAttrKey = getXAttrKey(name);
          byte[] xAttr =
              attributes.containsKey(xAttrKey) ? getXAttrValue(attributes.get(xAttrKey)) : null;

          logger.atFiner().log(
              "getXAttr(path: %s, name: %s): %s", path, name, lazy(() -> new String(xAttr, UTF_8)));
          return xAttr;
        });
  }

  /** {@inheritDoc} */
  @Override
  public Map<String, byte[]> getXAttrs(Path path) throws IOException {
    return GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_XATTR_GET_MAP,
        path,
        traceFactory,
        () -> {
          checkNotNull(path, "path should not be null");

          FileInfo fileInfo = getGcsFs().getFileInfo(getGcsPath(path));
          Map<String, byte[]> xAttrs =
              fileInfo.getAttributes().entrySet().stream()
                  .filter(a -> isXAttr(a.getKey()))
                  .collect(
                      HashMap::new,
                      (m, a) -> m.put(getXAttrName(a.getKey()), getXAttrValue(a.getValue())),
                      Map::putAll);

          logger.atFiner().log("getXAttrs(path: %s): %s", path, xAttrs);
          return xAttrs;
        });
  }

  /** {@inheritDoc} */
  @Override
  public Map<String, byte[]> getXAttrs(Path path, List<String> names) throws IOException {
    return GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_XATTR_GET_NAMED_MAP,
        path,
        traceFactory,
        () -> {
          checkNotNull(path, "path should not be null");
          checkNotNull(names, "names should not be null");

          Map<String, byte[]> xAttrs;
          if (names.isEmpty()) {
            xAttrs = new HashMap<>();
          } else {
            Set<String> namesSet = new HashSet<>(names);
            xAttrs =
                getXAttrs(path).entrySet().stream()
                    .filter(a -> namesSet.contains(a.getKey()))
                    .collect(HashMap::new, (m, a) -> m.put(a.getKey(), a.getValue()), Map::putAll);
          }

          logger.atFiner().log("getXAttrs(path: %s, names: %s): %s", path, names, xAttrs);
          return xAttrs;
        });
  }

  /** {@inheritDoc} */
  @Override
  public List<String> listXAttrs(Path path) throws IOException {
    return GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_OP_XATTR_LIST,
        path,
        traceFactory,
        () -> {
          checkNotNull(path, "path should not be null");

          FileInfo fileInfo = getGcsFs().getFileInfo(getGcsPath(path));
          List<String> xAttrs =
              fileInfo.getAttributes().keySet().stream()
                  .filter(this::isXAttr)
                  .map(this::getXAttrName)
                  .collect(Collectors.toCollection(ArrayList::new));

          logger.atFiner().log("listXAttrs(path: %s): %s", path, xAttrs);
          return xAttrs;
        });
  }

  /** {@inheritDoc} */
  @Override
  public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flags)
      throws IOException {
    logger.atFiner().log(
        "setXAttr(path: %s, name: %s, value %s, flags %s)",
        path, name, lazy(() -> new String(value, UTF_8)), flags);
    checkNotNull(path, "path should not be null");
    checkNotNull(name, "name should not be null");
    checkArgument(flags != null && !flags.isEmpty(), "flags should not be null or empty");

    FileInfo fileInfo = getGcsFs().getFileInfo(getGcsPath(path));
    String xAttrKey = getXAttrKey(name);
    Map<String, byte[]> attributes = fileInfo.getAttributes();

    if (attributes.containsKey(xAttrKey) && !flags.contains(XAttrSetFlag.REPLACE)) {
      GoogleCloudStorageEventBus.postOnException();
      throw new IOException(
          String.format(
              "REPLACE flag must be set to update XAttr (name='%s', value='%s') for '%s'",
              name, new String(value, UTF_8), path));
    }
    if (!attributes.containsKey(xAttrKey) && !flags.contains(XAttrSetFlag.CREATE)) {
      GoogleCloudStorageEventBus.postOnException();
      throw new IOException(
          String.format(
              "CREATE flag must be set to create XAttr (name='%s', value='%s') for '%s'",
              name, new String(value, UTF_8), path));
    }

    UpdatableItemInfo updateInfo =
        new UpdatableItemInfo(
            StorageResourceId.fromUriPath(fileInfo.getPath(), /* allowEmptyObjectName= */ false),
            ImmutableMap.of(xAttrKey, getXAttrValue(value)));
    getGcsFs().getGcs().updateItems(ImmutableList.of(updateInfo));
  }
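  // Editor's note (not part of the original source): removeXAttr below deletes an attribute by
  // submitting the metadata key with a null value through updateItems(); the connector's metadata
  // update interprets a null value as a request to remove that key rather than to store null.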
  /** {@inheritDoc} */
  @Override
  public void removeXAttr(Path path, String name) throws IOException {
    logger.atFiner().log("removeXAttr(path: %s, name: %s)", path, name);
    checkNotNull(path, "path should not be null");
    checkNotNull(name, "name should not be null");

    FileInfo fileInfo = getGcsFs().getFileInfo(getGcsPath(path));
    Map<String, byte[]> xAttrToRemove = new HashMap<>();
    xAttrToRemove.put(getXAttrKey(name), null);
    UpdatableItemInfo updateInfo =
        new UpdatableItemInfo(
            StorageResourceId.fromUriPath(fileInfo.getPath(), /* allowEmptyObjectName= */ false),
            xAttrToRemove);
    getGcsFs().getGcs().updateItems(ImmutableList.of(updateInfo));
  }

  /**
   * Get the storage statistics of this filesystem.
   *
   * @return the storage statistics
   */
  @Override
  public GhfsStorageStatistics getStorageStatistics() {
    return storageStatistics;
  }

  private boolean isXAttr(String key) {
    return key != null && key.startsWith(XATTR_KEY_PREFIX);
  }

  private String getXAttrKey(String name) {
    return XATTR_KEY_PREFIX + name;
  }

  private String getXAttrName(String key) {
    return key.substring(XATTR_KEY_PREFIX.length());
  }

  private byte[] getXAttrValue(byte[] value) {
    return value == null ? XATTR_NULL_VALUE : value;
  }
}
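Editor's note: the listing above is the abstract base class; client code reaches it through the gs:// scheme via Hadoop's FileSystem API. The following is a minimal, hypothetical sketch (not part of this file) showing how the checksum and extended-attribute behavior implemented above is typically exercised. The bucket name, object path, and the "fs.gs.impl"/"fs.gs.checksum.type" property keys are assumptions that may vary by connector version and deployment.

import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.EnumSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.XAttrSetFlag;

public class GcsConnectorUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Register the concrete gs:// implementation (often already set via core-site.xml).
    conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem");
    // Ask the connector to report CRC32C checksums (property key assumed for this version).
    conf.set("fs.gs.checksum.type", "CRC32C");

    try (FileSystem fs = FileSystem.get(new URI("gs://my-bucket/"), conf)) {
      Path object = new Path("gs://my-bucket/data/part-00000");

      // getFileChecksum() goes through the checksum logic shown in the listing above.
      FileChecksum checksum = fs.getFileChecksum(object);
      System.out.println("checksum: " + checksum);

      // setXAttr()/getXAttr() round-trip through GCS object metadata as shown above.
      fs.setXAttr(
          object,
          "user.owner-team",
          "analytics".getBytes(StandardCharsets.UTF_8),
          EnumSet.of(XAttrSetFlag.CREATE));
      byte[] team = fs.getXAttr(object, "user.owner-team");
      System.out.println("owner-team: " + new String(team, StandardCharsets.UTF_8));
    }
  }
}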