com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem Maven / Gradle / Ivy
/**
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.Path;
/**
* GoogleHadoopFileSystem is a version of GoogleHadoopFileSystemBase which is rooted in a single
* bucket at initialization time; in this case, Hadoop paths no longer correspond directly to
* general GCS paths, and all Hadoop operations going through this FileSystem will never touch
* any GCS bucket other than the bucket on which this FileSystem is rooted.
*
* This implementation sacrifices a small amount of cross-bucket interoperability in favor of
* more straightforward FileSystem semantics and compatibility with existing Hadoop applications.
* In particular, it is not subject to bucket-naming constraints, and files are allowed to be
* placed in root.
*/
public class GoogleHadoopFileSystem
extends GoogleHadoopFileSystemBase {
// The bucket the file system is rooted in used for default values of:
// -- working directory
// -- user home directories (only for Hadoop purposes).
private String rootBucket;
/**
* Constructs an instance of GoogleHadoopFileSystem; the internal
* GoogleCloudStorageFileSystem will be set up with config settings when initialize() is called.
*/
public GoogleHadoopFileSystem() {
super();
}
/**
* Constructs an instance of GoogleHadoopFileSystem using the provided
* GoogleCloudStorageFileSystem; initialize() will not re-initialize it.
*/
public GoogleHadoopFileSystem(GoogleCloudStorageFileSystem gcsfs) {
super(gcsfs);
}
/**
* {@inheritDoc}
*
* Sets and validates the root bucket.
*/
@Override
@VisibleForTesting
public void configureBuckets(String systemBucketName, boolean createConfiguredBuckets)
throws IOException {
super.configureBuckets(systemBucketName, createConfiguredBuckets);
rootBucket = initUri.getAuthority();
if (rootBucket != null) {
// Validate root bucket name
gcsfs.getPathCodec().getPath(rootBucket, null, true);
} else if (systemBucket != null) {
LOG.warn("GHFS.configureBuckets: Warning. No GCS bucket provided. "
+ "Falling back on deprecated fs.gs.system.bucket.");
rootBucket = systemBucket;
} else {
String msg = String.format("No bucket specified in GCS URI: %s", initUri);
throw new IllegalArgumentException(msg);
}
LOG.debug("GHFS.configureBuckets: GoogleHadoopFileSystem root in bucket: ", rootBucket);
}
@Override
protected void checkPath(Path path) {
// Validate scheme
super.checkPath(path);
URI uri = path.toUri();
String bucket = uri.getAuthority();
// Bucketless URIs will be qualified later
if (bucket == null || bucket.equals(rootBucket)) {
return;
} else {
String msg = String.format(
"Wrong bucket: %s, in path: %s, expected bucket: %s",
bucket, path, rootBucket);
throw new IllegalArgumentException(msg);
}
}
/**
* Get the name of the bucket in which file system is rooted.
*/
@VisibleForTesting
String getRootBucketName() {
return rootBucket;
}
/**
* Override to allow a homedir subpath which sits directly on our FileSystem root.
*/
@Override
protected String getHomeDirectorySubpath() {
return "user/" + System.getProperty("user.name");
}
/**
* Validates GCS Path belongs to this file system. The bucket must
* match the root bucket provided at initialization time.
*/
@Override
public Path getHadoopPath(URI gcsPath) {
LOG.debug("GHFS.getHadoopPath: {}", gcsPath);
// Handle root. Delegate to getGcsPath on "gs:/" to resolve the appropriate gs:// URI.
if (gcsPath.equals(getGcsPath(getFileSystemRoot()))) {
return getFileSystemRoot();
}
StorageResourceId resourceId = gcsfs.getPathCodec().validatePathAndGetId(gcsPath, true);
// Unlike the global-rooted GHFS, gs:// has no meaning in the bucket-rooted world.
if (resourceId.isRoot()) {
throw new IllegalArgumentException(String.format(
"Missing authority in gcsPath '%s'", gcsPath.toString()));
}
if (!resourceId.getBucketName().equals(rootBucket)) {
throw new IllegalArgumentException(String.format(
"Authority of URI '%s' doesn't match root bucket '%s'",
resourceId.getBucketName(), rootBucket));
}
Path hadoopPath = new Path(getScheme() + "://"
+ rootBucket + '/' + resourceId.getObjectName());
LOG.debug("GHFS.getHadoopPath: {} -> {}", gcsPath, hadoopPath);
return hadoopPath;
}
/**
* Translates a "gs:/" style hadoopPath (or relative path which is not fully-qualified) into
* the appropriate GCS path which is compatible with the underlying GcsFs or gsutil.
*/
@Override
public URI getGcsPath(Path hadoopPath) {
LOG.debug("GHFS.getGcsPath: {}", hadoopPath);
// Convert to fully qualified absolute path; the Path object will callback to get our current
// workingDirectory as part of fully resolving the path.
Path resolvedPath = makeQualified(hadoopPath);
String objectName = resolvedPath.toUri().getPath();
if (objectName != null && resolvedPath.isAbsolute()) {
// Strip off leading '/' because GoogleCloudStorageFileSystem.getPath appends it explicitly
// between bucket and objectName.
objectName = objectName.substring(1);
}
// Construct GCS path uri.
URI gcsPath = gcsfs.getPathCodec().getPath(rootBucket, objectName, true);
LOG.debug("GHFS.getGcsPath: {} -> {}", hadoopPath, gcsPath);
return gcsPath;
}
/**
* As the global-rooted FileSystem, our hadoop-path "scheme" is exactly equal to the general
* GCS scheme.
*/
@Override
public String getScheme() {
return GoogleCloudStorageFileSystem.SCHEME;
}
@Override
public Path getFileSystemRoot() {
return new Path(getScheme() + "://" + rootBucket + '/');
}
/**
* Gets the default value of working directory.
*/
@Override
public Path getDefaultWorkingDirectory() {
return getFileSystemRoot();
}
}