/*
* Copyright 2016 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageItemInfo;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Syncable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* GoogleHadoopSyncableOutputStream implements the {@code Syncable} interface by composing
* objects created in separate underlying streams for each hsync() call.
*
* Prior to the first hsync(), sync() or close() call, this stream will behave the same way as a
* basic non-syncable stream, writing directly to the destination file.
*
* On the first call to hsync()/sync(), the destination file is committed and a new temporary file
* using a hidden-file prefix (underscore) is created with an additional suffix which differs for
* each subsequent temporary file in the series; during this time readers can read the data
* committed to the destination file, but not the bytes written to the temporary file since the
* last hsync() call.
*
* On each subsequent hsync()/sync() call, the temporary file is closed, composed onto the
* destination file, and then deleted; a new temporary file is then opened under a new filename
* for further writes.
*
* Caveats:
* 1. Each hsync()/sync() requires many underlying read and mutation requests occurring
* sequentially, so latency is expected to be fairly high.
* 2. There is a hard limit to the number of times hsync()/sync() can be called, due to the
* GCS-level limit on the number of components a composite object can contain (1024). Any
* attempt to hsync() beyond this limit will result in an IOException; data written since the
* last successful hsync() remains uncommitted at that point, but will still be committed by a
* subsequent close() (or can be recovered manually from the temporary file, as long as that
* file wasn't deleted under the hood).
*
* If errors occur mid-stream, one or more temporary files may fail to be cleaned up, requiring
* manual intervention to discover and delete them. Data written prior to the most recent
* successful hsync() is persistent and safe in such a case.
*
* If multiple writers are attempting to write to the same destination file, generation ids used
* with low-level precondition checks will cause all but one writer to fail their precondition
* checks during writes, and the single remaining writer will safely occupy the stream.
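*
* A minimal usage sketch through the Hadoop FileSystem API (the path, configuration, and byte
* arrays below are illustrative, and assume the connector is configured to produce syncable
* output streams):
* <pre>{@code
* Path path = new Path("gs://my-bucket/logs/events.log");
* FileSystem fs = path.getFileSystem(new Configuration());
* try (FSDataOutputStream out = fs.create(path)) {
*   out.write(firstBatch);
*   out.hsync();  // firstBatch is now committed and visible to readers.
*   out.write(secondBatch);
* }  // close() commits secondBatch without requiring another hsync().
* }</pre>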
*/
public class GoogleHadoopSyncableOutputStream extends OutputStream implements Syncable {
// Prefix used for all temporary files created by this stream.
public static final String TEMPFILE_PREFIX = "_GCS_SYNCABLE_TEMPFILE_";
// Maximum number of components a composite object can have; any attempt to compose onto
// an object already having this many components will fail. This OutputStream enforces
// the limit before attempting the compose operation at all, so that the stream can still be
// considered safe to use and eventually close() without losing data, even if
// intermediate attempts to hsync() throw exceptions due to the component limit.
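// Since the first component is written directly to the destination and each successful
// hsync() adds exactly one more component, at most MAX_COMPOSITE_COMPONENTS - 1 hsync()
// calls can succeed over the lifetime of the stream.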
public static final int MAX_COMPOSITE_COMPONENTS = 1024;
private static final Logger LOG =
LoggerFactory.getLogger(GoogleHadoopSyncableOutputStream.class);
// Temporary files don't need to contain the desired attributes of the final destination file,
// since metadata settings get clobbered on the final compose() anyway; additionally, because of
// the way we pick temp file names and have already ensured directories exist for the
// destination file, we can optimize tempfile creation by skipping various directory checks.
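// Note: the existingGenerationId of 0 below presumably maps to a GCS ifGenerationMatch=0
// precondition, i.e. the temp object is only created if no live object already exists at
// that name.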
private static final CreateFileOptions TEMPFILE_CREATE_OPTIONS =
new CreateFileOptions(
/* overwriteExisting= */ false,
CreateFileOptions.DEFAULT_CONTENT_TYPE,
CreateFileOptions.EMPTY_ATTRIBUTES,
/* checkNoDirectoryConflict= */ false,
/* ensureParentDirectoriesExist= */ false,
/* existingGenerationId= */ 0L);
// Deletion of temporary files occurs asynchronously for performance reasons, but in-flight
// deletions are awaited on close(); as long as all output streams are closed, there should
// be no remaining in-flight work occurring inside this threadpool.
private static final ExecutorService TEMPFILE_CLEANUP_THREADPOOL =
Executors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setNameFormat("gcs-syncable-output-stream-cleanup-pool-%d")
.setDaemon(true)
.build());
// Instance of GoogleHadoopFileSystemBase.
private final GoogleHadoopFileSystemBase ghfs;
// The final destination path for this stream.
private final URI finalGcsPath;
// Buffer size to pass through to delegate streams.
private final int bufferSize;
// Statistics tracker provided by the parent GoogleHadoopFileSystemBase for recording
// numbers of bytes written.
private final FileSystem.Statistics statistics;
// Metadata/overwrite options to use on final file.
private final CreateFileOptions fileOptions;
// List of file-deletion futures accrued during the lifetime of this output stream.
private final List<Future<Void>> deletionFutures;
private final ExecutorService cleanupThreadpool;
// Current GCS path pointing at the "tail" file which will be appended to the destination
// on each hsync() call.
private URI curGcsPath;
// Current OutputStream pointing at the "tail" file which will be appended to the destination
// on each hsync() call.
private GoogleHadoopOutputStream curDelegate;
// Stores the current component index corresponding to curGcsPath. If close() is called, the
// total number of components in the finalGcsPath will be curComponentIndex + 1.
private int curComponentIndex;
// The last known generationId of the final destination file, or possibly
// StorageResourceId.UNKNOWN_GENERATION_ID if unknown.
private long curDestGenerationId;
/**
* Creates a new GoogleHadoopSyncableOutputStream whose initial delegate stream is initialized
* and expected to begin at file offset 0. This constructor is not suitable for "appending" to
* already existing files.
*/
public GoogleHadoopSyncableOutputStream(
GoogleHadoopFileSystemBase ghfs, URI gcsPath, int bufferSize,
FileSystem.Statistics statistics, CreateFileOptions createFileOptions)
throws IOException {
this(ghfs, gcsPath, bufferSize, statistics, createFileOptions, TEMPFILE_CLEANUP_THREADPOOL);
}
GoogleHadoopSyncableOutputStream(
GoogleHadoopFileSystemBase ghfs, URI gcsPath, int bufferSize,
FileSystem.Statistics statistics, CreateFileOptions createFileOptions,
ExecutorService cleanupThreadpool)
throws IOException {
LOG.debug("GoogleHadoopSyncableOutputStream({}, {})", gcsPath, bufferSize);
this.ghfs = ghfs;
this.finalGcsPath = gcsPath;
this.bufferSize = bufferSize;
this.statistics = statistics;
this.fileOptions = createFileOptions;
this.deletionFutures = new ArrayList<>();
this.cleanupThreadpool = cleanupThreadpool;
// The first component of the stream will go straight to the destination filename to optimize
// the case where no hsync() or a single hsync() is called during the lifetime of the stream;
// committing the first component thus doesn't require any compose() call under the hood.
this.curGcsPath = gcsPath;
this.curDelegate = new GoogleHadoopOutputStream(
ghfs, curGcsPath, bufferSize, statistics, fileOptions);
// TODO(user): Make sure to initialize this to the correct value if a new stream is created to
// "append" to an existing file.
this.curComponentIndex = 0;
this.curDestGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID;
}
@Override
public void write(int b) throws IOException {
throwIfNotOpen();
curDelegate.write(b);
}
@Override
public void write(byte[] b, int offset, int len) throws IOException {
throwIfNotOpen();
curDelegate.write(b, offset, len);
}
@Override
public void close() throws IOException {
LOG.debug("close(): Current tail file: {} final destination: {}", curGcsPath, finalGcsPath);
if (!isOpen()) {
LOG.debug("close(): Ignoring; stream already closed.");
return;
}
commitCurrentFile();
// null denotes stream closed.
// TODO(user): Add checks which throw IOException if further operations are attempted on a
// closed stream, except for multiple calls to close(), which should behave as no-ops.
curGcsPath = null;
curDelegate = null;
LOG.debug("close(): Awaiting {} deletionFutures", deletionFutures.size());
for (Future<Void> deletion : deletionFutures) {
try {
deletion.get();
} catch (ExecutionException | InterruptedException ee) {
if (ee.getCause() instanceof IOException) {
throw (IOException) ee.getCause();
} else {
throw new IOException(ee);
}
}
}
LOG.debug("close(): done");
}
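/**
* Implements Syncable.sync(), the legacy (Hadoop 1 era) spelling of hsync(), by delegating
* directly to hsync().
*/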
public void sync() throws IOException {
hsync();
}
/**
* There is no way to flush data so that it becomes available to readers without a
* full-fledged hsync(), so this method is a no-op.
*
* This overrides Syncable.hflush(), but is not annotated as such because the method doesn't
* exist in Hadoop 1.
*/
public void hflush() throws IOException {
LOG.warn(
"hflush() is a no-op; readers will *not* yet see flushed data for {}", finalGcsPath);
throwIfNotOpen();
}
/**
* This overrides Syncable.hsync(), but is not annotated as such because the method doesn't
* exist in Hadoop 1.
*
* @throws CompositeLimitExceededException if this hsync() call would require any future close()
* call to exceed the component limit. If CompositeLimitExceededException is thrown, no
* actual GCS operations are taken and it's safe to subsequently call close() on this
* stream as normal; it just means data written since the last successful hsync() has not
* yet been committed.
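*
* For example, a caller can treat hitting the component limit as non-fatal (sketch):
* <pre>{@code
* try {
*   out.hsync();
* } catch (CompositeLimitExceededException e) {
*   // No GCS mutations were performed; data written since the last successful hsync()
*   // remains uncommitted for now, but will still be committed by close().
* }
* }</pre>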
*/
public void hsync() throws IOException {
LOG.debug("hsync(): Committing tail file {} to final destination {}", curGcsPath, finalGcsPath);
throwIfNotOpen();
long startTime = System.nanoTime();
// If we were to call close() instead of hsync() right now, the final object would have this
// many components.
int curNumComponents = curComponentIndex + 1;
if (curNumComponents >= MAX_COMPOSITE_COMPONENTS) {
throw new CompositeLimitExceededException(String.format(
"Cannot hsync() '%s' because subsequent component count would exceed limit of %d",
finalGcsPath, MAX_COMPOSITE_COMPONENTS));
}
commitCurrentFile();
// Use a different temporary path for each temporary component to reduce the possible avenues of
// race conditions in the face of low-level retries, etc.
++curComponentIndex;
curGcsPath = getNextTemporaryPath();
LOG.debug("hsync(): Opening next temporary tail file {} as component number {}",
curGcsPath, curComponentIndex);
curDelegate = new GoogleHadoopOutputStream(
ghfs, curGcsPath, bufferSize, statistics, TEMPFILE_CREATE_OPTIONS);
long endTime = System.nanoTime();
LOG.debug("Took {} ns to hsync()", endTime - startTime);
}
private void commitCurrentFile() throws IOException {
// TODO(user): Optimize the case where 0 bytes have been written in the current component
// to return early.
WritableByteChannel innerChannel = curDelegate.getInternalChannel();
curDelegate.close();
long generationId = StorageResourceId.UNKNOWN_GENERATION_ID;
if (innerChannel instanceof GoogleCloudStorageItemInfo.Provider) {
generationId = ((GoogleCloudStorageItemInfo.Provider) innerChannel)
.getItemInfo().getContentGeneration();
LOG.debug(
"innerChannel is GoogleCloudStorageItemInfo.Provider; closed generationId {}.",
generationId);
} else {
LOG.debug("innerChannel NOT instanceof provider: {}", innerChannel.getClass());
}
// On the first component, curGcsPath will equal finalGcsPath, and no compose() call is
// necessary. Otherwise, we compose in-place into the destination object and then delete
// the temporary object.
if (!finalGcsPath.equals(curGcsPath)) {
StorageResourceId destResourceId =
StorageResourceId.fromObjectName(finalGcsPath.toString(), curDestGenerationId);
final StorageResourceId tempResourceId =
StorageResourceId.fromObjectName(curGcsPath.toString(), generationId);
if (!destResourceId.getBucketName().equals(tempResourceId.getBucketName())) {
throw new IllegalStateException(String.format(
"Destination bucket in path '%s' doesn't match temp file bucket in path '%s'",
finalGcsPath, curGcsPath));
}
GoogleCloudStorageItemInfo composedObject = ghfs.getGcsFs().getGcs().composeObjects(
ImmutableList.of(destResourceId, tempResourceId),
destResourceId,
GoogleCloudStorageFileSystem.objectOptionsFromFileOptions(fileOptions));
curDestGenerationId = composedObject.getContentGeneration();
deletionFutures.add(cleanupThreadpool.submit(new Callable<Void>() {
@Override
public Void call() throws IOException {
ghfs.getGcsFs().getGcs().deleteObjects(ImmutableList.of(tempResourceId));
return null;
}
}));
} else {
// First commit was direct to the destination; the generationId of the object we just
// committed will be used as the destination generation id for future compose calls.
curDestGenerationId = generationId;
}
}
/**
* Returns URI to be used for the next "tail" file in the series.
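* For example, with a destination of {@code gs://bucket/dir/data} and curComponentIndex 2, this
* would return something like {@code gs://bucket/dir/_GCS_SYNCABLE_TEMPFILE_data.2.<uuid>},
* where {@code <uuid>} is a freshly generated random UUID.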
*/
private URI getNextTemporaryPath() {
Path basePath = ghfs.getHadoopPath(finalGcsPath);
Path baseDir = basePath.getParent();
Path tempPath = new Path(
baseDir,
String.format("%s%s.%d.%s",
TEMPFILE_PREFIX, basePath.getName(), curComponentIndex, UUID.randomUUID().toString()));
return ghfs.getGcsPath(tempPath);
}
private boolean isOpen() {
return curDelegate != null;
}
private void throwIfNotOpen() throws IOException {
if (!isOpen()) {
throw new ClosedChannelException();
}
}
}