// com.google.cloud.hadoop.fs.gcs.GoogleHadoopSyncableOutputStream Maven / Gradle / Ivy
//
// Go to download
/*
 * Copyright 2016 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.hadoop.fs.gcs;

import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageItemInfo;
import com.google.cloud.hadoop.gcsio.StorageResourceId;
import com.google.cloud.hadoop.util.GoogleCloudStorageEventBus;
import com.google.cloud.hadoop.util.ITraceFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.flogger.GoogleLogger;
import com.google.common.util.concurrent.RateLimiter;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.WritableByteChannel;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Syncable;

/**
 * GoogleHadoopSyncableOutputStream implements the {@code Syncable} interface by composing objects
 * created in separate underlying streams for each hsync() call.
 *
 * <p>Prior to the first hsync(), sync() or close() call, this channel will behave the same way as
 * a basic non-syncable channel, writing directly to the destination file.
 *
 * <p>On the first call to hsync()/sync(), the destination file is committed and a new temporary
 * file using a hidden-file prefix (underscore) is created with an additional suffix which differs
 * for each subsequent temporary file in the series; during this time readers can read the data
 * committed to the destination file, but not the bytes written to the temporary file since the last
 * hsync() call.
 *
 * <p>On each subsequent hsync()/sync() call, the temporary file is closed, composed onto the
 * destination file, then deleted, and a new temporary file is opened under a new filename for
 * further writes.
 *
 * <p>Caveat: each hsync()/sync() requires many underlying read and mutation requests occurring
 * sequentially, so latency is expected to be fairly high.
 *
 * <p>If errors occur mid-stream, there may be one or more temporary files failing to be cleaned up,
 * and require manual intervention to discover and delete any such unused files. Data written prior
 * to the most recent successful hsync() is persistent and safe in such a case.
 *
 * <p>If multiple writers are attempting to write to the same destination file, generation ids used
 * with low-level precondition checks will cause all but one writer to fail their precondition
 * checks during writes, and a single remaining writer will safely occupy the stream.
 */
public class GoogleHadoopSyncableOutputStream extends OutputStream implements Syncable {
  private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

  // Prefix used for all temporary files created by this stream.
  public static final String TEMPFILE_PREFIX = "_GCS_SYNCABLE_TEMPFILE_";

  // Temporary files don't need to contain the desired attributes of the final destination file
  // since metadata settings get clobbered on final compose() anyways; additionally, due to
  // the way we pick temp file names and already ensured directories for the destination file,
  // we can optimize tempfile creation by skipping various directory checks.
  private static final CreateFileOptions TEMPFILE_CREATE_OPTIONS =
      CreateFileOptions.DEFAULT_NO_OVERWRITE
          .toBuilder()
          .setEnsureNoDirectoryConflict(false)
          .setOverwriteGenerationId(0)
          .build();

  // Deletion of temporary files occurs asynchronously for performance reasons, but in-flight
  // deletions are awaited on close() so as long as all output streams are closed, there should
  // be no remaining in-flight work occurring inside this threadpool.
  private static final ExecutorService TEMPFILE_CLEANUP_THREADPOOL =
      Executors.newCachedThreadPool(
          new ThreadFactoryBuilder()
              .setNameFormat("gcs-syncable-output-stream-cleanup-pool-%d")
              .setDaemon(true)
              .build());

  // Instance of GoogleHadoopFileSystemBase.
  private final GoogleHadoopFileSystemBase ghfs;

  // The final destination path for this stream.
  private final URI finalGcsPath;

  // Statistics tracker provided by the parent GoogleHadoopFileSystemBase for recording
  // numbers of bytes written.
  private final FileSystem.Statistics statistics;

  // Metadata/overwrite options to use on final file.
  private final CreateFileOptions fileOptions;

  // List of file-deletion futures accrued during the lifetime of this output stream.
  private final List> deletionFutures;

  private final SyncableOutputStreamOptions options;

  private final RateLimiter syncRateLimiter;

  private final ExecutorService cleanupThreadpool;
  private final GhfsStorageStatistics storageStatistics;
  private final ITraceFactory traceFactory;

  // Current GCS path pointing at the "tail" file which will be appended to the destination
  // on each hsync() call.
  private URI curGcsPath;

  // Current OutputStream pointing at the "tail" file which will be appended to the destination
  // on each hsync() call.
  private GoogleHadoopOutputStream curDelegate;

  // Stores the current component index corresponding curGcsPath. If close() is called, the total
  // number of components in the finalGcsPath will be curComponentIndex + 1.
  private int curComponentIndex;

  // The last known generationId of the final destination file, or possibly
  // StorageResourceId.UNKNOWN_GENERATION_ID if unknown.
  private long curDestGenerationId;

  /** Creates a new GoogleHadoopSyncableOutputStream. */
  public GoogleHadoopSyncableOutputStream(
      GoogleHadoopFileSystemBase ghfs,
      URI gcsPath,
      FileSystem.Statistics statistics,
      CreateFileOptions createFileOptions,
      SyncableOutputStreamOptions options)
      throws IOException {
    this(ghfs, gcsPath, statistics, createFileOptions, options, TEMPFILE_CLEANUP_THREADPOOL);
  }

  @VisibleForTesting
  GoogleHadoopSyncableOutputStream(
      GoogleHadoopFileSystemBase ghfs,
      URI gcsPath,
      FileSystem.Statistics statistics,
      CreateFileOptions createFileOptions,
      SyncableOutputStreamOptions options,
      ExecutorService cleanupThreadpool)
      throws IOException {
    logger.atFiner().log(
        "GoogleHadoopSyncableOutputStream(gcsPath: %s, createFileOptions:  %s, options: %s)",
        gcsPath, createFileOptions, options);
    this.ghfs = ghfs;
    this.finalGcsPath = gcsPath;
    this.statistics = statistics;
    this.fileOptions = createFileOptions;
    this.deletionFutures = new ArrayList<>();
    this.cleanupThreadpool = cleanupThreadpool;
    this.options = options;
    this.syncRateLimiter = createRateLimiter(options.getMinSyncInterval());

    if (options.isAppendEnabled()) {
      // When appending first component has to go to new temporary file.
      this.curGcsPath = getNextTemporaryPath();
      this.curComponentIndex = 1;
    } else {
      // The first component of the stream will go straight to the destination filename to optimize
      // the case where no hsync() or a single hsync() is called during the lifetime of the stream;
      // committing the first component thus doesn't require any compose() call under the hood.
      this.curGcsPath = gcsPath;
      this.curComponentIndex = 0;
    }

    this.curDelegate = new GoogleHadoopOutputStream(ghfs, curGcsPath, statistics, fileOptions);
    this.curDestGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID;

    this.storageStatistics = ghfs.getStorageStatistics();
    this.traceFactory = ghfs.getTraceFactory();
  }

  private static RateLimiter createRateLimiter(Duration minSyncInterval) {
    if (minSyncInterval.isNegative() || minSyncInterval.isZero()) {
      return null;
    }
    double permitsPerSecond = 1000.0 / minSyncInterval.toMillis();
    return RateLimiter.create(permitsPerSecond);
  }

  @Override
  public void write(int b) throws IOException {
    throwIfNotOpen();
    curDelegate.write(b);
  }

  @Override
  public void write(byte[] b, int offset, int len) throws IOException {
    throwIfNotOpen();
    curDelegate.write(b, offset, len);
  }

  @Override
  public void close() throws IOException {
    logger.atFiner().log(
        "close(): Current tail file: %s final destination: %s", curGcsPath, finalGcsPath);
    if (!isOpen()) {
      logger.atFiner().log("close(): Ignoring; stream already closed.");
      return;
    }
    commitCurrentFile();

    // null denotes stream closed.
    // TODO(user): Add checks which throw IOException if further operations are attempted on a
    // closed stream, except for multiple calls to close(), which should behave as no-ops.
    curGcsPath = null;
    curDelegate = null;

    logger.atFiner().log("close(): Awaiting %s deletionFutures", deletionFutures.size());
    for (Future deletion : deletionFutures) {
      try {
        deletion.get();
      } catch (ExecutionException | InterruptedException e) {
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        GoogleCloudStorageEventBus.postOnException();
        throw new IOException("Failed to delete files while closing stream", e);
      }
    }
  }

  public void sync() throws IOException {
    hsync();
  }

  /**
   * There is no way to flush data to become available for readers without a full-fledged hsync(),
   * If the output stream is only syncable, this method is a no-op. If the output stream is also
   * flushable, this method will simply use the same implementation of hsync().
   *
   * If it is rate limited, unlike hsync(), which will try to acquire the permits and block, it
   * will do nothing.
   */
  @Override
  public void hflush() throws IOException {
    GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_HFLUSH,
        finalGcsPath,
        traceFactory,
        () -> {
          long startTimeNs = System.nanoTime();
          if (!options.isSyncOnFlushEnabled()) {
            logger.atWarning().log(
                "hflush(): No-op: readers will *not* yet see flushed data for %s", finalGcsPath);
            throwIfNotOpen();
            return null;
          }
          // If rate limit not set or permit acquired than use hsync()
          if (syncRateLimiter == null || syncRateLimiter.tryAcquire()) {
            logger.atFine().log("hflush() uses hsync() for %s", finalGcsPath);
            hsyncInternal(startTimeNs);
            return null;
          }
          logger.atInfo().atMostEvery(1, TimeUnit.MINUTES).log(
              "hflush(): No-op due to rate limit (%s): readers will *not* yet see flushed data for %s",
              syncRateLimiter, finalGcsPath);
          throwIfNotOpen();

          return null;
        });
  }

  @Override
  public void hsync() throws IOException {
    GhfsStorageStatistics.trackDuration(
        storageStatistics,
        GhfsStatistic.INVOCATION_HSYNC,
        finalGcsPath,
        traceFactory,
        () -> {
          long startTimeNs = System.nanoTime();
          if (syncRateLimiter != null) {
            logger.atFiner().log(
                "hsync(): Rate limited (%s) with blocking permit acquisition for %s",
                syncRateLimiter, finalGcsPath);
            syncRateLimiter.acquire();
          }
          hsyncInternal(startTimeNs);

          return null;
        });
  }

  /** Internal implementation of hsync, can be reused by hflush() as well. */
  private void hsyncInternal(long startTimeNs) throws IOException {
    logger.atFiner().log(
        "hsync(): Committing tail file %s to final destination %s", curGcsPath, finalGcsPath);
    throwIfNotOpen();

    commitCurrentFile();

    // Use a different temporary path for each temporary component to reduce the possible avenues of
    // race conditions in the face of low-level retries, etc.
    ++curComponentIndex;
    curGcsPath = getNextTemporaryPath();

    logger.atFiner().log(
        "hsync(): Opening next temporary tail file %s as component number %s",
        curGcsPath, curComponentIndex);
    curDelegate =
        new GoogleHadoopOutputStream(ghfs, curGcsPath, statistics, TEMPFILE_CREATE_OPTIONS);

    long finishTimeNs = System.nanoTime();
    logger.atFiner().log("Took %d ns to sync() for %s", finishTimeNs - startTimeNs, finalGcsPath);
  }

  private void commitCurrentFile() throws IOException {
    // TODO(user): Optimize the case where 0 bytes have been written in the current component
    // to return early.
    WritableByteChannel innerChannel = curDelegate.getInternalChannel();
    curDelegate.close();

    long generationId = StorageResourceId.UNKNOWN_GENERATION_ID;
    if (innerChannel instanceof GoogleCloudStorageItemInfo.Provider) {
      generationId =
          ((GoogleCloudStorageItemInfo.Provider) innerChannel).getItemInfo().getContentGeneration();
      logger.atFiner().log(
          "innerChannel is GoogleCloudStorageItemInfo.Provider; closed generationId %s.",
          generationId);
    } else {
      logger.atFiner().log("innerChannel NOT instanceof provider: %s", innerChannel.getClass());
    }

    // On the first component, curGcsPath will equal finalGcsPath, and no compose() call is
    // necessary. Otherwise, we compose in-place into the destination object and then delete
    // the temporary object.
    if (!finalGcsPath.equals(curGcsPath)) {
      StorageResourceId destResourceId =
          StorageResourceId.fromStringPath(finalGcsPath.toString(), curDestGenerationId);
      final StorageResourceId tempResourceId =
          StorageResourceId.fromStringPath(curGcsPath.toString(), generationId);
      if (!destResourceId.getBucketName().equals(tempResourceId.getBucketName())) {
        GoogleCloudStorageEventBus.postOnException();
        throw new IllegalStateException(
            String.format(
                "Destination bucket in path '%s' doesn't match temp file bucket in path '%s'",
                finalGcsPath, curGcsPath));
      }
      GoogleCloudStorageItemInfo composedObject =
          ghfs.getGcsFs()
              .getGcs()
              .composeObjects(
                  ImmutableList.of(destResourceId, tempResourceId),
                  destResourceId,
                  GoogleCloudStorageFileSystem.objectOptionsFromFileOptions(fileOptions));
      curDestGenerationId = composedObject.getContentGeneration();
      deletionFutures.add(
          cleanupThreadpool.submit(
              () -> {
                ghfs.getGcsFs().getGcs().deleteObjects(ImmutableList.of(tempResourceId));
                return null;
              }));
    } else {
      // First commit was direct to the destination; the generationId of the object we just
      // committed will be used as the destination generation id for future compose calls.
      curDestGenerationId = generationId;
    }
  }

  /** Returns URI to be used for the next "tail" file in the series. */
  private URI getNextTemporaryPath() {
    Path basePath = ghfs.getHadoopPath(finalGcsPath);
    Path baseDir = basePath.getParent();
    Path tempPath =
        new Path(
            baseDir,
            String.format(
                "%s%s.%d.%s",
                TEMPFILE_PREFIX,
                basePath.getName(),
                curComponentIndex,
                UUID.randomUUID().toString()));
    return ghfs.getGcsPath(tempPath);
  }

  private boolean isOpen() {
    return curDelegate != null;
  }

  private void throwIfNotOpen() throws IOException {
    if (!isOpen()) {
      GoogleCloudStorageEventBus.postOnException();
      throw new ClosedChannelException();
    }
  }
}