All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.hadoop.gcsio.GoogleCloudStorageClientReadChannel Maven / Gradle / Ivy

Go to download

An implementation of org.apache.hadoop.fs.FileSystem targeting Google Cloud Storage

There is a newer version: 3.0.4
Show newest version
/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.hadoop.gcsio;

import static com.google.cloud.hadoop.gcsio.GoogleCloudStorageExceptions.createFileNotFoundException;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Strings.nullToEmpty;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;

import com.google.cloud.ReadChannel;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions.Fadvise;
import com.google.cloud.hadoop.util.ErrorTypeExtractor;
import com.google.cloud.hadoop.util.GoogleCloudStorageEventBus;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.Storage.BlobSourceOption;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.flogger.GoogleLogger;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;

/** Provides seekable read access to GCS via java-storage library. */
@VisibleForTesting
class GoogleCloudStorageClientReadChannel implements SeekableByteChannel {

  private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

  // Substring searched for in the object's content encoding to detect gzip-encoded content.
  private static final String GZIP_ENCODING = "gzip";

  // Bucket name, object name and content generation of the object being read.
  private final StorageResourceId resourceId;
  private final GoogleCloudStorageReadOptions readOptions;
  // Consulted for the encryption key when the per-request read options are built.
  private final GoogleCloudStorageOptions storageOptions;
  private final Storage storage;
  // The size of this object generation, in bytes. For gzip-encoded objects this starts as
  // Long.MAX_VALUE and is replaced by the current position once end of stream is observed.
  private long objectSize;
  // Classifies exceptions raised while reading so they can be mapped to IOException subtypes.
  private final ErrorTypeExtractor errorExtractor;
  // Owns the underlying byte channel; set to null when this channel is closed.
  private ContentReadChannel contentReadChannel;
  // True when the content encoding reported for the object contains "gzip".
  private boolean gzipEncoded = false;
  // Flipped to false by close().
  private boolean open = true;

  // Current position in this channel, it could be different from contentChannelCurrentPosition if
  // position(long) method calls were made without calls to read(ByteBuffer) method.
  private long currentPosition = 0;

  public GoogleCloudStorageClientReadChannel(
      Storage storage,
      GoogleCloudStorageItemInfo itemInfo,
      GoogleCloudStorageReadOptions readOptions,
      ErrorTypeExtractor errorExtractor,
      GoogleCloudStorageOptions storageOptions)
      throws IOException {
    validate(itemInfo);
    this.storage = storage;
    this.errorExtractor = errorExtractor;
    this.resourceId =
        new StorageResourceId(
            itemInfo.getBucketName(), itemInfo.getObjectName(), itemInfo.getContentGeneration());
    this.readOptions = readOptions;
    this.storageOptions = storageOptions;
    this.contentReadChannel = new ContentReadChannel(readOptions, resourceId);
    initMetadata(itemInfo.getContentEncoding(), itemInfo.getSize());
  }

  /**
   * Records whether the object is gzip-encoded and derives the size used for bounds checks.
   *
   * @param encoding content encoding reported for the object, may be {@code null}
   * @param sizeFromMetadata object size in bytes as reported by metadata
   * @throws IOException if the object is gzip-encoded but gzip support is disabled in the read
   *     options
   */
  protected void initMetadata(@Nullable String encoding, long sizeFromMetadata) throws IOException {
    String normalizedEncoding = nullToEmpty(encoding);
    gzipEncoded = normalizedEncoding.contains(GZIP_ENCODING);

    if (!gzipEncoded) {
      objectSize = sizeFromMetadata;
      return;
    }

    if (!readOptions.getSupportGzipEncoding()) {
      throw new IOException(
          "Cannot read GZIP encoded files - content encoding support is disabled.");
    }
    // The decoded length is not known up front, so treat the object as unbounded for now.
    objectSize = Long.MAX_VALUE;
  }

  /**
   * Reads bytes from the current position into {@code dst}.
   *
   * @param dst buffer to fill
   * @return the number of bytes read, {@code 0} if {@code dst} has no space left, or {@code -1}
   *     if the current position is at the end of the object
   * @throws ClosedChannelException if this channel has been closed
   * @throws IOException if reading the content fails
   */
  @Override
  public int read(ByteBuffer dst) throws IOException {
    throwIfNotOpen();

    // Nothing to do when the caller's buffer is already full.
    if (!dst.hasRemaining()) {
      return 0;
    }

    logger.atFiner().log(
        "Reading %d bytes at %d position from '%s'", dst.remaining(), currentPosition, resourceId);

    boolean atEndOfObject = currentPosition == objectSize;
    if (atEndOfObject) {
      return -1;
    }
    return contentReadChannel.readContent(dst);
  }

  /**
   * Always fails: this channel is read-only.
   *
   * @throws UnsupportedOperationException on every call
   */
  @Override
  public int write(ByteBuffer src) throws IOException {
    GoogleCloudStorageEventBus.postOnException();
    String reason = "Cannot mutate read-only channel";
    throw new UnsupportedOperationException(reason);
  }

  /**
   * Returns the position the next read will start from.
   *
   * @return the current position of this channel, in bytes from the beginning of the object
   */
  @Override
  public long position() throws IOException {
    return this.currentPosition;
  }

  /**
   * Sets this channel's position.
   *
   * 

This method will throw an exception if {@code newPosition} is greater than object size, * which contradicts {@link SeekableByteChannel#position(long) SeekableByteChannel} contract. * TODO(user): decide if this needs to be fixed. * * @param newPosition the new position, counting the number of bytes from the beginning. * @return this channel instance * @throws FileNotFoundException if the underlying object does not exist. * @throws IOException on IO error */ @Override public SeekableByteChannel position(long newPosition) throws IOException { throwIfNotOpen(); if (newPosition == currentPosition) { return this; } validatePosition(newPosition); logger.atFiner().log( "Seek from %s to %s position for '%s'", currentPosition, newPosition, resourceId); currentPosition = newPosition; return this; } @Override public long size() throws IOException { return objectSize; } @Override public SeekableByteChannel truncate(long size) throws IOException { GoogleCloudStorageEventBus.postOnException(); throw new UnsupportedOperationException("Cannot mutate read-only channel"); } @Override public boolean isOpen() { return open; } @Override public void close() throws IOException { if (open) { try { logger.atFiner().log("Closing channel for '%s'", resourceId); contentReadChannel.closeContentChannel(); } catch (Exception e) { GoogleCloudStorageEventBus.postOnException(); throw new IOException( String.format("Exception occurred while closing channel '%s'", resourceId), e); } finally { contentReadChannel = null; open = false; } } } /** * This class own the responsibility of opening up contentChannel. It also implements the Fadvise, * which helps in deciding the boundaries of content channel being opened and also caching the * footer of an object. */ private class ContentReadChannel { // Size of buffer to allocate for skipping bytes in-place when performing in-place seeks. 
private static final int SKIP_BUFFER_SIZE = 8192; private final BlobId blobId; // This is the actual current position in `contentChannel` from where read can happen. // This remains unchanged of position(long) method call. private long contentChannelCurrentPosition = -1; private long contentChannelEnd = -1; // Prefetched footer content. private byte[] footerContent; // Used as scratch space when reading bytes just to discard them when trying to perform small // in-place seeks. private byte[] skipBuffer = null; private ReadableByteChannel byteChannel = null; private boolean randomAccess; public ContentReadChannel( GoogleCloudStorageReadOptions readOptions, StorageResourceId resourceId) { this.blobId = BlobId.of( resourceId.getBucketName(), resourceId.getObjectName(), resourceId.getGenerationId()); this.randomAccess = readOptions.getFadvise() == Fadvise.RANDOM; } public int readContent(ByteBuffer dst) throws IOException { performPendingSeeks(); checkState( contentChannelCurrentPosition == currentPosition || byteChannel == null, "contentChannelCurrentPosition (%s) should be equal to currentPosition (%s) after lazy seek, if channel is open", contentChannelCurrentPosition, currentPosition); int totalBytesRead = 0; // We read from a streaming source. We may not get all the bytes we asked for // in the first read. Therefore, loop till we either read the required number of // bytes or we reach end-of-stream. while (dst.hasRemaining()) { int remainingBeforeRead = dst.remaining(); try { if (byteChannel == null) { byteChannel = openByteChannel(dst.remaining()); // We adjust the start index of content channel in following cases // 1. request range is in footer boundaries --> request the whole footer // 2. requested content is gzip encoded -> request always from start of file. // Case(1) is handled with reading and caching the extra read bytes, for all other cases // we need to skip all the unrequested bytes before start reading from current position. 
if (currentPosition > contentChannelCurrentPosition) { skipInPlace(); } // making sure that currentPosition is in alignment with currentReadPosition before // actual read starts to avoid read discrepancies. checkState( contentChannelCurrentPosition == currentPosition, "position of read offset isn't in alignment with channel's read offset"); } int bytesRead = byteChannel.read(dst); if (bytesRead == 0) { throw new IOException( String.format("Read 0 bytes without blocking from object: '%s'", resourceId)); } if (bytesRead < 0) { // Because we don't know decompressed object size for gzip-encoded objects, // assume that this is an object end. if (gzipEncoded) { objectSize = currentPosition; contentChannelEnd = currentPosition; } if (currentPosition != contentChannelEnd && currentPosition != objectSize) { GoogleCloudStorageEventBus.postOnException(); throw new IOException( String.format( "Received end of stream result before all requestedBytes were received;" + "EndOf stream signal received at offset: %d where as stream was suppose to end at: %d for resource: %s of size: %d", currentPosition, contentChannelEnd, resourceId, objectSize)); } // If we have reached an end of a contentChannel but not an end of an object. // then close contentChannel and continue reading an object if necessary. 
if (contentChannelEnd != objectSize && currentPosition == contentChannelEnd) { closeContentChannel(); continue; } else { break; } } totalBytesRead += bytesRead; currentPosition += bytesRead; contentChannelCurrentPosition += bytesRead; checkState( contentChannelCurrentPosition == currentPosition, "contentChannelPosition (%s) should be equal to currentPosition (%s)" + " after successful read", contentChannelCurrentPosition, currentPosition); } catch (Exception e) { int partialBytes = partiallyReadBytes(remainingBeforeRead, dst); totalBytesRead += partialBytes; currentPosition += partialBytes; logger.atFine().log( "Closing contentChannel after %s exception for '%s'.", e.getMessage(), resourceId); closeContentChannel(); throw convertError(e); } } return totalBytesRead; } private int partiallyReadBytes(int remainingBeforeRead, ByteBuffer dst) { int partialReadBytes = 0; if (remainingBeforeRead != dst.remaining()) { partialReadBytes = remainingBeforeRead - dst.remaining(); } return partialReadBytes; } private boolean shouldDetectRandomAccess() { return !gzipEncoded && !randomAccess && readOptions.getFadvise() == Fadvise.AUTO; } private void setRandomAccess() { randomAccess = true; } private ReadableByteChannel openByteChannel(long bytesToRead) throws IOException { checkArgument( bytesToRead > 0, "bytesToRead should be greater than 0, but was %s", bytesToRead); checkState( byteChannel == null && contentChannelEnd < 0, "contentChannel and contentChannelEnd should be not initialized yet for '%s'", resourceId); if (footerContent != null && currentPosition >= objectSize - footerContent.length) { return serveFooterContent(); } setChannelBoundaries(bytesToRead); ReadableByteChannel readableByteChannel = getStorageReadChannel(contentChannelCurrentPosition, contentChannelEnd); if (contentChannelEnd == objectSize && (contentChannelEnd - contentChannelCurrentPosition) <= readOptions.getMinRangeRequestSize()) { if (footerContent == null) { cacheFooter(readableByteChannel); } return 
serveFooterContent(); } return readableByteChannel; } private void setChannelBoundaries(long bytesToRead) { contentChannelCurrentPosition = getRangeRequestStart(); contentChannelEnd = getRangeRequestEnd(contentChannelCurrentPosition, bytesToRead); checkState( contentChannelEnd >= contentChannelCurrentPosition, String.format( "Start position should be <= endPosition startPosition:%d, endPosition: %d", contentChannelCurrentPosition, contentChannelEnd)); } private void cacheFooter(ReadableByteChannel readableByteChannel) throws IOException { int footerSize = toIntExact(objectSize - contentChannelCurrentPosition); footerContent = new byte[footerSize]; try (InputStream footerStream = Channels.newInputStream(readableByteChannel)) { int totalBytesRead = 0; int bytesRead; do { bytesRead = footerStream.read(footerContent, totalBytesRead, footerSize - totalBytesRead); if (bytesRead >= 0) { totalBytesRead += bytesRead; } } while (bytesRead >= 0 && totalBytesRead < footerSize); checkState( bytesRead >= 0, "footerStream shouldn't be empty before reading the footer of size %s, totalBytesRead %s, read via last call %s, for '%s'", footerSize, totalBytesRead, bytesRead, resourceId); checkState( totalBytesRead == footerSize, "totalBytesRead (%s) should equal footerSize (%s) for '%s'", totalBytesRead, footerSize, resourceId); } catch (Exception e) { footerContent = null; throw e; } logger.atFiner().log("Prefetched %s bytes footer for '%s'", footerContent.length, resourceId); } private ReadableByteChannel serveFooterContent() { contentChannelCurrentPosition = currentPosition; int offset = toIntExact(currentPosition - (objectSize - footerContent.length)); int length = footerContent.length - offset; logger.atFiner().log( "Opened channel (prefetched footer) from %d position for '%s'", currentPosition, resourceId); return Channels.newChannel(new ByteArrayInputStream(footerContent, offset, length)); } private long getRangeRequestStart() { if (gzipEncoded) { return 0; } if 
(readOptions.getFadvise() != Fadvise.SEQUENTIAL && isFooterRead()) { // Prefetch footer and adjust start position to footerStart. return max(0, objectSize - readOptions.getMinRangeRequestSize()); } return currentPosition; } private long getRangeRequestEnd(long startPosition, long bytesToRead) { // Always read gzip-encoded files till the end - they do not support range reads. if (gzipEncoded) { return objectSize; } long endPosition = objectSize; if (randomAccess) { // opening a channel for whole object doesn't make sense as anyhow it will not be utilized // for further reads. endPosition = startPosition + max(bytesToRead, readOptions.getMinRangeRequestSize()); } if (footerContent != null) { // If footer is cached open just till footerStart. // Remaining content ill be served from cached footer itself. endPosition = min(endPosition, objectSize - footerContent.length); } return endPosition; } public void closeContentChannel() { if (byteChannel != null) { logger.atFiner().log("Closing internal contentChannel for '%s'", resourceId); try { byteChannel.close(); } catch (Exception e) { logger.atFine().withCause(e).log( "Got an exception on contentChannel.close() for '%s'; ignoring it.", resourceId); } finally { byteChannel = null; reset(); } } } private void reset() { checkState(byteChannel == null, "contentChannel should be null for '%s'", resourceId); contentChannelCurrentPosition = -1; contentChannelEnd = -1; } private boolean isInRangeSeek() { long seekDistance = currentPosition - contentChannelCurrentPosition; if (byteChannel != null && seekDistance > 0 // for gzip encoded content always seek in place && (gzipEncoded || seekDistance <= readOptions.getInplaceSeekLimit()) && currentPosition < contentChannelEnd) { return true; } return false; } private void skipInPlace() { if (skipBuffer == null) { skipBuffer = new byte[SKIP_BUFFER_SIZE]; } long seekDistance = currentPosition - contentChannelCurrentPosition; while (seekDistance > 0 && byteChannel != null) { try { int 
bufferSize = toIntExact(min(skipBuffer.length, seekDistance)); int bytesRead = byteChannel.read(ByteBuffer.wrap(skipBuffer, 0, bufferSize)); if (bytesRead < 0) { logger.atInfo().log( "Somehow read %d bytes trying to skip %d bytes to seek to position %d, size: %d", bytesRead, seekDistance, currentPosition, objectSize); closeContentChannel(); } else { seekDistance -= bytesRead; contentChannelCurrentPosition += bytesRead; } } catch (Exception e) { logger.atInfo().withCause(e).log( "Got an IO exception on contentChannel.read(), a lazy-seek will be pending for '%s'", resourceId); closeContentChannel(); } } checkState( byteChannel == null || contentChannelCurrentPosition == currentPosition, "contentChannelPosition (%s) should be equal to currentPosition (%s)" + " after successful in-place skip", contentChannelCurrentPosition, currentPosition); } private void performPendingSeeks() { // Return quickly if there is no pending seek operation, i.e. position didn't change. if (currentPosition == contentChannelCurrentPosition && byteChannel != null) { return; } logger.atFiner().log( "Performing lazySeek from %s to %s position '%s'", contentChannelCurrentPosition, currentPosition, resourceId); if (isInRangeSeek()) { skipInPlace(); } else { if (isRandomAccessPattern()) { setRandomAccess(); } // close existing contentChannel as requested bytes can't be served from current // contentChannel; closeContentChannel(); } } private boolean isRandomAccessPattern() { if (!shouldDetectRandomAccess()) { return false; } if (currentPosition < contentChannelCurrentPosition) { logger.atFine().log( "Detected backward read from %s to %s position, switching to random IO for '%s'", contentChannelCurrentPosition, currentPosition, resourceId); return true; } if (contentChannelCurrentPosition >= 0 && contentChannelCurrentPosition + readOptions.getInplaceSeekLimit() < currentPosition) { logger.atFine().log( "Detected forward read from %s to %s position over %s threshold," + " switching to random IO for 
'%s'", contentChannelCurrentPosition, currentPosition, readOptions.getInplaceSeekLimit(), resourceId); return true; } return false; } private ReadableByteChannel getStorageReadChannel(long seek, long limit) throws IOException { ReadChannel readChannel = storage.reader(blobId, generateReadOptions(blobId)); try { readChannel.seek(seek); readChannel.limit(limit); return readChannel; } catch (Exception e) { GoogleCloudStorageEventBus.postOnException(); throw new IOException( String.format( "Unable to update the boundaries/Range of contentChannel %s", resourceId.toString()), e); } } private BlobSourceOption[] generateReadOptions(BlobId blobId) { List blobReadOptions = new ArrayList<>(); // To get decoded content blobReadOptions.add(BlobSourceOption.shouldReturnRawInputStream(false)); if (blobId.getGeneration() != null) { blobReadOptions.add(BlobSourceOption.generationMatch(blobId.getGeneration())); } if (storageOptions.getEncryptionKey() != null) { blobReadOptions.add( BlobSourceOption.decryptionKey(storageOptions.getEncryptionKey().value())); } return blobReadOptions.toArray(new BlobSourceOption[blobReadOptions.size()]); } private boolean isFooterRead() { return objectSize - currentPosition <= readOptions.getMinRangeRequestSize(); } } @VisibleForTesting boolean randomAccessStatus() { return contentReadChannel.randomAccess; } private static void validate(GoogleCloudStorageItemInfo itemInfo) throws IOException { checkNotNull(itemInfo, "itemInfo cannot be null"); StorageResourceId resourceId = itemInfo.getResourceId(); checkArgument( resourceId.isStorageObject(), "Can not open a non-file object for read: %s", resourceId); if (!itemInfo.exists()) { GoogleCloudStorageEventBus.postOnException(); throw new FileNotFoundException(String.format("Item not found: %s", resourceId)); } } private IOException convertError(Exception error) { String msg = String.format("Error reading '%s'", resourceId); switch (errorExtractor.getErrorType(error)) { case NOT_FOUND: return 
createFileNotFoundException( resourceId.getBucketName(), resourceId.getObjectName(), new IOException(msg, error)); case OUT_OF_RANGE: return (IOException) new EOFException(msg).initCause(error); default: return new IOException(msg, error); } } /** Validates that the given position is valid for this channel. */ private void validatePosition(long position) throws IOException { if (position < 0) { GoogleCloudStorageEventBus.postOnException(); throw new EOFException( String.format( "Invalid seek offset: position value (%d) must be >= 0 for '%s'", position, resourceId)); } if (objectSize >= 0 && position >= objectSize) { GoogleCloudStorageEventBus.postOnException(); throw new EOFException( String.format( "Invalid seek offset: position value (%d) must be between 0 and %d for '%s'", position, objectSize, resourceId)); } } /** Throws if this channel is not currently open. */ private void throwIfNotOpen() throws IOException { if (!isOpen()) { GoogleCloudStorageEventBus.postOnException(); throw new ClosedChannelException(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy