
/**
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.gcsio;

import com.google.api.client.http.HttpResponse;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ExponentialBackOff;
import com.google.api.client.util.NanoClock;
import com.google.api.client.util.Sleeper;
import com.google.api.services.storage.Storage;
import com.google.api.services.storage.model.StorageObject;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.cloud.hadoop.util.ClientRequestHelper;
import com.google.cloud.hadoop.util.LogUtil;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.ReadableByteChannel;
import java.util.regex.Pattern;

import javax.net.ssl.SSLException;
/**
* Provides seekable read access to GCS.
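 *
 * <p>Example usage (a minimal sketch; it assumes an already-constructed
 * {@code GoogleCloudStorage} instance named {@code gcs} and a {@code StorageResourceId}
 * named {@code resourceId}):
 *
 * <pre>{@code
 * ByteBuffer buffer = ByteBuffer.allocate(8192);
 * try (SeekableReadableByteChannel channel = gcs.open(resourceId)) {
 *   channel.position(1024); // Records the target offset; the actual seek is deferred.
 *   int bytesRead = channel.read(buffer); // Opens the stream at offset 1024, then reads.
 * }
 * }</pre>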
*/
public class GoogleCloudStorageReadChannel
    implements SeekableReadableByteChannel {
  // Logger.
  private static LogUtil log = new LogUtil(GoogleCloudStorageReadChannel.class);

  // Used to separate the total-size element from the rest of a Content-Range header value,
  // e.g. "bytes 100-4999/5000".
  private static final Pattern SLASH = Pattern.compile("/");

  // GCS access instance.
  private Storage gcs;

  // Name of the bucket containing the object being read.
  private String bucketName;

  // Name of the object being read.
  private String objectName;

  // Read channel.
  private ReadableByteChannel readChannel;

  // True if this channel is open, false otherwise.
  private boolean channelIsOpen;

  // Current read position in the channel.
  private long currentPosition = -1;

  // When a caller calls position(long) to set the stream position, we record the target position
  // and defer the actual seek operation until the caller tries to read from the channel.
  // This allows us to avoid an unnecessary seek to position 0 that would take place on creation
  // of this instance in cases where the caller intends to start reading at some other offset.
  // If lazySeekPending is set to true, it indicates that a target position has been set
  // but the actual seek operation is still pending.
  private boolean lazySeekPending;

  // Size of the object being read.
  private long size = -1;
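
  // Whether the object is served with gzip Content-Encoding; if so, the reported content length
  // counts compressed bytes, so end-of-stream position checks against 'size' are skipped.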
  private boolean isCompressedStream;

  // Maximum number of automatic retries when reading from the underlying channel without making
  // progress; each time at least one byte is successfully read, the counter of attempted retries
  // is reset.
  // TODO(user): Wire this setting out to GHFS; it should correspond to adding the wiring for
  // setting the equivalent value inside HttpRequest.java which determines the low-level retries
  // during "execute()" calls. The default in HttpRequest.java is also 10.
  private int maxRetries = 10;

  // Helper delegate for turning IOExceptions from API calls into higher-level semantics.
  private final ApiErrorExtractor errorExtractor;

  // Request helper to use to set extra headers.
  private final ClientRequestHelper clientRequestHelper;

  // Sleeper used for waiting between retries.
  private Sleeper sleeper = Sleeper.DEFAULT;

  // The clock used by ExponentialBackOff to determine when the maximum total elapsed time has
  // passed doing a series of retries.
  private NanoClock clock = NanoClock.SYSTEM;

  // Lazily initialized BackOff for sleeping between retries; only ever initialized if a retry is
  // necessary.
  private BackOff backOff = null;

  // Settings used for instantiating the default BackOff used for determining wait time between
  // retries. TODO(user): Wire these out to be settable by the Hadoop configs.

  // The number of milliseconds to wait before the very first retry in a series of retries.
  public static final int DEFAULT_BACKOFF_INITIAL_INTERVAL_MILLIS = 200;

  // The amount of jitter introduced when computing the next retry sleep interval so that when
  // many clients are retrying, they don't all retry at the same time.
  public static final double DEFAULT_BACKOFF_RANDOMIZATION_FACTOR = 0.5;

  // The base of the exponent used for exponential backoff; each subsequent sleep interval is
  // roughly this many times the previous interval.
  public static final double DEFAULT_BACKOFF_MULTIPLIER = 1.5;

  // The maximum amount of sleep between retries; at this point, there will be no further
  // exponential backoff. This prevents intervals from growing unreasonably large.
  public static final int DEFAULT_BACKOFF_MAX_INTERVAL_MILLIS = 10 * 1000;

  // The maximum total time elapsed since the first retry over the course of a series of retries.
  // This makes it easier to bound the maximum time it takes to respond to a permanent failure
  // without having to calculate the summation of a series of exponentiated intervals while
  // accounting for the randomization of backoff intervals.
  public static final int DEFAULT_BACKOFF_MAX_ELAPSED_TIME_MILLIS = 2 * 60 * 1000;
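
  // For illustration, ignoring the +/-50% randomization factor: with these defaults the sleep
  // intervals grow as roughly 200ms, 300ms, 450ms, 675ms, ... (multiplying by 1.5 each time),
  // capped at 10s per sleep, with the whole retry series abandoned once about 2 minutes of
  // total elapsed time have passed.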

  /**
   * Constructs an instance of GoogleCloudStorageReadChannel.
   *
   * @param gcs storage object instance
   * @param bucketName name of the bucket containing the object to read
   * @param objectName name of the object to read
   * @param errorExtractor helper for turning IOExceptions from API calls into
   *     higher-level semantics
   * @param requestHelper a ClientRequestHelper used to set any extra headers
   * @throws FileNotFoundException if the given object does not exist
   * @throws IOException on IO error
   */
  GoogleCloudStorageReadChannel(
      Storage gcs,
      String bucketName,
      String objectName,
      ApiErrorExtractor errorExtractor,
      ClientRequestHelper requestHelper)
      throws IOException {
    this.gcs = gcs;
    this.clientRequestHelper = requestHelper;
    this.bucketName = bucketName;
    this.objectName = objectName;
    this.errorExtractor = errorExtractor;
    channelIsOpen = true;
    position(0);
  }

  /**
   * Constructs an instance of GoogleCloudStorageReadChannel.
   * Used for unit testing only. Do not use elsewhere.
   *
   * @throws IOException on IO error
   */
  @VisibleForTesting
  GoogleCloudStorageReadChannel()
      throws IOException {
    this.clientRequestHelper = null;
    this.errorExtractor = null;
    channelIsOpen = true;
    position(0);
  }

  /**
   * Sets the Sleeper used for sleeping between retries.
   */
  @VisibleForTesting
  void setSleeper(Sleeper sleeper) {
    Preconditions.checkArgument(sleeper != null, "sleeper must not be null!");
    this.sleeper = sleeper;
  }

  /**
   * Sets the clock to be used for determining when max total time has elapsed doing retries.
   */
  @VisibleForTesting
  void setNanoClock(NanoClock clock) {
    Preconditions.checkArgument(clock != null, "clock must not be null!");
    this.clock = clock;
  }

  /**
   * Sets the backoff for determining sleep duration between retries.
   *
   * @param backOff May be null to force the next usage to auto-initialize with default settings.
   */
  @VisibleForTesting
  void setBackOff(BackOff backOff) {
    this.backOff = backOff;
  }

  /**
   * Gets the backoff used for determining sleep duration between retries. May be null if it was
   * never lazily initialized.
   */
  @VisibleForTesting
  BackOff getBackOff() {
    return backOff;
  }

  /**
   * Helper for initializing the BackOff used for retries.
   */
  private BackOff createBackOff() {
    return new ExponentialBackOff.Builder()
        .setInitialIntervalMillis(DEFAULT_BACKOFF_INITIAL_INTERVAL_MILLIS)
        .setRandomizationFactor(DEFAULT_BACKOFF_RANDOMIZATION_FACTOR)
        .setMultiplier(DEFAULT_BACKOFF_MULTIPLIER)
        .setMaxIntervalMillis(DEFAULT_BACKOFF_MAX_INTERVAL_MILLIS)
        .setMaxElapsedTimeMillis(DEFAULT_BACKOFF_MAX_ELAPSED_TIME_MILLIS)
        .setNanoClock(clock)
        .build();
  }

  /**
   * Sets the number of times to automatically retry by re-opening the underlying readChannel
   * whenever an exception occurs while reading from it. The count of attempted retries is reset
   * whenever at least one byte is successfully read, so this number of retries refers to retries
   * made without achieving any forward progress.
   */
  public void setMaxRetries(int maxRetries) {
    this.maxRetries = maxRetries;
  }

  /**
   * Reads from this channel and stores read data in the given buffer.
   *
   * @param buffer buffer to read data into
   * @return number of bytes read or -1 on end-of-stream
   * @throws IOException on IO error
   */
  @Override
  public int read(ByteBuffer buffer)
      throws IOException {
    throwIfNotOpen();

    // Don't try to read if the buffer has no space.
    if (buffer.remaining() == 0) {
      return 0;
    }

    // Perform a lazy seek if not done already.
    performLazySeek();

    int totalBytesRead = 0;
    int retriesAttempted = 0;

    // We read from a streaming source. We may not get all the bytes we asked for
    // in the first read. Therefore, loop until we either read the required number of
    // bytes or we reach end-of-stream.
    do {
      int remainingBeforeRead = buffer.remaining();
      try {
        int numBytesRead = readChannel.read(buffer);
        Preconditions.checkState(numBytesRead != 0, "Read 0 bytes without blocking!");
        if (numBytesRead < 0) {
          // Check that we didn't get a premature End of Stream signal by checking the number of
          // bytes read against the stream size. Unfortunately we don't have information about the
          // actual size of the data stream when stream compression is used, so we can only ignore
          // this case here.
          Preconditions.checkState(isCompressedStream || currentPosition == size,
              "Received end of stream result before all the file data has been received; "
              + "totalBytesRead: %s, currentPosition: %s, size: %s",
              totalBytesRead, currentPosition, size);
          break;
        }
        totalBytesRead += numBytesRead;
        currentPosition += numBytesRead;

        // The count of retriesAttempted is per low-level readChannel.read call; each time we make
        // progress we reset the retry counter.
        retriesAttempted = 0;
      } catch (IOException ioe) {
        // TODO(user): Refactor any reusable logic for retries into a separate RetryHelper class.
        if (retriesAttempted == maxRetries) {
          log.error(
              "Already attempted max of %d retries while reading '%s'; throwing exception.",
              maxRetries, StorageResourceId.createReadableString(bucketName, objectName));
          throw ioe;
        } else {
          if (retriesAttempted == 0) {
            // If this is the first of a series of retries, we also want to reset the backOff
            // to have fresh initial values.
            if (backOff == null) {
              backOff = createBackOff();
            } else {
              backOff.reset();
            }
          }
          ++retriesAttempted;
          log.warn("Got exception: %s while reading '%s'; retry # %d. Sleeping...",
              ioe.getMessage(), StorageResourceId.createReadableString(bucketName, objectName),
              retriesAttempted);
          try {
            boolean backOffSuccessful = BackOffUtils.next(sleeper, backOff);
            if (!backOffSuccessful) {
              log.error("BackOff returned false; maximum total elapsed time exhausted. Giving up "
                  + "after %d retries for '%s'", retriesAttempted,
                  StorageResourceId.createReadableString(bucketName, objectName));
              throw ioe;
            }
          } catch (InterruptedException ie) {
            // Restore the thread's interrupt status so that callers can still observe it.
            Thread.currentThread().interrupt();
            log.error("Interrupted while sleeping before retry. Giving up "
                + "after %d retries for '%s'", retriesAttempted,
                StorageResourceId.createReadableString(bucketName, objectName));
            ioe.addSuppressed(ie);
            throw ioe;
          }
          log.info("Done sleeping before retry for '%s'; retry # %d.",
              StorageResourceId.createReadableString(bucketName, objectName), retriesAttempted);

          if (buffer.remaining() != remainingBeforeRead) {
            int partialRead = remainingBeforeRead - buffer.remaining();
            log.info("Despite exception, had partial read of %d bytes; resetting retry count.",
                partialRead);
            retriesAttempted = 0;
            totalBytesRead += partialRead;
            currentPosition += partialRead;
          }

          // Force the stream to be reopened by seeking to the current position.
          long newPosition = currentPosition;
          currentPosition = -1;
          position(newPosition);

          // Before performing the lazy seek, explicitly close the underlying channel if
          // necessary, catching and ignoring SSLException since the retry indicates an error
          // occurred, so there's a high probability that SSL connections would be broken in a
          // way that causes close() itself to throw an exception, even though the underlying
          // sockets have already been cleaned up; close() on an SSLSocketImpl requires a
          // shutdown handshake in order to shut down cleanly, and if the connection has been
          // broken already, then this is not possible, and the SSLSocketImpl was already
          // responsible for performing local cleanup at the time the exception was raised.
          if (lazySeekPending && readChannel != null) {
            try {
              readChannel.close();
              readChannel = null;
            } catch (SSLException ssle) {
              log.warn("Got SSLException on readChannel.close() before retry; ignoring it.", ssle);
              readChannel = null;
            }
            // For "other" exceptions, we'll let them propagate out without setting readChannel to
            // null, in case the caller is able to handle them and then properly try to close()
            // again.
          }
          performLazySeek();
        }
      }
    } while (buffer.remaining() > 0);

    // If this method was called when the stream was already at EOF
    // (indicated by totalBytesRead == 0) then return EOF; otherwise,
    // return the number of bytes read.
    boolean isEndOfStream = (totalBytesRead == 0);
    if (isEndOfStream) {
      // Check that we didn't get a premature End of Stream signal by checking the number of bytes
      // read against the stream size. Unfortunately we don't have information about the actual
      // size of the data stream when stream compression is used, so we can only ignore this case
      // here.
      Preconditions.checkState(isCompressedStream || currentPosition == size,
          "Failed to read any data before all the file data has been received; "
          + "currentPosition: %s, size: %s", currentPosition, size);
      return -1;
    } else {
      return totalBytesRead;
    }
  }

  /**
   * Tells whether this channel is open.
   *
   * @return a value indicating whether this channel is open
   */
  @Override
  public boolean isOpen() {
    return channelIsOpen;
  }

  /**
   * Closes this channel.
   *
   * @throws IOException on IO error
   */
  @Override
  public void close()
      throws IOException {
    throwIfNotOpen();
    channelIsOpen = false;
    if (readChannel != null) {
      readChannel.close();
    }
  }

  /**
   * Returns this channel's current position.
   *
   * @return this channel's current position
   */
  @Override
  public long position()
      throws IOException {
    throwIfNotOpen();
    return currentPosition;
  }

  /**
   * Sets this channel's position.
   *
   * @param newPosition the new position, counting the number of bytes from the beginning.
   * @return this channel instance
   * @throws FileNotFoundException if the underlying object does not exist.
   * @throws IOException on IO error
   */
  @Override
  public SeekableReadableByteChannel position(long newPosition)
      throws IOException {
    throwIfNotOpen();

    // If the position has not changed, avoid the expensive operation.
    if (newPosition == currentPosition) {
      return this;
    }

    validatePosition(newPosition);
    currentPosition = newPosition;
    lazySeekPending = true;
    return this;
  }

  /**
   * Returns size of the object to which this channel is connected.
   *
   * @return size of the object to which this channel is connected
   * @throws IOException on IO error
   */
  @Override
  public long size()
      throws IOException {
    throwIfNotOpen();
    // Perform a lazy seek if not done already so that size of this channel is set correctly.
    performLazySeek();
    return size;
  }

  /**
   * Sets size of this channel to the given value.
   */
  protected void setSize(long size) {
    this.size = size;
  }

  /**
   * Validates that the given position is valid for this channel.
   */
  protected void validatePosition(long newPosition) {
    // Validate: 0 <= newPosition
    if (newPosition < 0) {
      throw new IllegalArgumentException(
          String.format("Invalid seek offset: position value (%d) must be >= 0", newPosition));
    }

    // Validate: newPosition < size
    // Note that we access this.size directly rather than calling size() to avoid initiating a
    // lazy seek that would lead to infinite recursion. We validate newPosition < size only when
    // the size of this channel has been computed by a prior call. This means that position could
    // potentially be set to an invalid value (>= size) by position(long). However, that error
    // gets caught during the lazy seek.
    if ((size >= 0) && (newPosition >= size)) {
      throw new IllegalArgumentException(
          String.format(
              "Invalid seek offset: position value (%d) must be between 0 and %d",
              newPosition, size));
    }
  }

  /**
   * Seeks to the given position in the underlying stream.
   *
   * <p>Note: Seek is an expensive operation because a new stream is opened each time.
   *
   * @throws FileNotFoundException if the underlying object does not exist.
   * @throws IOException on IO error
   */
  private void performLazySeek()
      throws IOException {
    // Return quickly if there is no pending seek operation.
    if (!lazySeekPending) {
      return;
    }

    // Close the underlying channel if it is open.
    if (readChannel != null) {
      readChannel.close();
    }

    InputStream objectContentStream = openStreamAndSetSize(currentPosition);
    readChannel = Channels.newChannel(objectContentStream);
    lazySeekPending = false;
  }

  /**
   * Opens the underlying stream, sets its position to the given value and sets size based on
   * stream content size.
   *
   * @param newPosition position to seek into the new stream.
   * @throws IOException on IO error
   */
  protected InputStream openStreamAndSetSize(long newPosition)
      throws IOException {
    validatePosition(newPosition);
    Storage.Objects.Get getObject = gcs.objects().get(bucketName, objectName);
    // Set the range on the existing request headers which may have been initialized with things
    // like user-agent already.
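    // For example, "bytes=100-" requests everything from offset 100 through the end of the
    // object.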
    clientRequestHelper.getRequestHeaders(getObject)
        .setRange(String.format("bytes=%d-", newPosition));
    HttpResponse response;
    try {
      response = getObject.executeMedia();
    } catch (IOException e) {
      if (errorExtractor.itemNotFound(e)) {
        throw GoogleCloudStorageExceptions.getFileNotFoundException(bucketName, objectName);
      } else if (errorExtractor.rangeNotSatisfiable(e)
          && newPosition == 0
          && size == -1) {
        // We don't know the size yet (size == -1) and we're seeking to byte 0, but got 'range
        // not satisfiable'; the object must be empty.
        log.info("Got 'range not satisfiable' for reading %s at position 0; assuming empty.",
            StorageResourceId.createReadableString(bucketName, objectName));
        size = 0;
        return new ByteArrayInputStream(new byte[0]);
      } else {
        String msg = String.format("Error reading %s at position %d",
            StorageResourceId.createReadableString(bucketName, objectName), newPosition);
        throw new IOException(msg, e);
      }
    }

    // If the content is compressed, the content length reported in the header counts the number
    // of compressed bytes. That means that we cannot rely on the reported content length to check
    // that we have received all the data from the data stream.
    String contentEncoding = response.getContentEncoding();
    isCompressedStream = (contentEncoding != null && contentEncoding.contains("gzip"));

    String contentRange = response.getHeaders().getContentRange();
    if (response.getHeaders().getContentLength() != null) {
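      // The ranged request returns only the bytes from newPosition onward, so the total object
      // size is the remaining length plus the starting offset (e.g. 4000 remaining bytes from
      // offset 1000 implies a 5000-byte object).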
      size = response.getHeaders().getContentLength() + newPosition;
    } else if (contentRange != null) {
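      // Fall back to parsing the total size out of Content-Range, which has the form
      // "bytes 1000-4999/5000"; the total object size follows the '/'.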
      String sizeStr = SLASH.split(contentRange)[1];
      try {
        size = Long.parseLong(sizeStr);
      } catch (NumberFormatException e) {
        throw new IOException(
            "Could not determine size from response from Content-Range: " + contentRange, e);
      }
    } else {
      throw new IOException("Could not determine size of response");
    }
    return response.getContent();
  }

  /**
   * Throws if this channel is not currently open.
   */
  private void throwIfNotOpen()
      throws IOException {
    if (!isOpen()) {
      throw new ClosedChannelException();
    }
  }
}