com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream (gcs-connector)
An implementation of org.apache.hadoop.fs.FileSystem targeting Google Cloud Storage
/**
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.fs.gcs;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadOptions;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A seekable and positionable FSInputStream that provides read access to a file.
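*
* <p>A minimal usage sketch (illustrative only; in practice instances are created internally by
* the enclosing file system's {@code open} implementation and wrapped in an
* {@code FSDataInputStream}):
*
* <pre>{@code
*   GoogleHadoopFSInputStream in =
*       new GoogleHadoopFSInputStream(ghfs, gcsPath, bufferSize, statistics);
*   byte[] buf = new byte[8192];
*   int numRead = in.read(buf, 0, buf.length);  // -1 on EOF
*   in.close();
* }</pre>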
*/
class GoogleHadoopFSInputStream
extends FSInputStream {
// Logging helper.
private static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFSInputStream.class);
// Instance of GoogleHadoopFileSystemBase.
private GoogleHadoopFileSystemBase ghfs;
// All store IO access goes through this.
private SeekableByteChannel channel;
// Internal buffer.
private ByteBuffer buffer;
// Path of the file to read.
private URI gcsPath;
// Number of bytes read through this channel.
private long totalBytesRead;
// Statistics tracker provided by the parent GoogleHadoopFileSystemBase for recording
// numbers of bytes read.
private final FileSystem.Statistics statistics;
// Time of initialization
private long initTime;
// Used for single-byte reads.
private final byte[] singleReadBuf = new byte[1];
/**
* Constructs an instance of GoogleHadoopFSInputStream.
*
* @param ghfs Instance of GoogleHadoopFileSystemBase.
* @param gcsPath Path of the file to read from.
* @param bufferSize Size of the buffer to use.
* @param statistics File system statistics object.
* @throws IOException if an IO error occurs.
*/
GoogleHadoopFSInputStream(
GoogleHadoopFileSystemBase ghfs, URI gcsPath, int bufferSize,
FileSystem.Statistics statistics)
throws IOException {
LOG.debug("GoogleHadoopFSInputStream({}, {})", gcsPath, bufferSize);
this.ghfs = ghfs;
this.gcsPath = gcsPath;
this.statistics = statistics;
initTime = System.nanoTime();
totalBytesRead = 0;
boolean enableInternalBuffer = ghfs.getConf().getBoolean(
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_KEY,
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_INTERNALBUFFER_ENABLE_DEFAULT);
LOG.debug("enableInternalBuffer: {}", enableInternalBuffer);
boolean supportContentEncoding = ghfs.getConf().getBoolean(
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_KEY,
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_SUPPORT_CONTENT_ENCODING_ENABLE_DEFAULT);
LOG.debug("supportContentEncoding: {}", supportContentEncoding);
boolean fastFailOnNotFound = ghfs.getConf().getBoolean(
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE_KEY,
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_FAST_FAIL_ON_NOT_FOUND_ENABLE_DEFAULT);
LOG.debug("fastFailOnNotFound: {}", fastFailOnNotFound);
long inplaceSeekLimit = ghfs.getConf().getLong(
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_KEY,
GoogleHadoopFileSystemBase.GCS_INPUTSTREAM_INPLACE_SEEK_LIMIT_DEFAULT);
LOG.debug("inplaceSeekLimit: {}", inplaceSeekLimit);
GoogleCloudStorageReadOptions.Builder readOptions = new GoogleCloudStorageReadOptions.Builder()
.setSupportContentEncoding(supportContentEncoding)
.setFastFailOnNotFound(fastFailOnNotFound)
.setInplaceSeekLimit(inplaceSeekLimit);
if (enableInternalBuffer) {
buffer = ByteBuffer.allocate(bufferSize);
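// Mark the freshly allocated buffer as empty so the first read triggers a refill from the channel.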
buffer.limit(0);
buffer.rewind();
// If we're using a buffer in this layer, skip the lower-level buffer.
readOptions.setBufferSize(0);
} else {
buffer = null;
// If not using internal buffer, let the lower-level channel figure out how to do buffering.
readOptions.setBufferSize(bufferSize);
}
channel = ghfs.getGcsFs().open(gcsPath, readOptions.build());
}
/**
* Reads a single byte from the underlying store.
*
* @return A single byte from the underlying store or -1 on EOF.
* @throws IOException if an IO error occurs.
*/
@Override
public synchronized int read()
throws IOException {
long startTime = System.nanoTime();
byte b;
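// Two paths: read a single byte straight from the channel when the internal buffer is disabled,
// otherwise serve the byte from the internal buffer, refilling it from the channel as needed.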
if (buffer == null) {
// TODO(user): Wrap this in a while-loop if we ever introduce a non-blocking mode for the
// underlying channel.
int numRead = channel.read(ByteBuffer.wrap(singleReadBuf));
if (numRead == -1) {
return -1;
} else if (numRead != 1) {
throw new IOException(String.format(
"Somehow read %d bytes using single-byte buffer for path %s ending in position %d!",
numRead, gcsPath, channel.position()));
}
b = singleReadBuf[0];
} else {
// Refill the internal buffer if necessary.
if (!buffer.hasRemaining()) {
buffer.clear();
int numBytesRead = channel.read(buffer);
if (numBytesRead <= 0) {
buffer.limit(0);
buffer.rewind();
return -1;
}
buffer.flip();
}
b = buffer.get();
}
totalBytesRead++;
statistics.incrementBytesRead(1);
long duration = System.nanoTime() - startTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ1);
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ1_TIME, duration);
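// Mask to an unsigned value in [0, 255]; InputStream.read() reserves negative values for EOF (-1).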
return (b & 0xff);
}
/**
* Reads up to length bytes from the underlying store and stores
* them starting at the specified offset in the given buffer.
* Fewer than length bytes may be returned.
*
* @param buf The buffer into which data is returned.
* @param offset The offset at which data is written.
* @param length Maximum number of bytes to read.
*
* @return Number of bytes read or -1 on EOF.
* @throws IOException if an IO error occurs.
*/
@Override
public int read(byte[] buf, int offset, int length)
throws IOException {
long startTime = System.nanoTime();
Preconditions.checkNotNull(buf, "buf must not be null");
if (offset < 0 || length < 0 || length > buf.length - offset) {
throw new IndexOutOfBoundsException();
}
int numRead = 0;
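// Without an internal buffer, read directly into the caller's array; otherwise copy out of the
// internal buffer, refilling it from the channel whenever it runs dry.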
if (buffer == null) {
numRead = channel.read(ByteBuffer.wrap(buf, offset, length));
} else {
while (numRead < length) {
int needToRead = length - numRead;
if (buffer.remaining() >= needToRead) {
// There are sufficient bytes, we'll only read a (not-necessarily-proper) subset of the
// internal buffer.
buffer.get(buf, offset + numRead, needToRead);
numRead += needToRead;
} else if (buffer.hasRemaining()) {
// We must take everything from the buffer and loop again.
int singleRead = buffer.remaining();
buffer.get(buf, offset + numRead, singleRead);
numRead += singleRead;
} else {
// Buffer is empty AND we still need more bytes to be read.
long channelTime = System.nanoTime();
buffer.clear();
int numNewBytes = channel.read(buffer);
long channelDuration = System.nanoTime() - channelTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_FROM_CHANNEL);
ghfs.increment(
GoogleHadoopFileSystemBase.Counter.READ_FROM_CHANNEL_TIME, channelDuration);
if (numNewBytes <= 0) {
// Ran out of underlying channel bytes.
buffer.limit(0);
buffer.rewind();
if (numRead == 0) {
// Never read anything at all; return -1 to indicate EOF. Otherwise, we'll leave
// numRead untouched and return the number of bytes we did manage to retrieve.
numRead = -1;
}
break;
} else {
// Successfully got some new bytes from the channel; keep looping.
buffer.flip();
}
}
}
}
if (numRead > 0) {
// A result of -1 means we read 0 bytes despite requesting at least one, so only positive
// counts are recorded.
statistics.incrementBytesRead(numRead);
totalBytesRead += numRead;
}
long duration = System.nanoTime() - startTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ);
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_TIME, duration);
return numRead;
}
/**
* Reads up to length bytes from the underlying store and stores
* them starting at the specified offset in the given buffer.
* Fewer than length bytes may be returned. Reading starts at the
* given position.
*
* @param position Data is read from the stream starting at this position.
* @param buf The buffer into which data is returned.
* @param offset The offset at which data is written.
* @param length Maximum number of bytes to read.
*
* @return Number of bytes read or -1 on EOF.
* @throws IOException if an IO error occurs.
*/
@Override
public int read(long position, byte[] buf, int offset, int length)
throws IOException {
long startTime = System.nanoTime();
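// Delegates to FSInputStream's default positional read, which seeks to 'position', reads,
// and then restores the previous stream position.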
int result = super.read(position, buf, offset, length);
long duration = System.nanoTime() - startTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_POS);
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_POS_TIME, duration);
return result;
}
/**
* Gets the current position within the file being read.
*
* @return The current position within the file being read.
* @throws IOException if an IO error occurs.
*/
@Override
public synchronized long getPos()
throws IOException {
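// The channel position runs ahead of the logical stream position by the number of buffered
// bytes that have not yet been consumed.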
int bufRemaining = (buffer == null ? 0 : buffer.remaining());
long pos = channel.position() - bufRemaining;
LOG.debug("getPos: {}", pos);
return pos;
}
/**
* Sets the current position within the file being read.
*
* @param pos The position to seek to.
* @throws IOException if an IO error occurs or if the target position is invalid.
*/
@Override
public synchronized void seek(long pos)
throws IOException {
long startTime = System.nanoTime();
LOG.debug("seek: {}", pos);
if (buffer == null) {
try {
channel.position(pos);
} catch (IllegalArgumentException e) {
throw new IOException(e);
}
} else {
long curPos = getPos();
if (curPos == pos) {
LOG.debug("Skipping no-op seek.");
} else if (pos < curPos && curPos - pos <= buffer.position()) {
// Seeking backwards by few enough bytes that they are still held in the current buffer,
// so we only need to reposition within the buffer.
long skipBack = curPos - pos;
// Guaranteed safe to cast as an (int) because curPos - pos is <= buffer.position(), and
// position() is itself an int.
int newBufferPosition = buffer.position() - (int) skipBack;
LOG.debug("Skipping backward {} bytes in-place from buffer pos {} to new pos {}",
skipBack, buffer.position(), newBufferPosition);
buffer.position(newBufferPosition);
} else if (curPos < pos && pos < channel.position()) {
// Skip forwards; the bytes between curPos and channel.position() are already available
// in the buffer.
long skipBytes = pos - curPos;
Preconditions.checkState(skipBytes < buffer.remaining(),
"skipBytes (%s) must be less than buffer.remaining() (%s)",
skipBytes, buffer.remaining());
// We know skipBytes is castable as (int) even if the top-level position is capable of
// overflowing an int, since we at least assert that skipBytes < buffer.remaining(),
// which is itself less than Integer.MAX_VALUE.
int newBufferPosition = buffer.position() + (int) skipBytes;
LOG.debug("Skipping {} bytes in-place from buffer pos {} to new pos {}",
skipBytes, buffer.position(), newBufferPosition);
buffer.position(newBufferPosition);
} else {
LOG.debug("New position '{}' out of range of inplace buffer, with curPos ({}), "
+ "buffer.position() ({}) and buffer.remaining() ({}).",
pos, curPos, buffer.position(), buffer.remaining());
try {
channel.position(pos);
} catch (IllegalArgumentException e) {
throw new IOException(e);
}
buffer.limit(0);
buffer.rewind();
}
}
long duration = System.nanoTime() - startTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.SEEK);
ghfs.increment(GoogleHadoopFileSystemBase.Counter.SEEK_TIME, duration);
}
/**
* Seeks a different copy of the data. Not supported.
*
* @return true if a new source is found, false otherwise.
*/
@Override
public synchronized boolean seekToNewSource(long targetPos)
throws IOException {
return false;
}
/**
* Closes the current stream.
*
* @throws IOException if an IO error occurs.
*/
@Override
public synchronized void close()
throws IOException {
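// Closing is idempotent: the channel reference is cleared so subsequent close() calls are no-ops.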
if (channel != null) {
long startTime = System.nanoTime();
try {
LOG.debug("close: file: {}, totalBytesRead: {}", gcsPath, totalBytesRead);
channel.close();
long duration = System.nanoTime() - startTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_CLOSE);
ghfs.increment(GoogleHadoopFileSystemBase.Counter.READ_CLOSE_TIME, duration);
long streamDuration = System.nanoTime() - initTime;
ghfs.increment(GoogleHadoopFileSystemBase.Counter.INPUT_STREAM);
ghfs.increment(
GoogleHadoopFileSystemBase.Counter.INPUT_STREAM_TIME, streamDuration);
} finally {
channel = null;
}
}
}
/**
* Indicates whether this stream supports the 'mark' functionality.
*
* @return false (functionality not supported).
*/
@Override
public boolean markSupported() {
// HDFS does not support it either and most Hadoop tools do not expect it.
return false;
}
}