com.sap.hana.datalake.files.directaccess.BaseDirectAccessInputStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sap-hdlfs Show documentation
Show all versions of sap-hdlfs Show documentation
An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
// © 2023 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess;
import com.sap.hana.datalake.files.classification.InterfaceAudience;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@InterfaceAudience.Private
public abstract class BaseDirectAccessInputStream extends InputStream implements Seekable, PositionedReadable {
// Logger shared by all instances of this stream.
private static final Logger LOG = LoggerFactory.getLogger(BaseDirectAccessInputStream.class);
// Response header read by getObjectMetadata to detect a change of the backend object between chunks.
private static final String ETAG_HEADER = "ETag";
// Request header used by buildRangeHeader to ask the backend for a single chunk.
private static final String RANGE_HEADER = "Range";
// Validated to be >= 0 by the constructor; not used in this class itself.
// NOTE(review): presumably consumed by subclasses when refreshing signed URLs; unit is not visible here - confirm.
protected final int signedUrlExpirationSafetyMargin;
// File system handle passed in by the creator; not used in this class itself.
protected final WebHdfsFileSystem webHdfsFileSystem;
// Path of the object this stream reads.
protected final Path path;
// Number of bytes requested from the backend per range request (always > 0).
private final int chunkSize;
// Path component of the object's URI; only used in log and exception messages.
private final String pathUri;
// HTTP client used to execute the chunk requests.
private final HttpClient httpClient;
// Scratch buffer backing the single-byte read() overload.
private final byte[] singleByte = new byte[1];
// Backend offset at which the NEXT chunk starts, i.e. the offset just past the currently loaded chunk.
private long fileOffset;
// Number of bytes of the current chunk that were already consumed.
private int bufferOffset;
// Total number of bytes in the current chunk (content length of the last chunk response).
private int bufferLength;
// Total size of the backend object as reported by getBackendStoredContentLength on the first loaded chunk.
private long backendStoredContentLength;
// True once the first chunk was fetched successfully and the object metadata is known.
private boolean isInitialized;
// ETag of the backend object captured on the first chunk; later chunks must carry the same value.
private String backendObjectETag;
// Set by close(); guarded operations fail through checkNotClosed once it is true.
private boolean closed = false;
// Body stream of the HTTP response holding the current chunk; null before the first chunk and after close().
private InputStream wrappedStream;
public BaseDirectAccessInputStream(final Path path,
final int chunkSize,
final int signedUrlExpirationSafetyMargin,
final WebHdfsFileSystem webHdfsFileSystem,
final HttpClient httpClient) {
if (chunkSize <= 0) {
throw new IllegalArgumentException("Chunk size must be bigger than 0");
}
if (signedUrlExpirationSafetyMargin < 0) {
throw new IllegalArgumentException("Signed URL safety margin must be equal to or higher than 0");
}
this.path = path;
this.pathUri = path.toUri().getPath();
this.chunkSize = chunkSize;
this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
this.webHdfsFileSystem = webHdfsFileSystem;
this.fileOffset = 0;
this.bufferOffset = 0;
this.bufferLength = 0;
this.httpClient = httpClient;
this.backendStoredContentLength = 0;
this.isInitialized = false;
}
/**
 * Reads up to {@code length} bytes into {@code b}, fetching further chunks from the backend as
 * the current one is exhausted.
 *
 * @param b      destination array
 * @param offset index in {@code b} of the first byte to write
 * @param length maximum number of bytes to read
 * @return the number of bytes read, {@code 0} if {@code length} is not positive, or {@code -1}
 *         if the end of the object was reached before any byte could be read
 * @throws IOException if the stream is closed, a chunk cannot be loaded, or the chunk body ends
 *                     before the announced number of bytes was delivered; in the latter two
 *                     cases the stream position is rolled back to where it was before the call
 */
@Override
public synchronized int read(@Nonnull final byte[] b, final int offset, final int length) throws IOException {
    LOG.trace("Starting read operation for object [{}] with offset {} and length {}", this.pathUri, offset, length);
    ArrayUtils.validateBoundaries(b, offset, length);
    this.checkNotClosed();

    if (length <= 0) {
        return 0; // no need to do anything if there is nothing to read
    }

    if (this.needToLoadNextChunk() || !this.isInitialized()) {
        try {
            this.loadNextChunk();
        } catch (final EOFException ex) {
            LOG.info("Reached EOF [{}], ignoring and returning -1 for BaseDirectAccessInputStream.read call," +
                    " since an attempt to read after end of stream was made.", ex.getMessage());
            return -1; // EOF reached, end of stream
        }
    }

    if (this.fileOffset == this.backendStoredContentLength && this.getBytesLeftOnBuffer() == 0) {
        return -1; // nothing left to read
    }

    final long objectPointerBeforeRead = this.getPos();
    int totalWrittenBytes = 0;
    int bytesToWrite = length;
    int newOffset = offset;

    while (bytesToWrite > 0) {
        // read from buffer, if there is any left, but do not read more than what we want to write
        final int bytesToCopy = Math.min(this.getBytesLeftOnBuffer(), bytesToWrite);

        // Nothing buffered and no further chunk was loaded: we are at the end of the object.
        // InputStream.read must not return 0 for a positive length, so report EOF when this
        // happens before a single byte was delivered.
        if (bytesToCopy == 0) {
            return totalWrittenBytes > 0 ? totalWrittenBytes : -1;
        }

        final int writtenBytes = this.wrappedStream.read(b, newOffset, bytesToCopy);

        // The chunk body ended before the announced content length was delivered. Adding -1 to the
        // counters would corrupt the stream state, so treat it like a failed chunk load instead.
        if (writtenBytes < 0) {
            final String message = String.format(
                    "Chunk of object [%s] ended prematurely after %d of %d bytes",
                    this.pathUri, this.bufferOffset, this.bufferLength);
            LOG.error("{}; Rolling back stream state to before read call.", message);
            this.rollBackTo(objectPointerBeforeRead);
            throw new IOException(message);
        }

        this.bufferOffset += writtenBytes;
        totalWrittenBytes += writtenBytes;
        bytesToWrite -= writtenBytes;
        newOffset += writtenBytes;

        // load next chunk, we test if bytesToWrite is not 0 since we
        // want to prevent loading a chunk that might not be used
        if (bytesToWrite > 0 && this.needToLoadNextChunk()) {
            try {
                this.loadNextChunk();
            } catch (final IOException ex) {
                LOG.error("Could not load next chunk due to exception [{}]; Rolling back stream state to before read call.", ex.getMessage());
                this.rollBackTo(objectPointerBeforeRead);
                throw ex;
            }
        }

        LOG.trace("Read {} bytes of object [{}], Chunk progress is currently at {}/{}", writtenBytes, this.pathUri, this.bufferOffset, this.bufferLength);
    }

    return totalWrittenBytes;
}

// Discards the buffered chunk and points the next chunk request at the given position.
private void rollBackTo(final long position) {
    this.fileOffset = position;
    this.bufferOffset = 0;
    this.bufferLength = 0;
}
/**
 * Moves the stream position forward by {@code n} bytes, consuming buffered bytes first and
 * shifting the backend offset for whatever lies beyond the current chunk.
 *
 * @param n number of bytes to skip
 * @return {@code 0} for {@code n == 0}, {@code -1} for a negative {@code n}, otherwise the number
 *         of bytes the position moved; before the first chunk was fetched the object size is
 *         unknown and {@code n} is returned unchecked
 * @throws EOFException if the object size is known and the skip would move past it
 * @throws IOException  if the stream is closed
 */
@Override
public synchronized long skip(final long n) throws IOException {
    this.checkNotClosed();

    if (n == 0) {
        return 0;
    }
    if (n < 0) {
        return -1;
    }

    final long positionBefore = this.getPos();
    final int bufferedBytes = this.getBytesLeftOnBuffer();

    // part of the skip that is not covered by the buffer and therefore moves the backend offset
    final long backendShift;
    if (bufferedBytes <= 0) {
        backendShift = n;
    } else {
        backendShift = Math.max(0L, n - bufferedBytes);
    }

    if (!this.isInitialized()) {
        // we don't know the actual file size until we start reading (for performance reasons)
        // so lets just say we skipped n bytes
        this.fileOffset += backendShift;
        LOG.debug("Skipped before fetching first chunk - returned value for BaseDirectAccessInputStream::skip call might be bigger than total file size.");
        return n;
    }

    // refuse to move the backend offset beyond the known object size
    if (this.fileOffset + backendShift > this.backendStoredContentLength) {
        throw new EOFException(String.format("Attempting to seek past EOF for object [%s]", this.pathUri));
    }
    this.fileOffset += backendShift;

    if (bufferedBytes > n) {
        // target is still inside the current chunk: consume bytes from its stream
        LOG.debug("The skip of {} bytes is contained within the buffer (pointing to byte {} / {}), so we dont need to change file pointer.", n, this.bufferOffset+n, this.bufferLength);
        this.bufferOffset += this.wrappedStream.skip(n);
    } else {
        // target is at or beyond the end of the current chunk: mark the chunk as fully consumed
        LOG.debug("The skip of {} bytes invalidated the buffer, and we also shifted the file pointer by {} bytes.", n, n - bufferedBytes);
        this.bufferOffset = this.bufferLength;
    }

    final long positionAfter = this.getPos();
    final long skippedBytes = positionAfter - positionBefore;
    LOG.debug("{} bytes were skipped on object {}. File pointer shifted from {} to {}.",
            skippedBytes, this.pathUri, positionBefore, positionAfter);
    return skippedBytes;
}
/**
 * Closes the current chunk stream and marks this stream as closed. Calling it again is a no-op.
 *
 * <p>Synchronized like every other method that touches the chunk state, so that the wrapped
 * stream is not set to {@code null} while a read or skip on this instance is using it.
 *
 * @throws IOException declared by {@link InputStream#close()}
 */
@Override
public synchronized void close() throws IOException {
    if (this.closed) {
        return;
    }

    super.close();
    this.closeResourcesQuietly();
    this.wrappedStream = null;
    this.closed = true;
}
/**
 * Marking is not supported by this stream.
 *
 * @param readLimit ignored
 * @throws UnsupportedOperationException always
 */
@Override
public void mark(final int readLimit) {
    final String message = String.format("mark method not supported on %s", BaseDirectAccessInputStream.class);
    throw new UnsupportedOperationException(message);
}
/**
 * Resetting to a mark is not supported by this stream.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public void reset() {
    final String message = String.format("reset method not supported on %s", BaseDirectAccessInputStream.class);
    throw new UnsupportedOperationException(message);
}
/**
 * Reports that this stream does not support mark/reset; both methods throw
 * {@link UnsupportedOperationException}.
 *
 * @return always {@code false}
 */
@Override
public boolean markSupported() {
return false;
}
/**
 * Reads a single byte by delegating to the array based read with a one byte scratch buffer.
 *
 * @return the byte as a value in the range 0 to 255, or {@code -1} at the end of the stream
 * @throws IOException if the underlying read fails
 */
@Override
public synchronized int read() throws IOException {
    final int outcome = this.read(this.singleByte, 0, 1);

    // A Java byte is signed; widening it to int would sign-extend values above 127 into negative
    // numbers. Masking with 0xff keeps only the low eight bits, mapping -128..127 onto 0..255.
    return outcome == -1 ? -1 : (this.singleByte[0] & 0xff);
}
/**
 * Describes the stream for diagnostics: path, chunk size, backend size and ETag, and the range
 * of bytes currently buffered. Before the first chunk is loaded only the class name is printed.
 *
 * @return a human readable description of this stream
 */
@Override
public String toString() {
    final String className = this.getClass().getSimpleName();

    if (!this.isInitialized()) {
        return String.format("%s{}", className); // not initialized yet
    }

    final long startVirtualFileOffset = this.getPos();
    final long endVirtualFileOffset = this.fileOffset - 1;
    final String contentRange;

    if (endVirtualFileOffset >= startVirtualFileOffset) {
        contentRange = String.format("%d-%d", startVirtualFileOffset, endVirtualFileOffset);
    } else {
        contentRange = "None";
    }

    // All values are passed as format ARGUMENTS. Concatenating them into the format string would
    // make any '%' inside the path or the ETag be interpreted as a format specifier and fail.
    return String.format("%s{path=%s, chunkSize=%s, backendFileSize=%s, backendFileETag='%s', contentRange='%s'}",
            className,
            this.path,
            this.chunkSize,
            this.backendStoredContentLength,
            this.backendObjectETag,
            contentRange);
}
/**
 * Moves the stream position to the given absolute offset. Targets inside the buffered chunk
 * are reached by skipping within it; any other target discards the buffer so that the next read
 * fetches a chunk starting at the new position.
 *
 * @param newFileOffset absolute offset to move to, must not be negative
 * @throws EOFException if {@code newFileOffset} is negative or lies beyond the known object size;
 *                      the stream position is left unchanged in both cases
 * @throws IOException  if the stream is closed
 */
@Override
public synchronized void seek(final long newFileOffset) throws IOException {
    this.checkNotClosed();

    // A negative target would otherwise fall into the "reset" branch below, where skip() ignores
    // negative values and the position would silently end up at 0.
    if (newFileOffset < 0) {
        throw new EOFException(String.format("Cannot seek to negative offset %d for object [%s]", newFileOffset, this.pathUri));
    }

    final long oldVirtualFileOffset = this.getPos();
    final long bytesDelta = newFileOffset - oldVirtualFileOffset;

    if (bytesDelta >= 0 && bytesDelta <= this.getBytesLeftOnBuffer()) {
        // the shift ends up inside the buffer, so we can just take the delta in bytes and skip it
        this.skip(bytesDelta);
        return;
    }

    // seek is outside buffer, so we reset everything and skip to the desired position;
    // remember the current state so a rejected seek does not move the stream to offset 0
    final long previousFileOffset = this.fileOffset;
    final int previousBufferOffset = this.bufferOffset;

    this.fileOffset = 0;
    this.bufferOffset = this.bufferLength; // empty buffer, since it will be useless

    try {
        this.skip(newFileOffset);
    } catch (final IOException ex) {
        this.fileOffset = previousFileOffset;
        this.bufferOffset = previousBufferOffset;
        throw ex;
    }
}
/**
 * Returns the position as seen by the caller: the backend offset just past the loaded chunk,
 * minus the bytes of that chunk that have not been consumed yet.
 *
 * @return the current stream position
 */
@Override
public long getPos() {
    final int unreadBufferedBytes = this.getBytesLeftOnBuffer();
    return this.fileOffset - unreadBufferedBytes;
}
/**
 * Alternative sources are not supported by this stream, so no switch is ever attempted.
 *
 * @param length ignored
 * @return always {@code false}
 */
@Override
public boolean seekToNewSource(final long length) {
return false; // true if a new source is found, false otherwise
}
/**
 * Positioned read: reads up to {@code length} bytes starting at {@code position} and then
 * restores the stream position that was current before the call.
 *
 * @param position absolute offset to read from
 * @param buffer   destination array
 * @param offset   index in {@code buffer} of the first byte to write
 * @param length   maximum number of bytes to read
 * @return the number of bytes read, or {@code -1} at the end of the object
 * @throws IOException if seeking or reading fails
 */
@Override
public synchronized int read(final long position, final byte[] buffer, final int offset, final int length) throws IOException {
    final long positionToRestore = this.getPos();
    this.seek(position);

    final int bytesRead;
    try {
        bytesRead = this.read(buffer, offset, length);
    } finally {
        // ensure we roll back the position, even if an exception happened
        this.seek(positionToRestore);
    }

    return bytesRead;
}
/**
 * Positioned read of exactly {@code length} bytes starting at {@code position}. The stream
 * position that was current before the call is restored afterwards, also on failure.
 *
 * @param position absolute offset to read from
 * @param buffer   destination array
 * @param offset   index in {@code buffer} of the first byte to write
 * @param length   exact number of bytes to read
 * @throws EOFException if the end of the object is reached before {@code length} bytes were read
 * @throws IOException  if seeking or reading fails
 */
@Override
public synchronized void readFully(final long position, final byte[] buffer, final int offset, final int length) throws IOException {
    final long oldPosition = this.getPos();
    this.seek(position);

    int nextOffset = offset;
    int bytesToRead = length;

    try {
        while (bytesToRead > 0) {
            // continue where the previous partial read stopped: advance the destination offset and
            // only ask for what is still missing, otherwise earlier data would be overwritten
            final int readBytes = this.read(buffer, nextOffset, bytesToRead);

            // -1 signals EOF; 0 means no progress is possible, which must not spin forever
            if (readBytes <= 0) {
                throw new EOFException("End of data was reached before required number of bytes were read.");
            }

            nextOffset += readBytes;
            bytesToRead -= readBytes;
        }
    } finally {
        this.seek(oldPosition); // ensure we roll back the position, even if an exception happened
    }
}
/**
 * Positioned read that fills the whole of {@code buffer}; delegates to the four argument
 * overload with offset 0 and the buffer's length.
 *
 * @param position absolute offset to read from
 * @param buffer   destination array to fill completely
 * @throws IOException if the delegate fails
 */
@Override
public void readFully(final long position, final byte[] buffer) throws IOException {
this.readFully(position, buffer, 0, buffer.length);
}
// Returns true once getObjectMetadata has recorded the backend object's ETag and size,
// which happens on the first successfully fetched chunk.
private boolean isInitialized() {
return this.isInitialized;
}
// Tells whether the backend offset already reached the end of the stored object. Once that is the
// case, further calls to loadNextChunk would be answered with "416 Requested range not satisfiable".
// The object size is unknown before the first chunk was loaded, so the answer is false until then.
private boolean isAtEndOfFile() {
    if (!this.isInitialized()) {
        return false;
    }
    return this.fileOffset >= this.backendStoredContentLength;
}
// Number of bytes of the current chunk that have not been consumed yet
// (chunk length minus the bytes already read or skipped from it).
private int getBytesLeftOnBuffer() {
return this.bufferLength - this.bufferOffset;
}
// A new chunk is needed when the current one is fully consumed and the backend object
// still has bytes beyond it.
private boolean needToLoadNextChunk() {
    if (this.bufferOffset < this.bufferLength) {
        return false; // still bytes left in the current chunk
    }
    return !this.isAtEndOfFile();
}
/**
 * Fetches the chunk starting at the current backend offset with a ranged GET against the signed
 * URL and makes its body the current buffer. On success the backend offset is advanced by the
 * chunk length and the buffer counters are reset.
 *
 * @throws FileNotFoundException if the backend answers 404
 * @throws EOFException          if the backend answers 416 (requested range not satisfiable)
 * @throws IOException           for any other unsuccessful response, a successful response
 *                               without a body or without a declared content length, or a
 *                               transport failure
 */
private synchronized void loadNextChunk() throws IOException {
    final BaseSignedUrl signedUrl = this.getSignedUrl();
    final URL url = new URL(signedUrl.getUrl());
    // typed map: with a raw Map the method reference request::setHeader below cannot be resolved
    final Map<String, String> headers = new HashMap<>(signedUrl.getHeaders());
    headers.putAll(this.buildRangeHeader());

    final HttpGet request;

    try {
        request = new HttpGet(url.toURI());
    } catch (final URISyntaxException ex) {
        throw new IOException(ex);
    }

    // map the range header and all headers received with the signed url to the headers of the request
    headers.forEach(request::setHeader);

    // if you are loading the next chunk, you should be done with the current one
    this.closeResourcesQuietly();

    HttpResponse response = null;

    try {
        LOG.debug("Starting fetching new chunk for object [{}] with headers [{}]", this.pathUri, headers);
        response = this.httpClient.execute(request);

        final int statusCode = response.getStatusLine().getStatusCode();
        LOG.debug("Fetched new chunk of object [{}] with headers {} - response status [{}]", this.pathUri, headers, statusCode);

        // A response is allowed to carry no entity; dereferencing it blindly would replace the
        // status specific exception below with a NullPointerException.
        if (response.getEntity() != null) {
            this.wrappedStream = response.getEntity().getContent();
        }

        if (statusCode >= HttpStatus.SC_OK && statusCode < HttpStatus.SC_MULTIPLE_CHOICES) {
            if (response.getEntity() == null) {
                throw new IOException(String.format("Response for object [%s] has no body", this.pathUri));
            }

            this.getObjectMetadata(response);

            // The entity reports a negative length when it is unknown; using it would move the
            // backend offset backwards and make the buffer size negative.
            final long contentLength = response.getEntity().getContentLength();
            if (contentLength < 0) {
                throw new IOException(String.format("Response for object [%s] does not declare a content length", this.pathUri));
            }

            final int length = Math.toIntExact(contentLength);
            this.bufferLength = length;
            this.fileOffset += length;
            this.bufferOffset = 0;
        } else if (statusCode == HttpStatus.SC_NOT_FOUND) {
            throw new FileNotFoundException(String.format("object [%s] not found.", this.pathUri));
        } else if (statusCode == HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE) {
            throw new EOFException(String.format("Request failed with message [%s]; Request headers: [%s]. Range was not valid for object %s.", response.getStatusLine().toString(), headers, this.pathUri));
        } else {
            throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error downloading object [%s]", this.pathUri), LOG);
        }
    } catch (final Exception ex) {
        // log with a severity matching the failure, release the response, then rethrow unchanged
        if (ex instanceof FileNotFoundException) {
            LOG.debug("Object does not exist [{}]", this.pathUri, ex);
        } else if (ex instanceof EOFException) {
            LOG.info("Trying to read outside of object range for [{}]", this.pathUri, ex);
        } else {
            LOG.error("Failed to read object [{}]", this.pathUri, ex);
        }

        HttpClientUtils.closeHttpResponseQuietly(response);
        this.closeResourcesQuietly();
        throw ex;
    }
}
/**
 * Supplies the signed URL for the object. It is requested anew for every chunk; its URL becomes
 * the target of the ranged GET and its headers are added to that request.
 *
 * @return the signed URL to use for the next chunk request
 * @throws IOException if no signed URL can be provided
 */
protected abstract BaseSignedUrl getSignedUrl() throws IOException;
/**
 * Determines the total size of the backend object from a chunk response. It is called once, for
 * the first successfully fetched chunk, and the result is used as the end-of-file boundary.
 * NOTE(review): how the size is derived is left to the implementation - presumably from a
 * backend specific response header; confirm in the subclasses.
 *
 * @param response the successful response of a chunk request
 * @return the total size of the stored object in bytes
 */
protected abstract long getBackendStoredContentLength(final HttpResponse response);
// Builds the Range header for the next chunk: chunkSize bytes starting at the current backend
// offset. Both bounds of an HTTP byte range are inclusive, hence the "- 1" on the upper bound.
private Map<String, String> buildRangeHeader() {
    final long firstByte = this.fileOffset;
    final long lastByte = firstByte + this.chunkSize - 1;
    final String rangeHeaderValue = String.format("bytes=%d-%d", firstByte, lastByte);

    return Collections.singletonMap(RANGE_HEADER, rangeHeaderValue);
}
// Closes the body stream of the current chunk through IOUtils.closeQuietly, so no exception is
// propagated to the caller. The field itself is left untouched; close() sets it to null separately.
private void closeResourcesQuietly() {
IOUtils.closeQuietly(this.wrappedStream);
}
// Records the backend object's ETag and total size from the first chunk response and, for every
// later chunk, verifies that the ETag is unchanged, i.e. the object was not replaced mid-read.
// Throws IllegalStateException if the ETag header is missing or differs from the recorded one.
private void getObjectMetadata(final HttpResponse response) {
    final Header etagHeader = response.getFirstHeader(ETAG_HEADER);

    if (etagHeader == null) {
        throw new IllegalStateException(String.format("Header [%s] should never be null", ETAG_HEADER));
    }

    final String backendFileEtag = etagHeader.getValue();

    if (!this.isInitialized) {
        // Resolve the size BEFORE committing any state. If the lookup throws, the stream must stay
        // uninitialized; otherwise it would be flagged initialized with a size of 0 and every
        // following read would report end of file.
        final long contentLength = this.getBackendStoredContentLength(response);

        this.backendObjectETag = backendFileEtag;
        this.backendStoredContentLength = contentLength;
        this.isInitialized = true;
    } else if (!this.backendObjectETag.equals(backendFileEtag)) {
        throw new IllegalStateException(String.format("File ETAG changed during read operation from %s to %s.", this.backendObjectETag, backendFileEtag));
    }
}
// Guard for operations that require an open stream; throws IOException once close() was called.
private void checkNotClosed() throws IOException {
    if (!this.closed) {
        return;
    }
    throw new IOException("Stream is closed!");
}
}
// © 2023 SAP SE or an SAP affiliate company. All rights reserved.
© 2015 - 2025 Weber Informatics LLC | Privacy Policy