All downloads are free. The search and download functionality uses the official Maven repository.

com.sap.hana.datalake.files.directaccess.BaseDirectAccessInputStream Maven / Gradle / Ivy

Go to download

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.

There is a newer version: 3.0.27
Show newest version
// © 2023 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess;

import com.sap.hana.datalake.files.classification.InterfaceAudience;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

@InterfaceAudience.Private
public abstract class BaseDirectAccessInputStream extends InputStream implements Seekable, PositionedReadable {

  private static final Logger LOG = LoggerFactory.getLogger(BaseDirectAccessInputStream.class);
  private static final String ETAG_HEADER = "ETag";
  private static final String RANGE_HEADER = "Range";

  protected final int signedUrlExpirationSafetyMargin;
  protected final WebHdfsFileSystem webHdfsFileSystem;
  protected final Path path;

  private final int chunkSize;
  private final String pathUri;
  private final HttpClient httpClient;
  private final byte[] singleByte = new byte[1];
  private long fileOffset;
  private int bufferOffset;
  private int bufferLength;
  private long backendStoredContentLength;
  private boolean isInitialized;
  private String backendObjectETag;
  private boolean closed = false;
  private InputStream wrappedStream;

  public BaseDirectAccessInputStream(final Path path,
      final int chunkSize,
      final int signedUrlExpirationSafetyMargin,
      final WebHdfsFileSystem webHdfsFileSystem,
      final HttpClient httpClient) {
    if (chunkSize <= 0) {
      throw new IllegalArgumentException("Chunk size must be bigger than 0");
    }

    if (signedUrlExpirationSafetyMargin < 0) {
      throw new IllegalArgumentException("Signed URL safety margin must be equal to or higher than 0");
    }

    this.path = path;
    this.pathUri = path.toUri().getPath();
    this.chunkSize = chunkSize;
    this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
    this.webHdfsFileSystem = webHdfsFileSystem;
    this.fileOffset = 0;
    this.bufferOffset = 0;
    this.bufferLength = 0;
    this.httpClient = httpClient;
    this.backendStoredContentLength = 0;
    this.isInitialized = false;
  }

  /**
   * Reads up to {@code length} bytes into {@code b} starting at index {@code offset}, loading further chunks
   * from the backend whenever the current one is used up and more bytes are still requested.
   *
   * @param b      destination array
   * @param offset index in {@code b} of the first byte to write
   * @param length maximum number of bytes to read
   * @return the number of bytes written into {@code b}; {@code 0} if {@code length <= 0}; {@code -1} if the end
   *         of the object was reached before any byte could be read
   * @throws IOException if the stream is closed, a chunk cannot be loaded, or the current chunk stream ends
   *                     before delivering the bytes it announced; in the latter two cases the stream position
   *                     is rolled back to where it was before this call
   */
  @Override
  public synchronized int read(@Nonnull final byte[] b, final int offset, final int length) throws IOException {
    LOG.trace("Starting read operation for object [{}] with offset {} and length {}", this.pathUri, offset, length);
    ArrayUtils.validateBoundaries(b, offset, length);
    this.checkNotClosed();

    if (length <= 0) {
      return 0; // no need to do anything if there is nothing to read
    }

    if (this.needToLoadNextChunk() || !this.isInitialized()) {
      try {
        this.loadNextChunk();
      } catch (final EOFException ex) {
        LOG.info("Reached EOF [{}], ignoring and returning -1 for BaseDirectAccessInputStream.read call," +
            " since an attempt to read after end of stream was made.", ex.getMessage());
        return -1; // EOF reached, end of stream
      }
    }

    if (this.fileOffset == this.backendStoredContentLength && this.getBytesLeftOnBuffer() == 0) {
      return -1; // nothing left to read
    }

    final long objectPointerBeforeRead = this.getPos();
    int totalWrittenBytes = 0;
    int bytesToWrite = length;
    int newOffset = offset;

    while (bytesToWrite > 0) {
      // read from buffer, if there is any left, but do not read more than what we want to write
      final int bytesToCopy = Math.min(this.getBytesLeftOnBuffer(), bytesToWrite);

      // if bytes to copy is 0, no chunk was loaded and the buffer is empty. This should only happen when there is
      // nothing else to load from the backend object, so we can assume we are at EOF.
      if (bytesToCopy == 0) {
        return totalWrittenBytes;
      }

      final int writtenBytes = this.wrappedStream.read(b, newOffset, bytesToCopy);

      if (writtenBytes < 0) {
        // The chunk stream ended although bytes were still expected from it. Adding -1 to the counters below
        // would corrupt the stream state, so fail explicitly and let the next read re-fetch from the old position.
        this.discardBufferAndRewindTo(objectPointerBeforeRead);
        throw new IOException("Chunk stream of object [" + this.pathUri + "] ended before the expected "
            + bytesToCopy + " bytes could be read");
      }

      this.bufferOffset += writtenBytes;
      totalWrittenBytes += writtenBytes;
      bytesToWrite -= writtenBytes;
      newOffset = newOffset + writtenBytes;

      // load next chunk, we test if bytesToWrite is not 0 since we
      // want to prevent loading a chunk that might not be used
      if (bytesToWrite > 0 && this.needToLoadNextChunk()) {
        try {
          this.loadNextChunk();
        } catch (final IOException ex) {
          LOG.error("Could not load next chunk due to exception [{}]; Rolling back stream state to before read call.", ex.getMessage());
          this.discardBufferAndRewindTo(objectPointerBeforeRead);
          throw ex;
        }
      }

      LOG.trace("Read {} bytes of object [{}], Chunk progress is currently at {}/{}", writtenBytes, this.pathUri, this.bufferOffset, this.bufferLength);
    }

    return totalWrittenBytes;
  }

  // Drops the buffered chunk and points the backend offset at the given position, so that the next chunk request
  // starts exactly there.
  private void discardBufferAndRewindTo(final long position) {
    this.fileOffset = position;
    this.bufferOffset = 0;
    this.bufferLength = 0;
  }

  /**
   * Skips {@code n} bytes, consuming buffered bytes first and moving the backend offset for the remainder.
   *
   * @param n number of bytes to skip
   * @return {@code 0} if {@code n} is 0, {@code -1} if {@code n} is negative, {@code n} if no chunk has been
   *         fetched yet (the object size is still unknown), otherwise the distance the position actually moved
   * @throws EOFException if a chunk has been fetched and the skip would move past the end of the object
   * @throws IOException  if the stream is closed or skipping inside the current chunk stream fails
   */
  @Override
  public synchronized long skip(final long n) throws IOException {
    this.checkNotClosed();

    if (n == 0) {
      return 0;
    }

    if (n < 0) {
      return -1;
    }

    final long positionBefore = this.getPos();
    final int bufferedBefore = this.getBytesLeftOnBuffer();

    // part of the skip that reaches beyond the buffered bytes and therefore moves the backend offset
    final long beyondBuffer;

    if (bufferedBefore <= 0) {
      beyondBuffer = n; // nothing buffered, the whole skip happens on the backend object
    } else if (n > bufferedBefore) {
      beyondBuffer = n - bufferedBefore; // rest of the buffer plus some of the backend object
    } else {
      beyondBuffer = 0; // the skip stays inside the buffer
    }

    if (!this.isInitialized()) {
      // we don't know the actual file size until we start reading (for performance reasons)
      // so lets just say we skipped n bytes
      this.fileOffset += beyondBuffer;
      LOG.debug("Skipped before fetching first chunk - returned value for BaseDirectAccessInputStream::skip call might be bigger than total file size.");

      return n;
    }

    // refuse to move the backend offset past the end of the object
    if (this.fileOffset + beyondBuffer > this.backendStoredContentLength) {
      throw new EOFException(String.format("Attempting to seek past EOF for object [%s]", this.pathUri));
    }

    this.fileOffset += beyondBuffer;

    if (bufferedBefore > n) {
      // still inside the current chunk: advance the underlying stream and mirror what it actually skipped
      LOG.debug("The skip of {} bytes is contained within the buffer (pointing to byte {} / {}), so we dont need to change file pointer.", n, this.bufferOffset+n, this.bufferLength);
      final long skippedInChunk = this.wrappedStream.skip(n);
      this.bufferOffset += skippedInChunk;
    } else {
      // the skip reached at least the end of the buffer, so mark it as fully consumed
      LOG.debug("The skip of {} bytes invalidated the buffer, and we also shifted the file pointer by {} bytes.", n, n - bufferedBefore);
      this.bufferOffset = this.bufferLength;
    }

    final long positionAfter = this.getPos();
    final long movedBy = positionAfter - positionBefore;
    LOG.debug("{} bytes were skipped on object {}. File pointer shifted from {} to {}.",
        movedBy, this.pathUri, positionBefore, positionAfter);

    return movedBy;
  }

  /**
   * Closes the stream: the current chunk stream is closed quietly and dropped, and the stream is flagged as
   * closed so that later read, skip and seek calls fail.
   *
   * @throws IOException if the superclass close fails
   */
  @Override
  public void close() throws IOException {
    super.close();

    // release the chunk currently being served, if any, and forget about it
    IOUtils.closeQuietly(this.wrappedStream);
    this.wrappedStream = null;

    this.closed = true;
  }

  /**
   * Not supported by this stream.
   *
   * @param readLimit ignored
   * @throws UnsupportedOperationException always
   */
  @Override
  public void mark(final int readLimit) {
    final String message = String.format("mark method not supported on %s", BaseDirectAccessInputStream.class);
    throw new UnsupportedOperationException(message);
  }

  /**
   * Not supported by this stream.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void reset() {
    final String message = String.format("reset method not supported on %s", BaseDirectAccessInputStream.class);
    throw new UnsupportedOperationException(message);
  }

  /**
   * Reports that this stream does not support {@code mark} and {@code reset}; both methods throw
   * {@link UnsupportedOperationException}.
   *
   * @return always {@code false}
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Reads a single byte by delegating to the bulk read with a reusable one-byte buffer.
   *
   * @return the byte as a value in the range 0 to 255, or {@code -1} if no byte could be read
   * @throws IOException if the bulk read fails
   */
  @Override
  public synchronized int read() throws IOException {
    final int v = this.read(this.singleByte, 0, 1);

    // The bulk read reports -1 at end of stream, and returns 0 on its "nothing left to copy" path. In both cases
    // singleByte was not written, so returning its content would hand out a stale byte from an earlier call.
    if (v <= 0) {
      return -1;
    }

    /* Widening a byte to an int sign-extends it, which would turn values with the highest bit set into negative
     * numbers. Masking with 0xff clears everything outside the lowest eight bits, mapping the byte range
     * -128..127 onto 0..255 as required by the InputStream contract. */
    return this.singleByte[0] & 0xff;
  }

  /**
   * Describes the stream for logging: the concrete class name and, once the first chunk has been fetched, the
   * path, chunk size, backend object size, ETag and the byte range that is currently buffered and unread.
   *
   * <p>The text is assembled by plain concatenation. Path and ETag must never become part of a
   * {@link String#format} pattern, because a {@code %} inside them would be parsed as a format specifier and
   * make this method throw.
   *
   * @return a description of this stream
   */
  @Override
  public String toString() {
    final String className = this.getClass().getSimpleName();
    if (!this.isInitialized()) {
      return className + "{}"; // not initialized yet
    }

    // the unread part of the buffer spans from the caller-visible position up to the byte before the backend offset
    final long startVirtualFileOffset = this.getPos();
    final long endVirtualFileOffset = this.fileOffset - 1;
    final String contentRange;

    if (endVirtualFileOffset >= startVirtualFileOffset) {
      contentRange = startVirtualFileOffset + "-" + endVirtualFileOffset;
    } else {
      contentRange = "None";
    }

    return className + "{" +
        "path=" + this.path +
        ", chunkSize=" + this.chunkSize +
        ", backendFileSize=" + this.backendStoredContentLength +
        ", backendFileETag='" + this.backendObjectETag + '\'' +
        ", contentRange='" + contentRange + '\'' +
        '}';
  }

  /**
   * Moves the stream position to {@code newFileOffset}.
   *
   * <p>A forward move that ends inside the currently buffered chunk is served by skipping within that chunk.
   * Every other move discards the buffer and repositions the backend offset, so the next read fetches a new
   * chunk starting at the target.
   *
   * @param newFileOffset absolute position to move to
   * @throws EOFException if {@code newFileOffset} is negative, or if a chunk has already been fetched and the
   *                      target lies beyond the end of the object; the position is left unchanged in both cases
   * @throws IOException  if the stream is closed or skipping inside the current chunk fails
   */
  @Override
  public synchronized void seek(final long newFileOffset) throws IOException {
    this.checkNotClosed();

    // validate before touching any state: skip() answers a negative count with -1 instead of failing, which
    // would otherwise silently leave the stream at position 0
    if (newFileOffset < 0) {
      throw new EOFException(String.format("Cannot seek to negative offset %d for object [%s]", newFileOffset, this.pathUri));
    }

    // the object size is only known after the first chunk; once known, reject the seek up front so a failed
    // seek does not lose the current position
    if (this.isInitialized() && newFileOffset > this.backendStoredContentLength) {
      throw new EOFException(String.format("Attempting to seek past EOF for object [%s]", this.pathUri));
    }

    final long oldVirtualFileOffset = this.getPos();
    final long bytesDelta = newFileOffset - oldVirtualFileOffset;

    if (bytesDelta >= 0 && bytesDelta <= this.getBytesLeftOnBuffer()) {
      // the shift ends up inside the buffer, so we can just take the delta in bytes and skip it
      this.skip(bytesDelta);

      if (this.getPos() == newFileOffset) {
        return;
      }
      // the chunk stream skipped fewer bytes than requested; fall through and reposition from scratch
    }

    // seek is outside buffer (or could not be completed inside it), so we reset everything and skip to the
    // desired position
    this.fileOffset = 0;
    this.bufferOffset = this.bufferLength; // empty buffer, since it will be useless
    this.skip(newFileOffset);
  }

  /**
   * Returns the position as seen by the caller: the backend offset (which already points past the loaded
   * chunk) minus the bytes of that chunk that have not been consumed yet.
   *
   * @return the current position within the object
   */
  @Override
  public long getPos() {
    final int unreadBufferedBytes = this.bufferLength - this.bufferOffset;

    return this.fileOffset - unreadBufferedBytes;
  }

  /**
   * Never switches to another source; this implementation ignores its argument.
   *
   * @param length unused
   * @return always {@code false}
   */
  @Override
  public boolean seekToNewSource(final long length) {
    return false; // true if a new source is found, false otherwise
  }

  /**
   * Positioned read: reads up to {@code length} bytes starting at {@code position} and then moves the stream
   * back to the position it had before the call.
   *
   * @param position absolute position in the object to read from
   * @param buffer   destination array
   * @param offset   index in {@code buffer} of the first byte to write
   * @param length   maximum number of bytes to read
   * @return the result of the underlying bulk read
   * @throws IOException if seeking or reading fails
   */
  @Override
  public synchronized int read(final long position, final byte[] buffer, final int offset, final int length) throws IOException {
    final long positionToRestore = this.getPos();
    this.seek(position);

    final int bytesRead;

    try {
      bytesRead = this.read(buffer, offset, length);
    } finally {
      // restore the caller-visible position whether or not the read succeeded
      this.seek(positionToRestore);
    }

    return bytesRead;
  }

  /**
   * Positioned read that fills exactly {@code length} bytes of {@code buffer}, starting at index {@code offset},
   * with the object content beginning at {@code position}. The stream is moved back to its previous position
   * afterwards, also when the read fails.
   *
   * @param position absolute position in the object to read from
   * @param buffer   destination array
   * @param offset   index in {@code buffer} of the first byte to write
   * @param length   exact number of bytes to read
   * @throws EOFException if the object ends before {@code length} bytes were read
   * @throws IOException  if seeking or reading fails
   */
  @Override
  public synchronized void readFully(final long position, final byte[] buffer, final int offset, final int length) throws IOException {
    final long oldPosition = this.getPos();
    this.seek(position);
    int totalRead = 0;
    boolean endOfDataReached = false;

    try {
      while (totalRead < length) {
        // continue right after the bytes already stored and only ask for what is still missing; reusing the
        // original offset and length would overwrite earlier data and read more than requested
        final int readBytes = this.read(buffer, offset + totalRead, length - totalRead);

        // -1 signals end of stream; 0 is what the bulk read returns when nothing is left to copy. Neither makes
        // progress, so stop instead of looping forever.
        if (readBytes <= 0) {
          endOfDataReached = true;
          break;
        }

        totalRead += readBytes;
      }
    } finally {
      this.seek(oldPosition); // ensure we roll back the position, even if an exception happened
    }

    if (endOfDataReached) {
      throw new EOFException("End of data was reached before required number of bytes were read.");
    }
  }

  /**
   * Positioned read that fills the whole of {@code buffer} with the object content beginning at
   * {@code position}; delegates to the four-argument variant.
   *
   * @param position absolute position in the object to read from
   * @param buffer   destination array, filled from index 0 to its end
   * @throws IOException if the delegated read fails
   */
  @Override
  public void readFully(final long position, final byte[] buffer) throws IOException {
    final int wholeBufferLength = buffer.length;

    this.readFully(position, buffer, 0, wholeBufferLength);
  }

  // Tells whether the object metadata (ETag and stored content length) has been captured, which happens when the
  // first chunk is fetched successfully.
  private boolean isInitialized() {
    return this.isInitialized;
  }

  // Tells whether the backend offset has already reached the end of the stored object. Past that point another
  // loadNextChunk call would be answered with "416 Requested range not satisfiable" by the backend. The object
  // size is unknown before the first chunk was fetched, so an uninitialized stream is never considered at EOF.
  private boolean isAtEndOfFile() {
    if (!this.isInitialized()) {
      return false;
    }

    return this.fileOffset >= this.backendStoredContentLength;
  }

  // Number of bytes of the currently loaded chunk that have not been consumed yet: chunk length minus the offset
  // already advanced inside it.
  private int getBytesLeftOnBuffer() {
    return this.bufferLength - this.bufferOffset;
  }

  // A new chunk is needed once the current one is fully consumed, unless the end of the object was reached.
  private boolean needToLoadNextChunk() {
    final boolean bufferExhausted = this.bufferOffset >= this.bufferLength;

    return bufferExhausted && !this.isAtEndOfFile();
  }

  /**
   * Fetches the next chunk of the object: obtains a signed URL, issues a ranged GET starting at the current
   * backend offset and, on success, makes the response body the new chunk stream and advances the backend
   * offset by the length of the chunk.
   *
   * @throws FileNotFoundException if the backend answers 404
   * @throws EOFException          if the backend answers 416 (requested range not satisfiable)
   * @throws IOException           for any other unsuccessful response, a malformed signed URL, a successful
   *                               response without body or without a known content length, or a transport failure
   * @throws IllegalStateException if the ETag header is missing or changed since the first chunk
   */
  private synchronized void loadNextChunk() throws IOException {
    final BaseSignedUrl signedUrl = this.getSignedUrl();

    final URL url = new URL(signedUrl.getUrl());

    // header name -> header value; typed so that forEach(request::setHeader) below resolves to
    // setHeader(String, String). NOTE(review): assumes getHeaders() returns a String-to-String map - confirm.
    final Map<String, String> headers = new HashMap<>(signedUrl.getHeaders());
    headers.putAll(this.buildRangeHeader());

    final HttpGet request;

    try {
      request = new HttpGet(url.toURI());
    } catch (final URISyntaxException ex) {
      throw new IOException(ex);
    }

    // map the range header and all headers received with the signed url to the headers of the request
    headers.forEach(request::setHeader);

    // if you are loading the next chunk, you should be done with the current one
    this.closeResourcesQuietly();
    HttpResponse response = null;

    try {
      // NOTE(review): the logged headers originate from the signed URL - confirm they never carry credentials
      LOG.debug("Starting fetching new chunk for object [{}] with headers [{}]", this.pathUri, headers);
      response = this.httpClient.execute(request);
      final int statusCode = response.getStatusLine().getStatusCode();
      LOG.debug("Fetched new chunk of object [{}] with headers {} - response status [{}]", this.pathUri, headers, statusCode);

      // a response may come without a body; dereferencing it blindly would replace the status-specific
      // exceptions below with a NullPointerException
      if (response.getEntity() != null) {
        this.wrappedStream = response.getEntity().getContent();
      }

      if (statusCode >= HttpStatus.SC_OK && statusCode < HttpStatus.SC_MULTIPLE_CHOICES) {
        if (response.getEntity() == null) {
          throw new IOException(String.format("Response for object [%s] carried no content", this.pathUri));
        }

        final long contentLength = response.getEntity().getContentLength();

        // a negative value means the length is unknown; using it would move the backend offset backwards and
        // leave a negative buffer length
        if (contentLength < 0) {
          throw new IOException(String.format("Response for object [%s] did not report a content length", this.pathUri));
        }

        final int length = Math.toIntExact(contentLength);
        this.getObjectMetadata(response);
        this.bufferLength = length;
        this.fileOffset += length;
        this.bufferOffset = 0;
      } else if (statusCode == HttpStatus.SC_NOT_FOUND) {
        throw new FileNotFoundException(String.format("object [%s] not found.", this.pathUri));
      } else if (statusCode == HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE) {
        throw new EOFException(String.format("Request failed with message [%s]; Request headers: [%s]. Range was not valid for object %s.", response.getStatusLine().toString(), headers, this.pathUri));
      } else {
        throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error downloading object [%s]", this.pathUri), LOG);
      }
    } catch (final Exception ex) {
      // pick the log level by how expected the failure is, then release everything tied to this request
      if (ex instanceof FileNotFoundException) {
        LOG.debug("Object does not exist [{}]", this.pathUri, ex);
      } else if (ex instanceof EOFException) {
        LOG.info("Trying to read outside of object range for [{}]", this.pathUri, ex);
      } else {
        LOG.error("Failed to read object [{}]", this.pathUri, ex);
      }

      HttpClientUtils.closeHttpResponseQuietly(response);
      this.closeResourcesQuietly();
      throw ex;
    }
  }

  /**
   * Supplies the signed URL used for the next chunk request. It is requested once per chunk; its URL becomes
   * the request target and its headers are sent along with the range header.
   *
   * @return the signed URL to download from
   * @throws IOException if no signed URL can be provided
   */
  protected abstract BaseSignedUrl getSignedUrl() throws IOException;

  /**
   * Extracts the total size of the stored object from a successful chunk response. It is called for the first
   * successful response only; the result is then used to detect the end of the object.
   *
   * @param response the successful response of a chunk request
   * @return the size of the whole object in bytes
   */
  protected abstract long getBackendStoredContentLength(final HttpResponse response);

  // Builds the Range header for the next chunk: chunkSize bytes starting at the current backend offset, with an
  // inclusive end index. The value is assembled by concatenation rather than String.format, because "%d" formats
  // numbers for the default locale and may produce non-ASCII digits, which are not valid in an HTTP Range header.
  private Map<String, String> buildRangeHeader() {
    final long firstByte = this.fileOffset;
    final long lastByte = this.fileOffset + this.chunkSize - 1;
    final String rangeHeaderValue = "bytes=" + firstByte + "-" + lastByte;

    return Collections.singletonMap(RANGE_HEADER, rangeHeaderValue);
  }

  // Closes the chunk stream currently held, delegating to IOUtils.closeQuietly.
  // NOTE(review): relies on closeQuietly tolerating a null stream and swallowing close failures - the field is null
  // before the first chunk is loaded and after close().
  private void closeResourcesQuietly() {
    IOUtils.closeQuietly(this.wrappedStream);
  }

  // Captures the object metadata from a successful chunk response. The first response defines the ETag and the
  // total object size; every later response must carry the same ETag, otherwise the object changed while it was
  // being read and an IllegalStateException is thrown. A missing ETag header is rejected the same way.
  private void getObjectMetadata(final HttpResponse response) {
    final Header etagHeader = response.getFirstHeader(ETAG_HEADER);

    if (etagHeader == null) {
      throw new IllegalStateException(String.format("Header [%s] should never be null", ETAG_HEADER));
    }

    final String backendFileEtag = etagHeader.getValue();

    if (!this.isInitialized) {
      // Resolve the subclass-provided size before changing any state. If that hook throws, the stream must not
      // be left flagged as initialized with a content length of 0, because it would then consider itself at the
      // end of the object and stop fetching chunks.
      final long storedContentLength = this.getBackendStoredContentLength(response);

      this.backendObjectETag = backendFileEtag;
      this.backendStoredContentLength = storedContentLength;
      this.isInitialized = true;
    } else if (!this.backendObjectETag.equals(backendFileEtag)) {
      throw new IllegalStateException(String.format("File ETAG changed during read operation from %s to %s.", this.backendObjectETag, backendFileEtag));
    }
  }

  // Guard used by read, skip and seek: fails with an IOException once close() has been called.
  private void checkNotClosed() throws IOException {
    if (!this.closed) {
      return;
    }

    throw new IOException("Stream is closed!");
  }
}

// © 2023 SAP SE or an SAP affiliate company. All rights reserved.




© 2015 - 2025 Weber Informatics LLC | Privacy Policy