
com.sap.hana.datalake.files.directaccess.s3.S3DirectAccessOutputStream


An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
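A minimal, hypothetical usage sketch of the filesystem through the standard Hadoop API is shown below. The hdlfs:// URI, container name, and paths are placeholders, and credential/endpoint configuration depends on the deployment; when S3 direct access is enabled, the stream returned by create(...) may be backed by S3DirectAccessOutputStream.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;
import java.nio.charset.StandardCharsets;

public class HdlfsWriteExample {
  public static void main(final String[] args) throws Exception {
    // Placeholder configuration; real endpoint and credential settings come from the HDLFS deployment.
    final Configuration conf = new Configuration();

    try (final FileSystem fs = FileSystem.get(URI.create("hdlfs://my-container/"), conf);
         final FSDataOutputStream out = fs.create(new Path("/data/example.txt"), /* overwrite */ true)) {
      // Small writes fit into a single data chunk and can be uploaded with one signed-URL PUT;
      // larger writes are switched to a multipart upload by the underlying output stream.
      out.write("hello, data lake".getBytes(StandardCharsets.UTF_8));
    }
  }
}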

// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess.s3;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Abortable.AbortableResult;
import org.apache.hadoop.fs.Path;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.util.Preconditions;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPut;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;
import java.util.Map;
import java.util.regex.Pattern;

import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities;
import com.sap.hana.datalake.files.HdlfsOutputStream;
import com.sap.hana.datalake.files.directaccess.BaseDirectAccessOutputStream;
import com.sap.hana.datalake.files.directaccess.BaseSignedUrl;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import com.sap.hana.datalake.files.utils.http.RepeatableInputStreamEntity;

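/**
 * Output stream that uploads data directly to the S3-backed storage of an HDLFS container.
 *
 * <p>Data is buffered into a single {@link DataChunk}. If the stream is closed while at most one
 * chunk has been written, that chunk is uploaded with a single HTTP PUT against a signed URL
 * obtained via {@link WebHdfsFileSystem#createDirectAccess}. Once the data outgrows one chunk,
 * the stream switches to a multipart upload delegated to an {@link HdlfsOutputStream} built from
 * the prebuilt builder and seeded with the already-buffered chunk.</p>
 */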
/* package-private */ class S3DirectAccessOutputStream extends BaseDirectAccessOutputStream {

  private static final Logger LOG = LoggerFactory.getLogger(S3DirectAccessOutputStream.class);
  private static final Pattern S3_SIGNATURE_TOKEN_PATTERN = Pattern.compile("([?&])(X-Amz-Security-Token|X-Amz-Signature|X-Amz-Credential)=[^\\s&?$]*", Pattern.CASE_INSENSITIVE);

  private final int signedUrlExpirationSafetyMargin;
  private final HdlfsOutputStream.Builder prebuiltOutputStream;
  private HdlfsOutputStream hdlfsOutputStream;
  private S3UploadSignedUrl signedUrl;
  private boolean isMultipartUpload;

  /* package-private */ S3DirectAccessOutputStream(final Path path,
          final boolean overwrite,
          final int chunkSize,
          final int signedUrlExpirationSafetyMargin,
          final WebHdfsFileSystem webHdfsFileSystem,
          final HttpClient httpClient,
          final HdlfsOutputStream.Builder prebuiltOutputStream) {
    super(path, overwrite, chunkSize, webHdfsFileSystem, httpClient);

    this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
    this.prebuiltOutputStream = prebuiltOutputStream;
    this.isMultipartUpload = false;
  }

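  // Writes into the current chunk while the data still fits; once the chunk overflows, the
  // buffered chunk and all remaining data are handed over to a multipart upload via HdlfsOutputStream.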
  @Override
  public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    LOG.trace("Started write operation into stream: {}", this);
    Preconditions.checkNotNull(data, "data buffer cannot be null");
    ArrayUtils.validateBoundaries(data, offset, length);

    if (this.closed) {
      throw new IllegalStateException("Stream already closed");
    }

    if (length == 0) {
      return;
    }

    final int writtenBytes;

    if (this.isMultipartUpload) {
      this.hdlfsOutputStream.write(data, offset, length);
      writtenBytes = length; // assume the full content was written; HdlfsOutputStream manages its own DataChunks
    } else {
      final DataChunk chunk = this.getOrCreateCurrentChunk();

      writtenBytes = chunk.write(data, offset, length);
      this.md5Hasher.update(data, offset, writtenBytes);

      LOG.trace("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunk.getRemainingCapacity());
    }

    if (writtenBytes < length) {
      // The current chunk ran out of space and there is still data left to write, so we switch to a multipart upload via HdlfsOutputStream.
      if (!this.isMultipartUpload) {
        this.isMultipartUpload = true;
        this.hdlfsOutputStream = this.prebuiltOutputStream.withInitialChunk(this.currentChunk).build();
        this.currentChunk = null;
      }

      final int leftOverOffset = offset + writtenBytes;
      final int leftOverLength = length - writtenBytes;

      this.write(data, leftOverOffset, leftOverLength);
    }

    LOG.trace("Write operation into stream {} completed", this);
  }

  @Override
  public synchronized void close() throws IOException {
    if (this.closed) {
      return;
    }

    this.closed = true;

    try {
      if (this.isMultipartUpload) {
        // closing the HdlfsOutputStream completes the multipart upload
        this.hdlfsOutputStream.close();
      } else {
        // the stream was closed with at most one chunk of data, so we can upload it directly
        this.uploadSingleChunk();
      }
    } finally {
      this.cleanUpOnClose();
    }
  }

  @Override
  public synchronized AbortableResult abort() {
    if (this.isMultipartUpload) {
      return this.hdlfsOutputStream.abort();
    } else {
      // the stream was aborted with at most one chunk of data, so we just discard the chunk and skip the upload
      if (this.closed) {
        return new AbortableResultImpl(/* alreadyClosed */ true, /* cleanupException */ null);
      }

      this.cleanUpOnClose();
      this.closed = true;

      return new AbortableResultImpl(/* alreadyClosed */ false, /* cleanupException */  null);
    }
  }

  @Override
  protected boolean isFileAlreadyExistError(final HttpResponse httpResponse) {
    // S3 does not natively support `overwrite=false` (mutual exclusion for concurrent writes); FSCache has to provide this behavior instead
    return false;
  }

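  // Returns a signed URL for uploading the current chunk, reusing the cached one only if it was
  // issued for this chunk and has not expired (expiration is checked against the configured safety margin).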
  protected synchronized BaseSignedUrl getSignedUrlForCurrentChunk() throws IOException {
    final DataChunk chunk = this.getOrCreateCurrentChunk();

    final boolean signedUrlNotNull = this.signedUrl != null;
    final boolean isSignedUrlForCurrentChunk = signedUrlNotNull && this.signedUrl.getChunkId().equals(chunk.getId());
    final boolean isSignedUrlExpired = signedUrlNotNull && this.signedUrl.isExpired();

    if (isSignedUrlForCurrentChunk && !isSignedUrlExpired) {
      return this.signedUrl;
    }

    final HdlfsFileSystemCapabilities.S3CreateDirectAccessOptions directAccessOptions =
            HdlfsFileSystemCapabilities.S3CreateDirectAccessOptions.builder(chunk.getMd5Checksum()).build();

    final HdlfsFileSystemCapabilities.DirectAccessResponse directAccessResponse = this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions);
    final HdlfsFileSystemCapabilities.S3DirectAccessProperties s3DirectAccessProperties =
            (HdlfsFileSystemCapabilities.S3DirectAccessProperties) directAccessResponse.getProperties();

    this.signedUrl = S3UploadSignedUrl.from(s3DirectAccessProperties.getSignedUrl(), this.signedUrlExpirationSafetyMargin, chunk.getId());

    return this.signedUrl;
  }

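  // Uploads the buffered data with a single PUT against an S3 signed URL; the chunk's Content-MD5
  // is computed first and passed along when requesting the signed URL.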
  private void uploadSingleChunk() throws IOException {
    if (this.currentChunk == null) {
      this.currentChunk = this.getOrCreateCurrentChunk();
    }

    final String chunkMd5 = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
    this.currentChunk.setMd5Checksum(chunkMd5);

    try {
      final BaseSignedUrl signedUrlForCurrentChunk = this.getSignedUrlForCurrentChunk();

      final HttpPut request = new HttpPut(signedUrlForCurrentChunk.getUrl());
      this.setChunkRequestEntity(request, this.currentChunk);
      this.setRequestHeaders(request, signedUrlForCurrentChunk);

      // Execute the request
      LOG.debug("Uploading {} of {} using signed URL [{}]", this.currentChunk, this, this.getRequestUrlWithNoAuthInfo(request));
      final HttpResponse response = this.httpClient.execute(request);
      this.eTag = this.getETagFromResponse(response);

      this.validateUploadResponse(response, this.currentChunk);
    } catch (final Exception ex) {
      final String errorMessage = String.format("Could not upload %s", this.currentChunk);

      throw new IOException(errorMessage, ex);
    } finally {
      IOUtils.closeQuietly(this.currentChunk);
    }
  }

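  // Copies the headers returned with the signed URL onto the request; Content-MD5 is required to be among them.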
  private void setRequestHeaders(final HttpPut request, final BaseSignedUrl signedUrl) {
    final Map<String, String> signedUrlHeaders = signedUrl.getHeaders();

    if (!signedUrlHeaders.containsKey(HttpHeaders.CONTENT_MD5)) {
      throw new IllegalStateException(String.format("Signed URL does not contain required header [%s]", HttpHeaders.CONTENT_MD5));
    }

    signedUrlHeaders.forEach(request::setHeader);
  }

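  // Redacts the X-Amz-Credential, X-Amz-Signature and X-Amz-Security-Token query parameters so the
  // signed URL can be logged without exposing credentials.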
  private String getRequestUrlWithNoAuthInfo(final HttpPut request) {
    return S3_SIGNATURE_TOKEN_PATTERN.matcher(request.getURI().toString()).replaceAll("$1$2=[REDACTED]");
  }

  private void validateUploadResponse(final HttpResponse response, final Object object) throws IOException {
    final int statusCode = response.getStatusLine().getStatusCode();

    LOG.debug("{} upload request responded with status code: {}", object, statusCode);

    if (!HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
      throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error uploading %s", object), LOG);
    }
  }

  private void setChunkRequestEntity(final HttpPut request, final DataChunk chunk) {
    final int chunkDataSize = chunk.getDataSize();
    final InputStream chunkInputStream = chunk.getInputStream();

    request.setEntity(new RepeatableInputStreamEntity(chunkInputStream, chunkDataSize));
  }

}
