com.sap.hana.datalake.files.directaccess.wasb.WasbDirectAccessOutputStream (Maven / Gradle / Ivy)

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.

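The class below buffers written bytes into fixed-size chunks and pushes them to Azure Blob Storage (WASB) through pre-signed URLs: a file that fits into a single chunk is uploaded with one PUT request, while larger files go through Azure's block upload protocol (one Put Block request per chunk, committed by a Put Block List request on close).
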
// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess.wasb;

import com.sap.hana.datalake.files.HdlfsConstants;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.DirectAccessResponse;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.WasbCreateDirectAccessOptions;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.WasbDirectAccessProperties;
import com.sap.hana.datalake.files.directaccess.BaseDirectAccessOutputStream;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import com.sap.hana.datalake.files.utils.http.RepeatableInputStreamEntity;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Abortable.AbortableResult;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.util.SemaphoredDelegatingExecutor;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;

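/**
 * Buffers written bytes into {@link DataChunk}s and uploads them to Azure Blob
 * Storage (WASB) through signed URLs issued by the HDLFS backend. Chunk uploads
 * run asynchronously on a semaphore-bounded executor, and the signed URL is
 * refreshed under a dedicated lock whenever it expires mid-upload.
 */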
public class WasbDirectAccessOutputStream extends BaseDirectAccessOutputStream {

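  // The X-MS-Blob-Type header is forwarded on chunk uploads but stripped from
  // the block-list commit request; the "comp" query parameters select the Azure
  // Put Block and Put Block List operations.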
  protected static final String HEADER_X_MS_BLOB_TYPE_NAME = "X-MS-Blob-Type";
  protected static final String MPU_BLOCK_PARAMETER = "&comp=block&blockid=";
  protected static final String MPU_BLOCKLIST_PARAMETER = "&comp=blocklist";

  private static final Logger LOG = LoggerFactory.getLogger(WasbDirectAccessOutputStream.class);
  private static final Pattern WASB_SIGNATURE_TOKEN_PATTERN = Pattern.compile("([?&])(sig|skoid|sktid)=[^\\s&$]*", Pattern.CASE_INSENSITIVE);
  private static final String EMPTY_STRING = "";

  private final int signedUrlExpirationSafetyMargin;
  private final ExecutorService executorService;
  private final Lock signedUrlLock;

  private WasbSignedUrl signedUrl;
  private List<Future<Void>> chunkUploadFutures;

  public static WasbDirectAccessOutputStream newInstance(final Path path, final boolean overwrite, final int chunkSize,
                                                         final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient,
                                                         final ExecutorService multipartUploadThreadPool, final int maxActiveChunks,
                                                         final int signedUrlExpirationSafetyMargin) throws IOException {
    return new WasbDirectAccessOutputStream(path, overwrite, chunkSize, webHdfsFileSystem, httpClient, multipartUploadThreadPool,
            maxActiveChunks, signedUrlExpirationSafetyMargin);
  }

  public WasbDirectAccessOutputStream(final Path path, final boolean overwrite, final int chunkSize,
                                      final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient,
                                      final ExecutorService multipartUploadThreadPool, final int maxActiveChunks,
                                      final int signedUrlExpirationSafetyMargin) throws IOException {
    super(path, overwrite, chunkSize, webHdfsFileSystem, httpClient);

    this.executorService = new SemaphoredDelegatingExecutor(multipartUploadThreadPool, maxActiveChunks, /* fair */ true);
    this.signedUrlLock = new ReentrantLock();
    // the safety margin must be assigned before the first signed URL is fetched,
    // since getSignedUrl() reads it when constructing the WasbSignedUrl
    this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
    this.signedUrl = this.getSignedUrl();

    this.getOrCreateCurrentChunk();
  }

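  // Fills the current chunk; once the chunk runs out of capacity it is
  // dispatched for asynchronous upload and the leftover bytes are written
  // recursively into a fresh chunk.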
  @Override
  public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    LOG.trace("Started write operation into stream {}", this);
    ArrayUtils.validateBoundaries(data, offset, length);

    if (this.closed) {
      throw new IllegalStateException("Stream already closed");
    }

    if (length == 0) {
      return;
    }

    final DataChunk chunk = this.getOrCreateCurrentChunk();
    final int writtenBytes = this.writeDataToCurrentChunk(data, offset, length);
    final int chunkRemainingCapacity = chunk.getRemainingCapacity();

    LOG.debug("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunkRemainingCapacity);

    if (writtenBytes < length) {
      LOG.debug("Current data chunk ran out of space; dispatching it for upload");

      // Note: this call might block depending on the number of Chunks being currently uploaded
      this.uploadCurrentChunkAsync();

      final int leftOverOffset = offset + writtenBytes;
      final int leftOverLength = length - writtenBytes;

      this.write(data, leftOverOffset, leftOverLength);
    }

    LOG.debug("Write operation into {} completed", this);
  }

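  // A stream that never grew past its first chunk is uploaded with a single
  // PUT; otherwise the final chunk is dispatched and the accumulated blocks
  // are committed via Put Block List.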
  @Override
  public synchronized void close() throws IOException {
    if (this.closed) {
      LOG.debug("Ignoring close() as stream was already closed");
      return;
    }

    this.closed = true;

    LOG.debug("Closing stream {}", this);

    try {
      final HttpResponse response;

      if (this.chunksCount == 1) {
        response = this.uploadSingleChunk();
      } else {
        response = this.finishMpuUploadAndClose();
      }

      this.eTag = this.getETagFromResponse(response);

      LOG.info("Object upload complete; ETag=[{}]", this.eTag);
    } catch (final Exception ex) {
      LOG.error("Error occurred while closing {}", this, ex);
      throw ex;
    } finally {
      this.cleanUpOnClose();
    }
  }

  @Override
  public synchronized AbortableResult abort() {
    if (this.closed) {
      return new AbortableResultImpl(/* alreadyClosed */ true, /* cleanupException */ null);
    }

    this.closed = true;

    LOG.debug("Aborting {}", this);

    // the futures list is null when no chunk upload was ever dispatched
    if (this.chunkUploadFutures != null) {
      this.chunkUploadFutures.forEach(future -> future.cancel(/* mayInterruptIfRunning */ true));
    }
    this.cleanUpOnClose();

    return new AbortableResultImpl(/* alreadyClosed */ false, /* cleanupException */ null);
  }

  @Override
  public String toString() {
    return "WasbDirectAccessOutputStream{" +
            "path=" + this.path +
            ", overwrite=" + this.overwrite +
            ", chunkSize=" + this.chunkSize +
            ", closed=" + this.closed +
            ", eTag='" + this.eTag + '\'' +
            '}';
  }

  @Override
  protected boolean isFileAlreadyExistError(final HttpResponse httpResponse) {
    return !this.overwrite && httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_CONFLICT;
  }

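  // Double-checked refresh: the unlocked expiry check is the fast path, and the
  // dedicated lock serializes re-fetching the signed URL from the backend.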
  private WasbSignedUrl getSignedUrl() throws IOException {
    if (this.signedUrl != null && !this.signedUrl.isExpired()) {
      return this.signedUrl;
    }

    try {
      // We need a dedicated lock because the signed URL might expire while we have the main thread waiting for the
      // upload threads to finish during `close`, which is synchronized
      this.signedUrlLock.lock();

      if (this.signedUrl == null || this.signedUrl.isExpired()) {
        final WasbCreateDirectAccessOptions directAccessOptions = WasbCreateDirectAccessOptions.builder(this.overwrite).build();
        final DirectAccessResponse directAccessResponse = this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions);
        final WasbDirectAccessProperties directAccessResponseProperties = (WasbDirectAccessProperties) directAccessResponse.getProperties();

        this.signedUrl = WasbSignedUrl.from(directAccessResponseProperties.getSignedUrl(), this.signedUrlExpirationSafetyMargin);
      }
    } finally {
      this.signedUrlLock.unlock();
    }

    return this.signedUrl;
  }

  private int writeDataToCurrentChunk(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    final DataChunk chunk = this.getOrCreateCurrentChunk();

    final int writtenBytes = chunk.write(data, offset, length);
    this.md5Hasher.update(data, offset, writtenBytes);

    return writtenBytes;
  }

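  // Submission goes through the SemaphoredDelegatingExecutor, so this call
  // blocks once maxActiveChunks uploads are already in flight (backpressure).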
  private void uploadCurrentChunkAsync() {
    if (this.chunkUploadFutures == null) {
      this.chunkUploadFutures = new ArrayList<>();
    }

    final DataChunk chunk = this.getCurrentChunkWithCalculatedChecksum();

    LOG.debug("Submitting upload of {} from {}", chunk, this);

    final Future<Void> chunkUploadFuture = this.executorService.submit(this.getChunkUploadTask(chunk));
    this.chunkUploadFutures.add(chunkUploadFuture);

    this.currentChunk = null;
    this.md5Hasher.reset();
  }

  private DataChunk getCurrentChunkWithCalculatedChecksum() {
    final DataChunk chunk = this.getOrCreateCurrentChunk();
    final String chunkMd5 = Base64.getEncoder().encodeToString(this.md5Hasher.digest());

    chunk.setMd5Checksum(chunkMd5);

    return chunk;
  }

  private HttpResponse uploadSingleChunk() throws IOException {
    final DataChunk chunk = this.getCurrentChunkWithCalculatedChecksum();
    final HttpPut request = this.buildSingleChunkUploadRequest(chunk);

    LOG.info("Uploading {} of {} using signed URL [{}]", chunk, this, this.getRedactedRequestUrl(request));

    final HttpResponse response = this.httpClient.execute(request);

    this.validateUploadResponse(response, chunk);

    return response;
  }

  private void validateUploadResponse(final HttpResponse response, final Object object) throws IOException {
    final int statusCode = response.getStatusLine().getStatusCode();

    LOG.debug("{} upload request responded with status code: {}", object, statusCode);

    if (!HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
      if (this.isFileAlreadyExistError(response)) {
        throw new FileAlreadyExistsException(this.path.toString());
      }

      throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error uploading %s", object), LOG);
    }
  }

  private HttpResponse finishMpuUploadAndClose() throws IOException {
    this.uploadCurrentChunkAsync();
    this.waitForChunksToBeUploaded();

    final HttpPut request = this.buildCommitChunksRequest();

    LOG.info("Sending commit request for [{}] chunks of {} using signed URL [{}]", this.chunksCount, this,
            this.getRedactedRequestUrl(request));

    final HttpResponse response = this.httpClient.execute(request);

    this.validateUploadResponse(response, "BlockList");

    return response;
  }

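  // Blocks until every dispatched chunk upload completes; a failed upload
  // aborts the whole stream, and interruption restores the interrupt flag
  // before surfacing as an IOException.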
  private void waitForChunksToBeUploaded() throws IOException {
    LOG.debug("Waiting for [{}] chunks to be uploaded", this.chunkUploadFutures.size());

    for (final Future<Void> chunkUploadFuture : this.chunkUploadFutures) {
      try {
        chunkUploadFuture.get();
      } catch (final ExecutionException ex) {
        final String errorMessage = "Error while waiting for Chunks to be uploaded; aborting and cleaning up";
        final Throwable actualException = Optional.ofNullable(ex.getCause()).orElse(ex);

        LOG.error(errorMessage, actualException);
        this.abort();

        throw new IOException(errorMessage, actualException);
      } catch (final InterruptedException ex) {
        final String errorMessage = "Thread interrupted while waiting for Chunks to be uploaded";
        LOG.error(errorMessage, ex);

        Thread.currentThread().interrupt();

        throw new IOException(errorMessage, ex);
      }
    }
  }

  private HttpPut buildSingleChunkUploadRequest(final DataChunk chunk) throws IOException {
    final URI uri = this.buildRequestURI();

    return this.buildBaseChunkUploadRequest(chunk, uri);
  }

  private HttpPut buildMultiChunkUploadRequest(final DataChunk chunk) throws IOException {
    final URI uri = this.buildRequestURI(MPU_BLOCK_PARAMETER + chunk.getId());

    return this.buildBaseChunkUploadRequest(chunk, uri);
  }

  private HttpPut buildBaseChunkUploadRequest(final DataChunk chunk, final URI uri) throws IOException {
    final HttpPut request = new HttpPut(uri);

    this.setChunkRequestEntity(request, chunk);
    this.getSignedUrl().getHeaders().forEach(request::setHeader);

    // delegate MD5 validation to Blob Service
    request.setHeader(HttpHeaders.CONTENT_MD5, chunk.getMd5Checksum());

    return request;
  }

  private HttpPut buildCommitChunksRequest() throws IOException {
    final URI uri = this.buildRequestURI(MPU_BLOCKLIST_PARAMETER);
    final String requestContent = this.buildCommitChunksRequestContent();
    final HttpPut request = new HttpPut(uri);

    this.setCommitRequestContent(request, requestContent);

    this.getSignedUrl().getHeaders().forEach((key, value) -> {
      if (!key.equals(HEADER_X_MS_BLOB_TYPE_NAME)) {
        request.setHeader(key, value);
      }
    });

    return request;
  }

  private String buildCommitChunksRequestContent() {
    // body of the Azure Put Block List request: an XML document listing, in
    // order, the id of every block that should be committed to the blob
    final StringBuilder blockListStringBuilder = new StringBuilder("<?xml version=\"1.0\" encoding=\"utf-8\"?><BlockList>");

    for (int chunkNumber = 1; chunkNumber <= this.chunksCount; chunkNumber++) {
      final String chunkId = this.generateChunkId(chunkNumber);
      blockListStringBuilder.append(String.format("<Latest>%s</Latest>", chunkId));
    }

    blockListStringBuilder.append("</BlockList>");

    return blockListStringBuilder.toString();
  }

  private void setCommitRequestContent(final HttpPut request, final String requestContent) {
    final byte[] requestContentBytes = requestContent.getBytes(HdlfsConstants.DEFAULT_CHARSET);

    this.md5Hasher.reset();
    this.md5Hasher.update(requestContentBytes);

    final String requestContentMd5 = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
    final ByteArrayEntity requestEntity = new ByteArrayEntity(requestContentBytes, ContentType.APPLICATION_OCTET_STREAM);

    request.setEntity(requestEntity);
    request.setHeader(HttpHeaders.CONTENT_MD5, requestContentMd5);
  }

  private URI buildRequestURI() throws IOException {
    return this.buildRequestURI(EMPTY_STRING);
  }

  private URI buildRequestURI(final String queryParameters) throws IOException {
    final String signedUrl = this.getSignedUrl().getUrl();

    try {
      return new URI(signedUrl + queryParameters);
    } catch (final URISyntaxException ex) {
      // the signed URL comes from the server, so a malformed URI is unexpected
      throw new IOException(ex);
    }
  }

  private void setChunkRequestEntity(final HttpPut request, final DataChunk chunk) {
    final int chunkDataSize = chunk.getDataSize();
    final InputStream chunkInputStream = chunk.getInputStream();

    request.setEntity(new RepeatableInputStreamEntity(chunkInputStream, chunkDataSize));
  }

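  // Each task builds its own request, so a signed URL that expired while the
  // chunk waited in the queue is transparently re-fetched; the chunk is always
  // closed, even when the upload fails.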
  private Callable<Void> getChunkUploadTask(final DataChunk chunk) {
    return () -> {
      try {
        final HttpPut request = this.buildMultiChunkUploadRequest(chunk);

        LOG.debug("Uploading {} of {} using signed URL [{}]", chunk, this, this.getRedactedRequestUrl(request));

        final HttpResponse response = this.httpClient.execute(request);

        this.validateUploadResponse(response, chunk);
      } catch (final Exception ex) {
        final String errorMessage = String.format("Could not upload %s", chunk);

        throw new IOException(errorMessage, ex);
      } finally {
        IOUtils.closeQuietly(chunk);
      }

      return null;
    };
  }

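  // Strips SAS token material (sig, skoid, sktid) from the signed URL before it
  // is written to the logs.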
  private String getRedactedRequestUrl(final HttpPut request) {
    final String requestUrl = request.getURI().toString();

    return WASB_SIGNATURE_TOKEN_PATTERN.matcher(requestUrl).replaceAll("$1$2=[REDACTED]");
  }

}
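
For illustration, here is a minimal sketch of how this stream could be driven directly. The path, chunk size, pool size, concurrency limit, and safety margin are assumptions made up for the example; in practice the HDLFS FileSystem wires these streams up internally from its own configuration.

import com.sap.hana.datalake.files.directaccess.wasb.WasbDirectAccessOutputStream;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.http.client.HttpClient;

import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

class WasbDirectAccessExample {

  // Hypothetical driver; webHdfsFileSystem must point at an HDLFS endpoint that
  // can issue WASB signed URLs, and httpClient executes the resulting PUTs.
  static void upload(final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient,
                     final byte[] payload) throws IOException {
    final ExecutorService uploadPool = Executors.newFixedThreadPool(4); // shared chunk-upload pool (size assumed)

    // try-with-resources works here assuming the base class is a java.io.OutputStream
    try (final WasbDirectAccessOutputStream out = WasbDirectAccessOutputStream.newInstance(
        new Path("/data/example.bin"),   // hypothetical target path
        /* overwrite */ true,
        /* chunkSize */ 4 * 1024 * 1024, // 4 MiB per chunk (assumed, not a documented default)
        webHdfsFileSystem, httpClient, uploadPool,
        /* maxActiveChunks */ 8,         // write() blocks once 8 chunk uploads are in flight
        /* signedUrlExpirationSafetyMargin */ 30)) {
      out.write(payload, 0, payload.length); // payloads under one chunk become a single PUT on close()
    } finally {
      uploadPool.shutdown();
    }
  }
}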
