
com.sap.hana.datalake.files.directaccess.gcs.GcsDirectAccessOutputStream (Maven / Gradle / Ivy)

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
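For context, a minimal usage sketch (the endpoint format and configuration values below are illustrative assumptions; the class shown on this page is package-private and is reached indirectly through the Hadoop FileSystem API):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class HdlfsWriteExample {
      public static void main(final String[] args) throws Exception {
        final Configuration conf = new Configuration();
        // Assumed endpoint format; the hdlfs:// scheme implementation must be on the classpath.
        conf.set("fs.defaultFS", "hdlfs://my-files-container.example.com");

        try (final FileSystem fs = FileSystem.get(conf);
             final FSDataOutputStream out = fs.create(new Path("/data/example.bin"), /* overwrite */ true)) {
          // Large writes are buffered into fixed-size chunks and, when GCS direct access
          // is enabled on the backend, streamed through GcsDirectAccessOutputStream.
          out.write(new byte[] { 1, 2, 3 });
        }
      }
    }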

// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess.gcs;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.util.Preconditions;
import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.MessageDigest;
import java.util.Arrays;
import java.util.Base64;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;

import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.GcsCreateDirectAccessOptions;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.GcsCreateDirectAccessProperties;
import com.sap.hana.datalake.files.directaccess.BaseDirectAccessOutputStream;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import com.sap.hana.datalake.files.utils.http.RepeatableInputStreamEntity;

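/**
 * An output stream that writes object data directly to Google Cloud Storage using signed
 * URLs obtained from the HDLFS backend via {@code createDirectAccess}. Data is buffered
 * into fixed-size {@link DataChunk}s; an object that fits into a single chunk may be sent
 * with one signed direct upload, while larger objects go through the GCS resumable upload
 * protocol.
 */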
class GcsDirectAccessOutputStream extends BaseDirectAccessOutputStream {

  protected static final String HEADER_X_GOOG_HASH_NAME = "x-goog-hash";

  private static final Logger LOG = LoggerFactory.getLogger(GcsDirectAccessOutputStream.class);
  private static final Pattern GCS_URL_SIGNATURE_PATTERN = Pattern.compile("([?&])X-Goog-Signature=[^\\s&$]*", Pattern.CASE_INSENSITIVE);

  private final Path originalPath;
  private final int signedUrlExpirationSafetyMargin;

  private URI uploadSessionUri;
  private long totalBytesServerReceived;
  private IOException lastError;
  private GcsCreateDirectAccessProperties directAccessProperties;

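  /**
   * Requests direct-access signed URLs for both RESUMABLE and DIRECT upload types. When the
   * backend returns no direct-upload URL (legacy gateway), a resumable upload session is
   * initiated immediately; otherwise the choice is deferred until the first chunk upload.
   */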
  public GcsDirectAccessOutputStream(final Path delegatedFsPath, final boolean overwrite, final int chunkSize,
                                     final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient, final Path originalPath, final int signedUrlExpirationSafetyMargin) throws IOException {
    super(delegatedFsPath, overwrite, chunkSize, webHdfsFileSystem, httpClient);

    final GcsCreateDirectAccessOptions directAccessOptions = GcsCreateDirectAccessOptions.builder(this.overwrite)
        .withUploadTypes(Arrays.asList(GcsCreateDirectAccessOptions.UploadType.RESUMABLE, GcsCreateDirectAccessOptions.UploadType.DIRECT))
        .build();

    this.directAccessProperties = (GcsCreateDirectAccessProperties) this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions).getProperties();

    this.originalPath = originalPath;
    this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;

    // If no direct-upload URL is available, fall back to initiating a resumable upload session right away.
    final HdlfsFileSystemCapabilities.DirectAccessSignedUrl directUploadUrl = this.directAccessProperties.getDirectUpload();

    if (directUploadUrl == null || directUploadUrl.getEndpoint() == null) {
      this.uploadSessionUri = this.initiateResumableUploadSession();
    }

    this.currentChunk = this.getOrCreateCurrentChunk();
  }

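  /**
   * Buffers the given bytes into the current chunk. When the chunk runs out of space, it is
   * uploaded to GCS and the leftover bytes are written recursively into a fresh chunk.
   */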
  @Override
  public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    LOG.trace("Started write operation into stream: {}", this);
    ArrayUtils.validateBoundaries(data, offset, length);

    if (this.closed) {
      throw new IllegalStateException("Stream already closed");
    }

    if (length == 0) {
      return;
    }

    final DataChunk chunk = this.getOrCreateCurrentChunk();
    final int writtenBytes = chunk.write(data, offset, length);
    LOG.trace("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunk.getRemainingCapacity());

    this.md5Hasher.update(data, offset, writtenBytes);

    if (writtenBytes < length) {
      // The current chunk ran out of space and there is still data left to be written.
      // So, upload the current chunk and then process the rest.

      LOG.debug("Current data chunk ran out of space; uploading it to GCS");
      this.uploadCurrentChunk(/* isLast */ false);

      final int leftOverOffset = offset + writtenBytes;
      final int leftOverLength = length - writtenBytes;

      this.write(data, leftOverOffset, leftOverLength);
    }

    LOG.trace("Write operation into stream {} completed", this);
  }

  @Override
  public String toString() {
    return "GcsDirectAccessOutputStream{" +
            "path=" + this.path +
            ", overwrite=" + this.overwrite +
            ", chunkSize=" + this.chunkSize +
            ", closed=" + this.closed +
            ", eTag='" + this.eTag + '\'' +
            '}';
  }

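  /**
   * Uploads the final chunk, telling the server the total object size so the upload can be
   * finalized. Any error deferred from an earlier operation is rethrown first, and a failed
   * close cancels the upload session.
   */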
  @Override
  @SuppressWarnings({ "ThrowableNotThrown" })
  public synchronized void close() throws IOException {
    if (this.closed) {
      return;
    }

    this.closed = true;

    try {
      // if anything (either the upload initiation or a previous chunk upload)
      // has failed before, rethrow the error
      if (this.lastError != null) {
        throw this.lastError;
      }

      this.uploadCurrentChunk(/* isLast */ true);
    } catch (final IOException ex) {
      LOG.error("Error closing the stream; canceling upload session", ex);
      this.cancelUploadSession();
      throw ex;
    } finally {
      this.cleanUpOnClose();
    }
  }

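  /**
   * Marks the stream as closed without finalizing the object; any active upload session is
   * canceled and buffered data is released.
   */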
  @Override
  public synchronized AbortableResult abort() {
    if (this.closed) {
      return new AbortableResultImpl(/* alreadyClosed */ true, /* cleanupException */ null);
    }

    this.closed = true;

    final IOException cleanupException = this.cancelUploadSession();
    this.cleanUpOnClose();

    return new AbortableResultImpl(/* alreadyClosed */ false, cleanupException);
  }

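  /**
   * Cancels an active resumable upload session by sending a DELETE request to the session
   * URI, as defined by the GCS resumable upload protocol. Failures are returned rather than
   * thrown so callers can decide how to surface them.
   */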
  private IOException cancelUploadSession() {
    if (this.uploadSessionUri == null) {
      return null;
    }

    final HttpDelete request = new HttpDelete(this.uploadSessionUri);
    request.setHeader(HttpHeaders.CONTENT_LENGTH, "0");

    HttpResponse response = null;

    try {
      response = this.httpClient.execute(request);
      LOG.info("Upload session canceled; response status code: {}", response.getStatusLine().getStatusCode());
      return null;
    } catch (final IOException ex) {
      LOG.warn("Error canceling upload session", ex);
      return ex;
    } finally {
      this.uploadSessionUri = null;
      HttpClientUtils.closeHttpResponseQuietly(response);
    }
  }

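  /**
   * Initiates a GCS resumable upload session by POSTing to the signed URL; the server
   * returns the session URI in the {@code Location} response header.
   */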
  private URI initiateResumableUploadSession() throws IOException {
    final HdlfsFileSystemCapabilities.DirectAccessSignedUrl resumableUpload = this.getSignedUrl(GcsCreateDirectAccessOptions.UploadType.RESUMABLE);

    final String signedUrl = resumableUpload.getEndpoint();
    final Map<String, String> signedHeaders = resumableUpload.getHeaders();
    final HttpPost request = new HttpPost(signedUrl);

    LOG.debug("Initiating resumable upload session for {}; URL: [{}], headers=[{}]", this, this.redactUrlSignature(signedUrl), signedHeaders);
    signedHeaders.forEach(request::setHeader);

    HttpResponse httpResponse = null;

    try {
      httpResponse = this.httpClient.execute(request);

      final int statusCode = httpResponse.getStatusLine().getStatusCode();

      LOG.debug("Upload initiation request responded with status code: {}", statusCode);

      if (!HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
        throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(httpResponse, "Failed to initiate resumable upload", LOG);
      }

      final String locationHeader = HttpClientUtils.getHeaderValue(httpResponse, HttpHeaders.LOCATION);

      if (locationHeader == null) {
        throw new IOException(String.format("Upload initiation response does not contain %s header", HttpHeaders.LOCATION));
      }

      final URI uploadLocation = new URI(locationHeader);
      LOG.info("Successfully initiated upload session for {}; URI: [{}]", this, this.redactUrlSignature(uploadLocation.toString()));

      return uploadLocation;
    } catch (final IOException | URISyntaxException ex) {
      this.lastError = new IOException("Error initiating resumable upload", ex);
      LOG.error(ex.getMessage(), ex);
      throw this.lastError;
    } finally {
      HttpClientUtils.closeHttpResponseQuietly(httpResponse);
    }
  }

  protected Path getPath() {
    return this.path;
  }

  protected Path getOriginalPath() {
    return this.originalPath;
  }

  protected boolean isOverwrite() {
    return this.overwrite;
  }

  protected int getChunkSize() {
    return this.chunkSize;
  }

  protected WebHdfsFileSystem getWebHdfsFileSystem() {
    return this.webHdfsFileSystem;
  }

  protected HttpClient getHttpClient() {
    return this.httpClient;
  }

  protected MessageDigest getMd5Hasher() {
    return this.md5Hasher;
  }

  protected URI getUploadSessionUri() {
    return this.uploadSessionUri;
  }

  protected DataChunk getCurrentChunk() {
    return this.currentChunk;
  }

  protected boolean isClosed() {
    return this.closed;
  }

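  /**
   * GCS reports a failed "create only if absent" precondition as HTTP 412, which maps to a
   * file-already-exists error when overwrite is disabled.
   */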
  @Override
  protected boolean isFileAlreadyExistError(final HttpResponse httpResponse) {
    return !this.overwrite && httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_PRECONDITION_FAILED;
  }

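  /**
   * Computes the {@code Content-Range} header for the current chunk: intermediate chunks use
   * {@code bytes first-last/*} (total size still unknown), the last chunk replaces the
   * asterisk with the total object size, and an empty chunk sends an asterisk in place of
   * the byte range together with the total size.
   */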
  private DataChunk prepareCurrentChunkForUpload(final boolean isLast) {
    final DataChunk chunk = this.getOrCreateCurrentChunk();
    final int chunkDataSize = chunk.getDataSize();

    // If this is the last chunk, we know the total size of the object,
    // and we should inform the server so that it can finish the resumable upload
    final String totalObjectSizeStr = isLast ? String.valueOf(this.totalBytesServerReceived + chunkDataSize) : "*";

    final String contentRange;

    if (chunkDataSize == 0) {
      // No bytes to upload: either an empty object is being uploaded, or the server
      // failed on the last write. Either way, totalObjectSizeStr contains the actual object size.
      contentRange = String.format("bytes */%s", totalObjectSizeStr);
    } else {
      final long chunkFirstByte = this.totalBytesServerReceived;
      final long chunkLastByte = chunkFirstByte + chunkDataSize - 1;
      contentRange = String.format("bytes %d-%d/%s", chunkFirstByte, chunkLastByte, totalObjectSizeStr);
    }

    chunk.setContentRange(contentRange);

    LOG.debug("Prepared upload of {} of {}", chunk, this);

    return chunk;
  }

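  /**
   * Uploads the current chunk with a PUT request. A 2xx response completes the object and is
   * verified against the locally computed MD5 digest; a 308 (Resume Incomplete) response
   * reports via the {@code Range} header how many bytes the server persisted, and any bytes
   * it missed are carried over into a new chunk to be sent again.
   */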
  private void uploadCurrentChunk(final boolean isLast) throws IOException {
    final DataChunk chunk = this.prepareCurrentChunkForUpload(isLast);
    final int chunkDataSize = chunk.getDataSize();
    final InputStream currentChunkInputStream = chunk.getInputStream();

    final HttpPut request = new HttpPut();

    final HdlfsFileSystemCapabilities.DirectAccessSignedUrl directUpload = this.getSignedUrl(GcsCreateDirectAccessOptions.UploadType.DIRECT);

    // An already-defined session URI means a legacy signature gateway: direct upload is not
    // possible, so the resumable upload session initiated in the constructor is reused.
    if (this.uploadSessionUri == null) {
      if (isLast && directUpload != null && directUpload.getEndpoint() != null) {
        LOG.debug("Using direct upload");
        this.uploadSessionUri = URI.create(directUpload.getEndpoint());
        directUpload.getHeaders().forEach(request::setHeader);
      } else {
        LOG.debug("Using resumable upload");
        this.uploadSessionUri = this.initiateResumableUploadSession();
      }
    }

    request.setURI(this.uploadSessionUri);

    LOG.info("Uploading {} of {} to {}", chunk, this, this.redactUrlSignature(this.uploadSessionUri.toString()));

    currentChunkInputStream.mark(this.chunkSize);
    request.setEntity(new RepeatableInputStreamEntity(currentChunkInputStream, chunkDataSize));
    request.setHeader(HttpHeaders.CONTENT_RANGE, chunk.getContentRange());

    HttpResponse response = null;

    try {
      try {
        response = this.httpClient.execute(request);
      } catch (final IOException ex) {
        throw new IOException(String.format("Error uploading %s", chunk), ex);
      }

      final int statusCode = response.getStatusLine().getStatusCode();

      LOG.debug("Chunk upload request responded with status code: {}", statusCode);

      if (HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
        this.eTag = this.getETagFromResponse(response);
        final String expectedMd5Hash = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
        final String md5 = this.getMd5FromResponse(response);

        if (!expectedMd5Hash.equals(md5)) {
          throw new IllegalStateException(String.format("MD5 hash mismatch; expected %s, got %s", expectedMd5Hash, md5));
        }

        LOG.info("Object upload complete; ETag: [{}], MD5: [{}]", this.eTag, md5);
        return;
      }

      if (statusCode != HttpClientUtils.SC_PERMANENT_REDIRECT) {
        if (this.isFileAlreadyExistError(response)) {
          throw new FileAlreadyExistsException(this.originalPath.toString());
        }

        throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error uploading %s", chunk), LOG);
      }

      LOG.info("Upload of {} complete", chunk);

      // Check if the upload URL has changed on the server
      if (this.maybeUpdateUploadSessionURI(response)) {
        LOG.info("Upload session URI updated: [{}]", this.redactUrlSignature(this.uploadSessionUri.toString()));
      }

      // Check if the server received all bytes that were sent
      final long newBytesServerReceived = this.getNextByteFromResponse(response);
      final long bytesServerReceivedFromCurrentChunk = newBytesServerReceived - this.totalBytesServerReceived;

      Preconditions.checkState(bytesServerReceivedFromCurrentChunk >= 0 && bytesServerReceivedFromCurrentChunk <= chunkDataSize);

      this.totalBytesServerReceived = newBytesServerReceived;

      final long bytesThatMustBeSentAgain = chunkDataSize - bytesServerReceivedFromCurrentChunk;

      if (bytesThatMustBeSentAgain == 0L) {
        // Server got all the bytes from the current chunk; releasing it
        LOG.debug("Server got all bytes from {}; releasing it", chunk);
        this.currentChunk.close();
        this.currentChunk = null;
      } else {
        // Server didn't get all bytes from the current chunk
        // The bytes that were missed will be moved to a new chunk and will be sent again along with the next chunk upload
        LOG.warn("Server didn't got all bytes from {}; the last {} bytes will be sent again along with the next chunk upload", chunk, bytesThatMustBeSentAgain);

        currentChunkInputStream.reset();
        final long skippedBytes = currentChunkInputStream.skip(bytesServerReceivedFromCurrentChunk);
        Preconditions.checkState(skippedBytes == bytesServerReceivedFromCurrentChunk);

        this.chunksCount++;
        final DataChunk newChunk = new DataChunk(this.chunkSize, this.generateChunkId(this.chunksCount));
        final int copiedBytes = newChunk.write(currentChunkInputStream);
        Preconditions.checkState(copiedBytes == bytesThatMustBeSentAgain);

        this.currentChunk.close();
        this.currentChunk = newChunk;

        // If this was the last chunk, we need to manually trigger the upload of the bytes that the server did not receive
        if (isLast) {
          this.uploadCurrentChunk(/* isLast */ true);
        }
      }
    } catch (final IOException ex) {
      this.lastError = ex;
      LOG.error(ex.getMessage(), ex);
      throw ex;
    } finally {
      IOUtils.closeQuietly(chunk);
      HttpClientUtils.closeHttpResponseQuietly(response);
    }
  }

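  /**
   * Returns the signed URL for the requested upload type, refreshing the cached
   * direct-access properties first if the current URL is about to expire.
   */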
  private HdlfsFileSystemCapabilities.DirectAccessSignedUrl getSignedUrl(final GcsCreateDirectAccessOptions.UploadType uploadType) throws IOException {
    final HdlfsFileSystemCapabilities.DirectAccessSignedUrl signedUrl = this.getUploadMethodBasedOnType(uploadType);
    final GcsSignedUrl gcsSignedUrl = GcsSignedUrl.from(signedUrl, this.signedUrlExpirationSafetyMargin);

    this.directAccessProperties = this.getDirectAccessProperties(gcsSignedUrl);

    return this.getUploadMethodBasedOnType(uploadType);
  }

  private HdlfsFileSystemCapabilities.DirectAccessSignedUrl getUploadMethodBasedOnType(final GcsCreateDirectAccessOptions.UploadType uploadType) {
    return uploadType == GcsCreateDirectAccessOptions.UploadType.DIRECT ?
            this.directAccessProperties.getDirectUpload() :
            this.directAccessProperties.getResumableUpload();
  }

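  /**
   * Updates the upload session URI if the server redirected it via the {@code Location}
   * header; returns {@code true} when the URI changed.
   */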
  private boolean maybeUpdateUploadSessionURI(final HttpResponse response) throws IOException {
    final String updatedUploadSessionUri = HttpClientUtils.getHeaderValue(response, HttpHeaders.LOCATION);

    if (updatedUploadSessionUri != null) {
      try {
        this.uploadSessionUri = new URI(updatedUploadSessionUri);
        return true;
      } catch (final URISyntaxException ex) {
        throw new IOException("Error updating upload session URI", ex);
      }
    }

    return false;
  }

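  /** Masks the X-Goog-Signature query parameter so that signed URLs can be logged safely. */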
  private String redactUrlSignature(final String url) {
    return GCS_URL_SIGNATURE_PATTERN.matcher(url).replaceAll("$1X-Goog-Signature=[REDACTED]");
  }

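  /**
   * Parses the GCS {@code Range} response header (e.g. {@code bytes=0-12345}) and returns
   * the index of the next byte the server expects, or 0 if no bytes were persisted.
   */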
  private long getNextByteFromResponse(final HttpResponse response) {
    return Optional.ofNullable(HttpClientUtils.getHeaderValue(response, HttpHeaders.RANGE))
            .map(s -> s.substring(s.indexOf("-") + 1))
            .map(s -> (Long.parseLong(s) + 1L))
            .orElse(0L);
  }

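  /**
   * Extracts the Base64-encoded MD5 digest from the {@code x-goog-hash} response headers,
   * whose values have the form {@code md5=base64digest} or {@code crc32c=base64digest}.
   */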
  private String getMd5FromResponse(final HttpResponse response) {
    final Header[] hashHeaders = response.getHeaders(HEADER_X_GOOG_HASH_NAME);
    final Optional<String> md5HeaderValue = Arrays.stream(hashHeaders)
            .map(Header::getValue)
            .filter(v -> v.startsWith(MD5_DIGEST_ALGORITHM_LOWERCASE))
            .findFirst();

    return md5HeaderValue.map(s -> s.substring(s.indexOf("=") + 1)).orElse(null);
  }

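  /**
   * Returns the cached direct-access properties, requesting fresh signed URLs from the
   * backend when the given URL has expired (within the configured safety margin).
   */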
  private GcsCreateDirectAccessProperties getDirectAccessProperties(final GcsSignedUrl signedUrl) throws IOException {
    if (signedUrl.isExpired()) {
      final GcsCreateDirectAccessOptions directAccessOptions = GcsCreateDirectAccessOptions.builder(this.overwrite)
              .withUploadTypes(Arrays.asList(GcsCreateDirectAccessOptions.UploadType.RESUMABLE, GcsCreateDirectAccessOptions.UploadType.DIRECT))
              .build();

      return (GcsCreateDirectAccessProperties) this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions).getProperties();
    }

    return this.directAccessProperties;
  }

}
