// sap-hdlfs: an implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess.gcs;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.util.Preconditions;
import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.MessageDigest;
import java.util.Arrays;
import java.util.Base64;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.GcsCreateDirectAccessOptions;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.GcsCreateDirectAccessProperties;
import com.sap.hana.datalake.files.directaccess.BaseDirectAccessOutputStream;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import com.sap.hana.datalake.files.utils.http.RepeatableInputStreamEntity;
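// An output stream that writes a file directly to Google Cloud Storage (GCS) using signed URLs
// obtained from the HDLFS "create direct access" API. Data is buffered into fixed-size DataChunks:
// an object that fits into a single chunk can be sent with one signed direct-upload PUT; otherwise
// the stream falls back to the GCS resumable upload protocol, sending each chunk with a
// Content-Range header and resuming from the byte offset the server acknowledges.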
class GcsDirectAccessOutputStream extends BaseDirectAccessOutputStream {
protected static final String HEADER_X_GOOG_HASH_NAME = "x-goog-hash";
private static final Logger LOG = LoggerFactory.getLogger(GcsDirectAccessOutputStream.class);
private static final Pattern GCS_URL_SIGNATURE_PATTERN = Pattern.compile("([?&])X-Goog-Signature=[^\\s&$]*", Pattern.CASE_INSENSITIVE);
private final Path originalPath;
private final int signedUrlExpirationSafetyMargin;
private URI uploadSessionUri;
private long totalBytesServerReceived;
private IOException lastError;
private GcsCreateDirectAccessProperties directAccessProperties;
public GcsDirectAccessOutputStream(final Path delegatedFsPath, final boolean overwrite, final int chunkSize,
final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient, final Path originalPath, final int signedUrlExpirationSafetyMargin) throws IOException {
super(delegatedFsPath, overwrite, chunkSize, webHdfsFileSystem, httpClient);
final GcsCreateDirectAccessOptions directAccessOptions = GcsCreateDirectAccessOptions.builder(this.overwrite)
.withUploadTypes(Arrays.asList(GcsCreateDirectAccessOptions.UploadType.RESUMABLE, GcsCreateDirectAccessOptions.UploadType.DIRECT))
.build();
this.directAccessProperties = (GcsCreateDirectAccessProperties) this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions).getProperties();
this.originalPath = originalPath;
this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
// If no direct-upload URL is available, fall back to initiating a resumable upload session now.
final HdlfsFileSystemCapabilities.DirectAccessSignedUrl directUploadUrl = this.directAccessProperties.getDirectUpload();
if (directUploadUrl == null || directUploadUrl.getEndpoint() == null) {
this.uploadSessionUri = this.initiateResumableUploadSession();
}
this.currentChunk = this.getOrCreateCurrentChunk();
}
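// Buffers the given bytes into the current chunk and updates the running MD5 digest. When the
// chunk runs out of space, it is uploaded to GCS and the remaining bytes are written recursively
// into a fresh chunk.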
@Override
public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
LOG.trace("Started write operation into stream: {}", this);
ArrayUtils.validateBoundaries(data, offset, length);
if (this.closed) {
throw new IllegalStateException("Stream already closed");
}
if (length == 0) {
return;
}
final DataChunk chunk = this.getOrCreateCurrentChunk();
final int writtenBytes = chunk.write(data, offset, length);
LOG.trace("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunk.getRemainingCapacity());
this.md5Hasher.update(data, offset, writtenBytes);
if (writtenBytes < length) {
// The current chunk ran out of space and there is still data left to be written.
// So, upload the current chunk and then process the rest.
LOG.debug("Current data chunk ran out of space; uploading it to GCS");
this.uploadCurrentChunk(/* isLast */ false);
final int leftOverOffset = offset + writtenBytes;
final int leftOverLength = length - writtenBytes;
this.write(data, leftOverOffset, leftOverLength);
}
LOG.trace("Write operation into stream {} completed", this);
}
@Override
public String toString() {
return "GcsDirectAccessOutputStream{" +
"path=" + this.path +
", overwrite=" + this.overwrite +
", chunkSize=" + this.chunkSize +
", closed=" + this.closed +
", eTag='" + this.eTag + '\'' +
'}';
}
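// Finalizes the upload: rethrows any error recorded by a previous chunk upload, then uploads the
// last (possibly empty) chunk together with the final object size. On failure, the upload session
// is canceled so that no partial object is committed.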
@Override
@SuppressWarnings({ "ThrowableNotThrown" })
public synchronized void close() throws IOException {
if (this.closed) {
return;
}
this.closed = true;
try {
// if anything (either the upload initiation or a previous chunk upload)
// has failed before, rethrow the error
if (this.lastError != null) {
throw this.lastError;
}
this.uploadCurrentChunk(/* isLast */ true);
} catch (final IOException ex) {
LOG.error("Error closing the stream; canceling upload session", ex);
this.cancelUploadSession();
throw ex;
} finally {
this.cleanUpOnClose();
}
}
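// Aborts the upload: cancels any active resumable upload session and releases buffered resources.
// Calling abort() on an already-closed stream is a no-op that reports alreadyClosed = true.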
@Override
public synchronized AbortableResult abort() {
if (this.closed) {
return new AbortableResultImpl(/* alreadyClosed */ true, /* cleanupException */ null);
}
this.closed = true;
final IOException cleanupException = this.cancelUploadSession();
this.cleanUpOnClose();
return new AbortableResultImpl(/* alreadyClosed */ false, cleanupException);
}
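// Cancels the active resumable upload session by sending a DELETE request to the session URI, as
// prescribed by the GCS resumable upload protocol. Failures are returned rather than thrown so
// that callers can report cleanup problems without masking the original error.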
private IOException cancelUploadSession() {
if (this.uploadSessionUri == null) {
return null;
}
final HttpDelete request = new HttpDelete(this.uploadSessionUri);
request.setHeader(HttpHeaders.CONTENT_LENGTH, "0");
HttpResponse response = null;
try {
response = this.httpClient.execute(request);
LOG.info("Upload session canceled; response status code: {}", response.getStatusLine().getStatusCode());
return null;
} catch (final IOException ex) {
LOG.warn("Error canceling upload session", ex);
return ex;
} finally {
this.uploadSessionUri = null;
HttpClientUtils.closeHttpResponseQuietly(response);
}
}
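// Initiates a GCS resumable upload session by POSTing to the signed resumable-upload URL. The
// server returns the session URI, against which all subsequent chunk PUTs must be issued, in the
// Location response header.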
private URI initiateResumableUploadSession() throws IOException {
final HdlfsFileSystemCapabilities.DirectAccessSignedUrl resumableUpload = this.getSignedUrl(GcsCreateDirectAccessOptions.UploadType.RESUMABLE);
final String signedUrl = resumableUpload.getEndpoint();
final Map<String, String> signedHeaders = resumableUpload.getHeaders();
final HttpPost request = new HttpPost(signedUrl);
LOG.debug("Initiating resumable upload session for {}; URL: [{}], headers=[{}]", this, this.redactUrlSignature(signedUrl), signedHeaders);
signedHeaders.forEach(request::setHeader);
HttpResponse httpResponse = null;
try {
httpResponse = this.httpClient.execute(request);
final int statusCode = httpResponse.getStatusLine().getStatusCode();
LOG.debug("Upload initiation request responded with status code: {}", statusCode);
if (!HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(httpResponse, "Failed to initiate resumable upload", LOG);
}
final String locationHeader = HttpClientUtils.getHeaderValue(httpResponse, HttpHeaders.LOCATION);
if (locationHeader == null) {
throw new IOException(String.format("Upload initiation response does not contain %s header", HttpHeaders.LOCATION));
}
final URI uploadLocation = new URI(locationHeader);
LOG.info("Successfully initiated upload session for {}; URI: [{}]", this, this.redactUrlSignature(uploadLocation.toString()));
return uploadLocation;
} catch (final IOException | URISyntaxException ex) {
this.lastError = new IOException("Error initiating resumable upload", ex);
LOG.error(ex.getMessage(), ex);
throw this.lastError;
} finally {
HttpClientUtils.closeHttpResponseQuietly(httpResponse);
}
}
protected Path getPath() {
return this.path;
}
protected Path getOriginalPath() {
return this.originalPath;
}
protected boolean isOverwrite() {
return this.overwrite;
}
protected int getChunkSize() {
return this.chunkSize;
}
protected WebHdfsFileSystem getWebHdfsFileSystem() {
return this.webHdfsFileSystem;
}
protected HttpClient getHttpClient() {
return this.httpClient;
}
protected MessageDigest getMd5Hasher() {
return this.md5Hasher;
}
protected URI getUploadSessionUri() {
return this.uploadSessionUri;
}
protected DataChunk getCurrentChunk() {
return this.currentChunk;
}
protected boolean isClosed() {
return this.closed;
}
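// When overwrite is disabled, GCS answers 412 Precondition Failed if the object already exists;
// presumably the signed URL carries an if-generation-match precondition attached server-side
// (assumption; not visible in this class).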
@Override
protected boolean isFileAlreadyExistError(final HttpResponse httpResponse) {
return !this.overwrite && httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_PRECONDITION_FAILED;
}
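// Computes the Content-Range header for the current chunk. For intermediate chunks the total size
// is unknown, so e.g. an 8 MiB first chunk is sent as "bytes 0-8388607/*"; the last chunk replaces
// "*" with the final object size, and an empty last chunk degenerates to "bytes */<size>", which
// merely finalizes the upload.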
private DataChunk prepareCurrentChunkForUpload(final boolean isLast) {
final DataChunk chunk = this.getOrCreateCurrentChunk();
final int chunkDataSize = chunk.getDataSize();
// If this is the last chunk, we know the total size of the object,
// and we should inform the server so that it can finish the resumable upload
final String totalObjectSizeStr = isLast ? String.valueOf(this.totalBytesServerReceived + chunkDataSize) : "*";
final String contentRange;
if (chunkDataSize == 0) {
// No bytes to upload: either an empty object is being uploaded, or the server failed on the
// last write. Either way, totalObjectSizeStr contains the actual object size.
contentRange = String.format("bytes */%s", totalObjectSizeStr);
} else {
final long chunkFirstByte = this.totalBytesServerReceived;
final long chunkLastByte = chunkFirstByte + chunkDataSize - 1;
contentRange = String.format("bytes %d-%d/%s", chunkFirstByte, chunkLastByte, totalObjectSizeStr);
}
chunk.setContentRange(contentRange);
LOG.debug("Prepared upload of {} of {}", chunk, this);
return chunk;
}
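// Uploads the current chunk and handles the response per the GCS resumable upload protocol: a 2xx
// status completes the object (the server-reported MD5 is verified against the local digest), 308
// Permanent Redirect acknowledges an intermediate chunk, and any shortfall between the bytes sent
// and the bytes acknowledged via the Range header is copied into a new chunk to be re-sent.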
private void uploadCurrentChunk(final boolean isLast) throws IOException {
final DataChunk chunk = this.prepareCurrentChunkForUpload(isLast);
final int chunkDataSize = chunk.getDataSize();
final InputStream currentChunkInputStream = chunk.getInputStream();
final HttpPut request = new HttpPut();
final HdlfsFileSystemCapabilities.DirectAccessSignedUrl directUpload = this.getSignedUrl(GcsCreateDirectAccessOptions.UploadType.DIRECT);
// An already-defined session URI means a legacy SGW (direct upload is not possible; the resumable upload must be used)
if (this.uploadSessionUri == null) {
if (isLast && directUpload != null && directUpload.getEndpoint() != null) {
LOG.debug("Using direct upload");
this.uploadSessionUri = URI.create(directUpload.getEndpoint());
directUpload.getHeaders().forEach(request::setHeader);
} else {
LOG.debug("Using resumable upload");
this.uploadSessionUri = this.initiateResumableUploadSession();
}
}
request.setURI(this.uploadSessionUri);
LOG.info("Uploading {} of {} to {}", chunk, this, this.redactUrlSignature(this.uploadSessionUri.toString()));
currentChunkInputStream.mark(this.chunkSize);
request.setEntity(new RepeatableInputStreamEntity(currentChunkInputStream, chunkDataSize));
request.setHeader(HttpHeaders.CONTENT_RANGE, chunk.getContentRange());
HttpResponse response = null;
try {
try {
response = this.httpClient.execute(request);
} catch (final IOException ex) {
throw new IOException(String.format("Error uploading %s", chunk), ex);
}
final int statusCode = response.getStatusLine().getStatusCode();
LOG.debug("Chunk upload request responded with status code: {}", statusCode);
if (HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
this.eTag = this.getETagFromResponse(response);
final String expectedMd5Hash = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
final String md5 = this.getMd5FromResponse(response);
if (!expectedMd5Hash.equals(md5)) {
throw new IllegalStateException(String.format("MD5 hash mismatch; expected %s, got %s", expectedMd5Hash, md5));
}
LOG.info("Object upload complete; ETag: [{}], MD5: [{}]", this.eTag, md5);
return;
}
if (statusCode != HttpClientUtils.SC_PERMANENT_REDIRECT) {
if (this.isFileAlreadyExistError(response)) {
throw new FileAlreadyExistsException(this.originalPath.toString());
}
throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error uploading %s", chunk), LOG);
}
LOG.info("Upload of {} complete", chunk);
// Check if the upload URL has changed on the server
if (this.maybeUpdateUploadSessionURI(response)) {
LOG.info("Upload session URI updated: [{}]", this.redactUrlSignature(this.uploadSessionUri.toString()));
}
// Check if the server received all bytes that were sent
final long newBytesServerReceived = this.getNextByteFromResponse(response);
final long bytesServerReceivedFromCurrentChunk = newBytesServerReceived - this.totalBytesServerReceived;
Preconditions.checkState(bytesServerReceivedFromCurrentChunk >= 0 && bytesServerReceivedFromCurrentChunk <= chunkDataSize);
this.totalBytesServerReceived = newBytesServerReceived;
final long bytesThatMustBeSentAgain = chunkDataSize - bytesServerReceivedFromCurrentChunk;
if (bytesThatMustBeSentAgain == 0L) {
// Server got all the bytes from the current chunk; releasing it
LOG.debug("Server got all bytes from {}; releasing it", chunk);
this.currentChunk.close();
this.currentChunk = null;
} else {
// Server didn't get all bytes from the current chunk
// The bytes that were missed will be moved to a new chunk and will be sent again along with the next chunk upload
LOG.warn("Server didn't got all bytes from {}; the last {} bytes will be sent again along with the next chunk upload", chunk, bytesThatMustBeSentAgain);
currentChunkInputStream.reset();
final long skippedBytes = currentChunkInputStream.skip(bytesServerReceivedFromCurrentChunk);
Preconditions.checkState(skippedBytes == bytesServerReceivedFromCurrentChunk);
this.chunksCount++;
final DataChunk newChunk = new DataChunk(this.chunkSize, this.generateChunkId(this.chunksCount));
final int copiedBytes = newChunk.write(currentChunkInputStream);
Preconditions.checkState(copiedBytes == bytesThatMustBeSentAgain);
this.currentChunk.close();
this.currentChunk = newChunk;
// If this was the last chunk, we need to manually trigger the upload of the bytes that the server did not receive
if (isLast) {
this.uploadCurrentChunk(/* isLast */ true);
}
}
} catch (final IOException ex) {
this.lastError = ex;
LOG.error(ex.getMessage(), ex);
throw ex;
} finally {
IOUtils.closeQuietly(chunk);
HttpClientUtils.closeHttpResponseQuietly(response);
}
}
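// Returns the signed URL for the requested upload type, transparently refreshing the direct access
// properties first if the current signed URL is about to expire.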
private HdlfsFileSystemCapabilities.DirectAccessSignedUrl getSignedUrl(final GcsCreateDirectAccessOptions.UploadType uploadType) throws IOException {
final HdlfsFileSystemCapabilities.DirectAccessSignedUrl signedUrl = getUploadMethodBasedOnType(uploadType);
final GcsSignedUrl gcsSignedUrl = GcsSignedUrl.from(signedUrl, this.signedUrlExpirationSafetyMargin);
this.directAccessProperties = this.getDirectAccessProperties(gcsSignedUrl);
return this.getUploadMethodBasedOnType(uploadType);
}
private HdlfsFileSystemCapabilities.DirectAccessSignedUrl getUploadMethodBasedOnType(final GcsCreateDirectAccessOptions.UploadType uploadType) {
return uploadType == GcsCreateDirectAccessOptions.UploadType.DIRECT ?
this.directAccessProperties.getDirectUpload() :
this.directAccessProperties.getResumableUpload();
}
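// If the server returned a new session URI in the Location header, adopt it for subsequent chunk uploads.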
private boolean maybeUpdateUploadSessionURI(final HttpResponse response) throws IOException {
final String updatedUploadSessionUri = HttpClientUtils.getHeaderValue(response, HttpHeaders.LOCATION);
if (updatedUploadSessionUri != null) {
try {
this.uploadSessionUri = new URI(updatedUploadSessionUri);
return true;
} catch (final URISyntaxException ex) {
throw new IOException("Error updating upload session URI", ex);
}
}
return false;
}
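// Replaces the value of the X-Goog-Signature query parameter with "[REDACTED]" so that signed URLs
// can be logged safely.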
private String redactUrlSignature(final String url) {
return GCS_URL_SIGNATURE_PATTERN.matcher(url).replaceAll("$1X-Goog-Signature=[REDACTED]");
}
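// Parses the Range response header (e.g. "bytes=0-8388607") and returns the offset of the next byte
// the server expects (8388608 in the example), or 0 if the server has not persisted any bytes yet.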
private long getNextByteFromResponse(final HttpResponse response) {
return Optional.ofNullable(HttpClientUtils.getHeaderValue(response, HttpHeaders.RANGE))
.map(s -> s.substring(s.indexOf("-") + 1))
.map(s -> (Long.parseLong(s) + 1L))
.orElse(0L);
}
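// Extracts the Base64-encoded MD5 digest from the x-goog-hash response headers, whose values look
// like "md5=<base64>" or "crc32c=<base64>"; returns null if no MD5 entry is present.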
private String getMd5FromResponse(final HttpResponse response) {
final Header[] hashHeaders = response.getHeaders(HEADER_X_GOOG_HASH_NAME);
final Optional<String> md5HeaderValue = Arrays.stream(hashHeaders)
.map(Header::getValue)
.filter(v -> v.startsWith(MD5_DIGEST_ALGORITHM_LOWERCASE))
.findFirst();
return md5HeaderValue.map(s -> s.substring(s.indexOf("=") + 1)).orElse(null);
}
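// Returns the cached direct access properties, or requests fresh ones from the HDLFS backend when
// the given signed URL has expired (the expiration check includes the configured safety margin).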
private GcsCreateDirectAccessProperties getDirectAccessProperties(final GcsSignedUrl signedUrl) throws IOException {
if (signedUrl.isExpired()) {
final GcsCreateDirectAccessOptions directAccessOptions = GcsCreateDirectAccessOptions.builder(this.overwrite)
.withUploadTypes(Arrays.asList(GcsCreateDirectAccessOptions.UploadType.RESUMABLE, GcsCreateDirectAccessOptions.UploadType.DIRECT))
.build();
return (GcsCreateDirectAccessProperties) this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions).getProperties();
}
return this.directAccessProperties;
}
}
// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.