// Class: com.sap.hana.datalake.files.directaccess.wasb.WasbDirectAccessOutputStream
// Artifact: sap-hdlfs (available via Maven / Gradle / Ivy)
// An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files.directaccess.wasb;
import com.sap.hana.datalake.files.HdlfsConstants;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.DirectAccessResponse;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.WasbCreateDirectAccessOptions;
import com.sap.hana.datalake.files.HdlfsFileSystemCapabilities.WasbDirectAccessProperties;
import com.sap.hana.datalake.files.directaccess.BaseDirectAccessOutputStream;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.http.HttpClientUtils;
import com.sap.hana.datalake.files.utils.http.RepeatableInputStreamEntity;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.util.SemaphoredDelegatingExecutor;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
public class WasbDirectAccessOutputStream extends BaseDirectAccessOutputStream {
protected static final String HEADER_X_MS_BLOB_TYPE_NAME = "X-MS-Blob-Type";
protected static final String MPU_BLOCK_PARAMETER = "&comp=block&blockid=";
protected static final String MPU_BLOCKLIST_PARAMETER = "&comp=blocklist";
private static final Logger LOG = LoggerFactory.getLogger(WasbDirectAccessOutputStream.class);
private static final Pattern WASB_SIGNATURE_TOKEN_PATTERN = Pattern.compile("([?&])(sig|skoid|sktid)=[^\\s&$]*", Pattern.CASE_INSENSITIVE);
private static final String EMPTY_STRING = "";
private final int signedUrlExpirationSafetyMargin;
private final ExecutorService executorService;
private final Lock signedUrlLock;
private WasbSignedUrl signedUrl;
private List> chunkUploadFutures;
public static WasbDirectAccessOutputStream newInstance(final Path path, final boolean overwrite, final int chunkSize,
final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient,
final ExecutorService multipartUploadThreadPool, final int maxActiveChunks,
final int signedUrlExpirationSafetyMargin) throws IOException {
return new WasbDirectAccessOutputStream(path, overwrite, chunkSize, webHdfsFileSystem, httpClient, multipartUploadThreadPool,
maxActiveChunks, signedUrlExpirationSafetyMargin);
}
public WasbDirectAccessOutputStream(final Path path, final boolean overwrite, final int chunkSize,
final WebHdfsFileSystem webHdfsFileSystem, final HttpClient httpClient,
final ExecutorService multipartUploadThreadPool, final int maxActiveChunks,
final int signedUrlExpirationSafetyMargin) throws IOException {
super(path, overwrite, chunkSize, webHdfsFileSystem, httpClient);
this.executorService = new SemaphoredDelegatingExecutor(multipartUploadThreadPool, maxActiveChunks, /* fair */ true);
this.signedUrlLock = new ReentrantLock();
this.signedUrl = this.getSignedUrl();
this.signedUrlExpirationSafetyMargin = signedUrlExpirationSafetyMargin;
this.getOrCreateCurrentChunk();
}
@Override
public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
LOG.trace("Started write operation into stream {}", this);
ArrayUtils.validateBoundaries(data, offset, length);
if (this.closed) {
throw new IllegalStateException("Stream already closed");
}
if (length == 0) {
return;
}
final DataChunk chunk = this.getOrCreateCurrentChunk();
final int writtenBytes = this.writeDataToCurrentChunk(data, offset, length);
final int chunkRemainingCapacity = chunk.getRemainingCapacity();
LOG.debug("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunkRemainingCapacity);
if (writtenBytes < length) {
LOG.debug("Current data chunk ran out of space; dispatching it for upload");
// Note: this call might block depending on the number of Chunks being currently uploaded
this.uploadCurrentChunkAsync();
final int leftOverOffset = offset + writtenBytes;
final int leftOverLength = length - writtenBytes;
this.write(data, leftOverOffset, leftOverLength);
}
LOG.debug("Write operation into {} completed", this);
}
@Override
public synchronized void close() throws IOException {
if (this.closed) {
LOG.debug("Ignoring close() as stream was already closed");
return;
}
this.closed = true;
LOG.debug("Closing stream {}", this);
try {
final HttpResponse response;
if (this.chunksCount == 1) {
response = this.uploadSingleChunk();
} else {
response = this.finishMpuUploadAndClose();
}
this.eTag = this.getETagFromResponse(response);
LOG.info("Object upload complete; ETag=[{}]", this.eTag);
} catch (final Exception ex) {
LOG.error("Error occurred while closing {}", this, ex);
throw ex;
} finally {
this.cleanUpOnClose();
}
}
@Override
public synchronized AbortableResult abort() {
if (this.closed) {
return new AbortableResultImpl(/* alreadyClosed */ true, /* cleanupException */ null);
}
this.closed = true;
LOG.debug("Aborting {}", this);
this.chunkUploadFutures.forEach(future -> future.cancel(/* mayInterruptIfRunning */ true));
this.cleanUpOnClose();
return new AbortableResultImpl(/* alreadyClosed */ false, /* cleanupException */ null);
}
@Override
public String toString() {
return "WasbDirectAccessOutputStream{" +
"path=" + this.path +
", overwrite=" + this.overwrite +
", chunkSize=" + this.chunkSize +
", closed=" + this.closed +
", eTag='" + this.eTag + '\'' +
'}';
}
@Override
protected boolean isFileAlreadyExistError(final HttpResponse httpResponse) {
return !this.overwrite && httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_CONFLICT;
}
private WasbSignedUrl getSignedUrl() throws IOException {
if (this.signedUrl != null && !this.signedUrl.isExpired()) {
return this.signedUrl;
}
try {
// We need a dedicated lock because the signed URL might expire while we have the main thread waiting for the
// upload threads to finish during `close`, which is synchronized
this.signedUrlLock.lock();
if (this.signedUrl == null || this.signedUrl.isExpired()) {
final WasbCreateDirectAccessOptions directAccessOptions = WasbCreateDirectAccessOptions.builder(this.overwrite).build();
final DirectAccessResponse directAccessResponse = this.webHdfsFileSystem.createDirectAccess(this.path, directAccessOptions);
final WasbDirectAccessProperties directAccessResponseProperties = (WasbDirectAccessProperties) directAccessResponse.getProperties();
this.signedUrl = WasbSignedUrl.from(directAccessResponseProperties.getSignedUrl(), this.signedUrlExpirationSafetyMargin);
}
} finally {
this.signedUrlLock.unlock();
}
return this.signedUrl;
}
private int writeDataToCurrentChunk(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
final DataChunk chunk = this.getOrCreateCurrentChunk();
final int writtenBytes = chunk.write(data, offset, length);
this.md5Hasher.update(data, offset, writtenBytes);
return writtenBytes;
}
private void uploadCurrentChunkAsync() {
if (this.chunkUploadFutures == null) {
this.chunkUploadFutures = new ArrayList<>();
}
final DataChunk chunk = this.getCurrentChunkWithCalculatedChecksum();
LOG.debug("Submitting upload of {} from {}", chunk, this);
final Future> chunkUploadFuture = this.executorService.submit(this.getChunkUploadTask(chunk));
this.chunkUploadFutures.add(chunkUploadFuture);
this.currentChunk = null;
this.md5Hasher.reset();
}
private DataChunk getCurrentChunkWithCalculatedChecksum() {
final DataChunk chunk = this.getOrCreateCurrentChunk();
final String chunkMd5 = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
chunk.setMd5Checksum(chunkMd5);
return chunk;
}
private HttpResponse uploadSingleChunk() throws IOException {
final DataChunk chunk = this.getCurrentChunkWithCalculatedChecksum();
final HttpPut request = this.buildSingleChunkUploadRequest(chunk);
LOG.info("Uploading {} of {} using signed URL [{}]", chunk, this, this.getRedactedRequestUrl(request));
final HttpResponse response = this.httpClient.execute(request);
this.validateUploadResponse(response, chunk);
return response;
}
private void validateUploadResponse(final HttpResponse response, final Object object) throws IOException {
final int statusCode = response.getStatusLine().getStatusCode();
LOG.debug("{} upload request responded with status code: {}", object, statusCode);
if (!HttpClientUtils.isSuccessfulStatusCode(statusCode)) {
if (this.isFileAlreadyExistError(response)) {
throw new FileAlreadyExistsException(this.path.toString());
}
throw HttpClientUtils.getAndLogExceptionForUnsuccessfulResponse(response, String.format("Error uploading %s", object), LOG);
}
}
private HttpResponse finishMpuUploadAndClose() throws IOException {
this.uploadCurrentChunkAsync();
this.waitForChunksToBeUploaded();
final HttpPut request = this.buildCommitChunksRequest();
LOG.info("Sending commit request for [{}] chunks of {} using signed URL [{}]", this.chunksCount, this,
this.getRedactedRequestUrl(request));
final HttpResponse response = this.httpClient.execute(request);
this.validateUploadResponse(response, "BlockList");
return response;
}
private void waitForChunksToBeUploaded() throws IOException {
LOG.debug("Waiting for [{}] chunks to be uploaded", this.chunkUploadFutures.size());
for (final Future> chunkUploadFuture : this.chunkUploadFutures) {
try {
chunkUploadFuture.get();
} catch (final ExecutionException ex) {
final String errorMessage = "Error while waiting for Chunks to be uploaded; aborting and cleaning up";
final Throwable actualException = Optional.ofNullable(ex.getCause()).orElse(ex);
LOG.error(errorMessage, actualException);
this.abort();
throw new IOException(errorMessage, actualException);
} catch (final InterruptedException ex) {
final String errorMessage = "Thread interrupted while waiting for Chunks to be uploaded";
LOG.error(errorMessage, ex);
Thread.currentThread().interrupt();
throw new IOException(errorMessage, ex);
}
}
}
private HttpPut buildSingleChunkUploadRequest(final DataChunk chunk) throws IOException {
final URI uri = this.buildRequestURI();
return this.buildBaseChunkUploadRequest(chunk, uri);
}
private HttpPut buildMultiChunkUploadRequest(final DataChunk chunk) throws IOException {
final URI uri = this.buildRequestURI(MPU_BLOCK_PARAMETER + chunk.getId());
return this.buildBaseChunkUploadRequest(chunk, uri);
}
private HttpPut buildBaseChunkUploadRequest(final DataChunk chunk, final URI uri) throws IOException {
final HttpPut request = new HttpPut(uri);
this.setChunkRequestEntity(request, chunk);
this.getSignedUrl().getHeaders().forEach(request::setHeader);
// delegate MD5 validation to Blob Service
request.setHeader(HttpHeaders.CONTENT_MD5, chunk.getMd5Checksum());
return request;
}
private HttpPut buildCommitChunksRequest() throws IOException {
final URI uri = this.buildRequestURI(MPU_BLOCKLIST_PARAMETER);
final String requestContent = this.buildCommitChunksRequestContent();
final HttpPut request = new HttpPut(uri);
this.setCommitRequestContent(request, requestContent);
this.getSignedUrl().getHeaders().forEach((key, value) -> {
if (!key.equals(HEADER_X_MS_BLOB_TYPE_NAME)) {
request.setHeader(key, value);
}
});
return request;
}
private String buildCommitChunksRequestContent() {
final StringBuilder blockListStringBuilder = new StringBuilder("");
for (int chunkNumber = 1; chunkNumber <= this.chunksCount; chunkNumber++) {
final String chunkId = this.generateChunkId(chunkNumber);
blockListStringBuilder.append(String.format("%s ", chunkId));
}
blockListStringBuilder.append(" ");
return blockListStringBuilder.toString();
}
private void setCommitRequestContent(final HttpPut request, final String requestContent) {
final byte[] requestContentBytes = requestContent.getBytes(HdlfsConstants.DEFAULT_CHARSET);
this.md5Hasher.reset();
this.md5Hasher.update(requestContentBytes);
final String requestContentMd5 = Base64.getEncoder().encodeToString(this.md5Hasher.digest());
final ByteArrayEntity requestEntity = new ByteArrayEntity(requestContentBytes, ContentType.APPLICATION_OCTET_STREAM);
request.setEntity(requestEntity);
request.setHeader(HttpHeaders.CONTENT_MD5, requestContentMd5);
}
private URI buildRequestURI() throws IOException {
return this.buildRequestURI(EMPTY_STRING);
}
private URI buildRequestURI(final String queryParameters) throws IOException {
final String signedUrl = this.getSignedUrl().getUrl();
try {
return new URI(signedUrl + queryParameters);
} catch (final URISyntaxException ex) {
throw new RuntimeException(ex);
}
}
private void setChunkRequestEntity(final HttpPut request, final DataChunk chunk) {
final int chunkDataSize = chunk.getDataSize();
final InputStream chunkInputStream = chunk.getInputStream();
request.setEntity(new RepeatableInputStreamEntity(chunkInputStream, chunkDataSize));
}
private Callable getChunkUploadTask(final DataChunk chunk) {
return () -> {
try {
final HttpPut request = this.buildMultiChunkUploadRequest(chunk);
LOG.debug("Uploading {} of {} using signed URL [{}]", chunk, this, this.getRedactedRequestUrl(request));
final HttpResponse response = this.httpClient.execute(request);
this.validateUploadResponse(response, chunk);
} catch (final Exception ex) {
final String errorMessage = String.format("Could not upload %s", chunk);
throw new IOException(errorMessage, ex);
} finally {
IOUtils.closeQuietly(chunk);
}
return null;
};
}
private String getRedactedRequestUrl(final HttpPut request) {
final String requestUrl = request.getURI().toString();
return WASB_SIGNATURE_TOKEN_PATTERN.matcher(requestUrl).replaceAll("$1$2=[REDACTED]");
}
}
// © 2023-2024 SAP SE or an SAP affiliate company. All rights reserved.
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy