// sap-hdlfs: an implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files;
import com.sap.hana.datalake.files.classification.InterfaceAudience;
import com.sap.hana.datalake.files.exception.PayloadTooLargeException;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.HdlfsRetryUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Abortable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicBoolean;
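/**
 * An {@link OutputStream} targeting SAP HANA Data Lake Files that buffers written bytes into fixed-size
 * {@link DataChunk}s. While all data fits into a single chunk, {@link #close()} creates the target object
 * with a plain CREATE request; once a chunk fills up, chunks are dispatched asynchronously as parts of an
 * {@link HdlfsMultipartUpload}. The stream is {@link Abortable}, so an in-progress multipart upload can be
 * aborted and its chunk objects cleaned up.
 *
 * <p>A minimal construction sketch (the {@code fs}, {@code uploadConfig}, {@code retryPolicy} and
 * {@code payload} values are assumed to be obtained elsewhere):
 *
 * <pre>{@code
 * HdlfsOutputStream out = new HdlfsOutputStream.Builder()
 *     .withFileSystem(fs)                            // required
 *     .withMultipartUploadConfig(uploadConfig)       // required
 *     .withTargetPath(new Path("/data/output.bin"))  // required
 *     .withPermission(FsPermission.getFileDefault())
 *     .withOverwrite(true)
 *     .withReplication((short) 1)
 *     .withRetryPolicy(retryPolicy)
 *     .build();
 * try {
 *   out.write(payload);
 * } finally {
 *   out.close();
 * }
 * }</pre>
 */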
@InterfaceAudience.Private
public class HdlfsOutputStream extends OutputStream implements Abortable {
private static final Logger LOG = LoggerFactory.getLogger(HdlfsOutputStream.class);
private final AtomicBoolean closed = new AtomicBoolean(false);
private final byte[] singleByteBuffer = new byte[1];
private final int chunkSize;
private final HdlfsBaseFileSystem fileSystem;
private final boolean keepPendingAfterClose;
private final HdlfsMultipartUpload.Config multipartUploadConfig;
private final FsPermission permission;
private final Progressable progress;
private final boolean overwrite;
private final short replication;
private final RetryPolicy retryPolicy;
private final Path targetPath;
private final String uuid;
private int chunksCount = 0;
private DataChunk currentChunk;
private HdlfsMultipartUpload multipartUpload;
private long totalBytesWritten = 0;
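/**
 * Instances are created through {@link Builder#build()}. The constructor validates the builder, adopts the
 * optional initial data chunk (or allocates a fresh one), and eagerly initializes the multipart upload when
 * the HDLFS output committer is enabled and the upload is to be kept pending after close.
 */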
private HdlfsOutputStream(final Builder builder) throws IOException {
builder.validate();
this.chunkSize = builder.multipartUploadConfig.getChunkSize();
this.fileSystem = builder.fileSystem;
this.keepPendingAfterClose = builder.keepPendingAfterClose;
this.multipartUploadConfig = builder.multipartUploadConfig;
this.permission = builder.permission;
this.progress = builder.progress;
this.overwrite = builder.overwrite;
this.replication = builder.replication;
this.retryPolicy = builder.retryPolicy;
this.targetPath = builder.targetPath;
this.uuid = UUID.randomUUID().toString();
if (builder.initialChunk != null) {
this.currentChunk = new DataChunk(builder.initialChunk, this.getNextDataChunkId());
} else {
this.currentChunk = this.createChunkIfNecessary();
}
if (this.multipartUploadConfig.isHdlfsOutputCommitterEnabled() && this.keepPendingAfterClose) {
this.initializeMultipartUpload();
}
}
public long getTotalBytesWritten() {
return this.totalBytesWritten;
}
public int getChunkSize() {
return this.chunkSize;
}
public HdlfsMultipartUpload.Config getMultipartUploadConfig() {
return this.multipartUploadConfig;
}
public FsPermission getPermission() {
return this.permission;
}
public Progressable getProgress() {
return this.progress;
}
public short getReplication() {
return this.replication;
}
public Path getTargetPath() {
return this.targetPath;
}
public boolean isOverwrite() {
return this.overwrite;
}
@Override
public void write(final int byteValue) throws IOException {
this.singleByteBuffer[0] = (byte) byteValue;
this.write(this.singleByteBuffer, /* offset */ 0, /* length */ 1);
}
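/**
 * Writes {@code length} bytes from {@code data}, starting at {@code offset}, into the current data chunk.
 * If the chunk runs out of space, it is dispatched for asynchronous upload as a multipart-upload part and
 * the remaining bytes are written into a newly allocated chunk.
 */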
@Override
public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
LOG.debug("Started write operation into stream with id=[{}] in chunk [{}]", this.uuid, this.chunksCount);
ArrayUtils.validateBoundaries(data, offset, length);
this.checkStreamIsOpen();
if (length == 0) {
return;
}
final DataChunk chunk = this.createChunkIfNecessary();
final int writtenBytes = chunk.write(data, offset, length);
final int chunkRemainingCapacity = chunk.getRemainingCapacity();
LOG.debug("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunkRemainingCapacity);
this.totalBytesWritten += writtenBytes;
if (writtenBytes < length) {
/* The current chunk ran out of space and there is still data left to be written.
* So, dispatch the current chunk for upload and process the rest.
*/
LOG.debug("Current data chunk ran out of space; dispatching it for upload");
// Note: this call might block depending on the number of Chunks being currently uploaded
this.uploadCurrentChunkAsync();
final int leftOverOffset = offset + writtenBytes;
final int leftOverLength = length - writtenBytes;
this.write(data, leftOverOffset, leftOverLength);
}
LOG.debug("Write operation into stream with id=[{}] in chunk [{}] completed", this.uuid, this.chunksCount);
}
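/**
 * Closes the stream. If no multipart upload was started (all data fit into a single chunk), the target
 * object is created with a plain CREATE request; otherwise the last chunk is dispatched and the multipart
 * upload is completed. On failure the multipart upload, if any, is aborted so that already created chunk
 * objects are cleaned up.
 */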
@Override
public void close() throws IOException {
if (this.closed.getAndSet(true)) {
LOG.debug("Ignoring close() as stream was already closed");
return;
}
final DataChunk activeChunk = this.getCurrentChunk();
final boolean hasActiveChunk = activeChunk != null;
LOG.debug("Closing stream: targetPath=[{}], chunksCount=[{}], chunkSize=[{}], bytesWritten=[{}]",
this.targetPath, this.chunksCount, this.chunkSize, this.totalBytesWritten);
try {
if (this.multipartUpload == null) {
/* If multipart upload was not used, the amount of data written to the stream was less than the chunkSize.
* So, the current active activeChunk should be uploaded as a normal CREATE request.
* NOTE: even if zero bytes were written, an empty object should be created.
*/
if (hasActiveChunk) {
this.createTargetFromCurrentChunk();
} else {
LOG.warn("Stream had no active chunks and no multipart upload initialized");
}
} else {
if (this.chunksCount == 1) {
LOG.debug("Unnecessary use of Multipart Upload for a stream that contains just a single data chunk");
}
/* The multipart upload is in progress.
* Dispatch the last activeChunk and wait for completion.
*/
if (hasActiveChunk && activeChunk.getDataSize() > 0) {
LOG.debug("Dispatching last data chunk with [{}] bytes for upload", activeChunk.getDataSize());
this.uploadCurrentChunkAsync();
}
LOG.debug("Waiting for Multipart Upload to complete");
this.multipartUpload.complete();
LOG.debug("Multipart Upload complete! ETag of the target object(path={}) is [{}]", this.targetPath, this.multipartUpload.getETag());
}
} catch (final Exception ex) {
/* In case the operation fails, we need to abort the multipart upload
* in order to clean up the chunk objects that might have been created
*/
final String errorMessage = "An error occurred while closing the stream";
LOG.error(errorMessage, ex);
if (this.multipartUpload != null) {
LOG.debug("Aborting Multipart Upload");
this.multipartUpload.abort();
}
throw ex;
} finally {
this.cleanUpChunk();
}
}
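/**
 * Aborts the stream: marks it closed, aborts the in-progress multipart upload (if one was started), and
 * releases the current data chunk.
 */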
@Override
public AbortableResult abort() {
if (this.closed.getAndSet(true)) {
LOG.debug("Ignoring abort() as stream was already closed");
return new AbortableResultImpl(true, null);
}
LOG.debug("Aborting");
if (this.multipartUpload != null) {
  this.multipartUpload.abort();
}
try {
this.cleanUpChunk();
} catch (final IOException ex) {
LOG.warn("An error occurred during the chunks cleanup inside abort(): {}", ex);
}
return new AbortableResultImpl(false, null);
}
protected synchronized DataChunk getCurrentChunk() {
return this.currentChunk;
}
protected RetryPolicy getRetryPolicy() {
return this.retryPolicy;
}
protected synchronized void clearCurrentChunk() {
this.currentChunk = null;
}
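/**
 * Lazily creates the multipart upload, choosing {@link HdlfsKeepPendingMultipartUpload} when the upload
 * should remain pending after close, and the regular {@link HdlfsMultipartUpload} otherwise.
 */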
protected void initializeMultipartUpload() {
if (this.multipartUpload == null) {
if (this.keepPendingAfterClose) {
this.multipartUpload = new HdlfsKeepPendingMultipartUpload(this.targetPath,
this.fileSystem,
this.multipartUploadConfig,
this.progress,
this.retryPolicy);
} else {
this.multipartUpload = new HdlfsMultipartUpload(this.targetPath,
this.fileSystem,
this.multipartUploadConfig,
this.progress,
this.retryPolicy);
}
}
}
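/**
 * Dispatches the current data chunk for asynchronous upload as the next multipart-upload part and clears
 * it, so that the following write allocates a fresh chunk. May block depending on how many chunk uploads
 * are already in flight.
 */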
protected void uploadCurrentChunkAsync() {
LOG.debug("Dispatching chunk #{} for upload", this.chunksCount);
this.initializeMultipartUpload();
try {
this.multipartUpload.uploadChunkAsync(this.getCurrentChunk(), this.chunksCount - 1);
} finally {
/* Set the current chunk to null, so that the next write creates a new chunk. */
this.clearCurrentChunk();
}
LOG.debug("Chunk #{} dispatched for upload successfully", this.chunksCount);
}
protected AtomicBoolean getClosed() {
return this.closed;
}
protected DataChunk createNewDataChunk() {
return new DataChunk(this.chunkSize, this.getNextDataChunkId());
}
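/**
 * Lazily allocates a new data chunk when none is active, failing with {@link PayloadTooLargeException}
 * once the maximum number of chunks per multipart upload has been reached.
 */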
protected synchronized DataChunk createChunkIfNecessary() throws PayloadTooLargeException {
if (this.currentChunk == null) {
if (this.chunksCount >= HdlfsMultipartUpload.MAX_CHUNKS) {
final String message = "Number of chunks in the stream exceeds the limit: " + HdlfsMultipartUpload.MAX_CHUNKS;
LOG.error(message);
throw new PayloadTooLargeException(message);
}
LOG.debug("A new data chunk will be created; the active stream has [{}] chunks", this.chunksCount);
this.currentChunk = this.createNewDataChunk();
}
return this.currentChunk;
}
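/**
 * Uploads the current (single) data chunk as the target object via a WebHDFS CREATE request, retrying the
 * idempotent upload according to the configured retry policy, and closes the chunk afterwards.
 */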
protected void createTargetFromCurrentChunk() throws IOException {
final DataChunk chunk = this.getCurrentChunk();
LOG.debug("Creating target(path={}) from current data Chunk(size={})", this.targetPath, chunk.getDataSize());
final String operationTitle = String.format("Upload Chunk(path=%s, size=%d)", this.targetPath, chunk.getDataSize());
// no more writing to this chunk
final InputStream chunkInputStream = chunk.getInputStream();
try {
HdlfsRetryUtils.execWithRetry(operationTitle, this.retryPolicy, /* idempotent */ true, () -> {
/* Create new target object and write data from the active chunk to the new object output stream */
final WebHdfsFileSystem webHdfsFileSystem = this.fileSystem.getWebHdfsFileSystem();
/* Try to write Chunk data into CREATE output stream */
try (final OutputStream createOutputStream = webHdfsFileSystem.create(this.targetPath, this.permission, this.overwrite, this.chunkSize, this.replication, this.chunkSize, this.progress)) {
/* In case of failures, chunkInputStream will be kept open for retries - DataChunk.copyBytesTo marks and resets the stream */
com.sap.hana.datalake.files.utils.IOUtils.copyBytesTo(chunkInputStream, createOutputStream, chunk.getDataSize());
}
return null;
});
} catch (final Exception ex) {
final String errorMessage = String.format("Error creating target object(path=%s) from data Chunk(size=%d)", this.targetPath, chunk.getDataSize());
LOG.error(errorMessage, ex);
throw new IOException(errorMessage, ex);
} finally {
chunk.close();
}
this.clearCurrentChunk();
LOG.debug("Target file [{}] created", this.targetPath);
}
private void checkStreamIsOpen() throws IOException {
if (this.closed.get()) {
final String errorMessage = "Stream is already closed";
LOG.error(errorMessage);
throw new IOException(errorMessage);
}
}
private void cleanUpChunk() throws IOException {
IOUtils.closeQuietly(this.getCurrentChunk());
this.clearCurrentChunk();
super.close();
}
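/** Returns the next 0-indexed chunk id and increments the chunk counter. */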
private String getNextDataChunkId() {
Preconditions.checkArgument(this.chunksCount >= 0, "chunk count cannot be < 0");
this.chunksCount++;
return String.valueOf(this.chunksCount - 1); // 0-indexed chunk count
}
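/**
 * Builder for {@link HdlfsOutputStream}. The file system, multipart upload config, and target path are
 * required; the remaining settings are optional.
 */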
public static class Builder {
private HdlfsBaseFileSystem fileSystem;
private HdlfsMultipartUpload.Config multipartUploadConfig;
private FsPermission permission;
private Progressable progress;
private boolean overwrite;
private short replication;
private RetryPolicy retryPolicy;
private Path targetPath;
private boolean keepPendingAfterClose;
private DataChunk initialChunk;
public Builder() {
// no-op
}
public Builder(final Builder builder) {
this.fileSystem = builder.fileSystem;
this.multipartUploadConfig = builder.multipartUploadConfig;
this.permission = builder.permission;
this.progress = builder.progress;
this.overwrite = builder.overwrite;
this.replication = builder.replication;
this.retryPolicy = builder.retryPolicy;
this.targetPath = builder.targetPath;
this.keepPendingAfterClose = builder.keepPendingAfterClose;
this.initialChunk = builder.initialChunk;
}
public HdlfsOutputStream build() throws IOException {
return new HdlfsOutputStream(this);
}
public Builder withFileSystem(final HdlfsBaseFileSystem fileSystem) {
this.fileSystem = fileSystem;
return this;
}
public Builder withMultipartUploadConfig(final HdlfsMultipartUpload.Config multipartUploadConfig) {
this.multipartUploadConfig = multipartUploadConfig;
return this;
}
public Builder withPermission(final FsPermission permission) {
this.permission = permission;
return this;
}
public Builder withProgress(final Progressable progress) {
this.progress = progress;
return this;
}
public Builder withOverwrite(final boolean overwrite) {
this.overwrite = overwrite;
return this;
}
public Builder withReplication(final short replication) {
this.replication = replication;
return this;
}
public Builder withRetryPolicy(final RetryPolicy retryPolicy) {
this.retryPolicy = retryPolicy;
return this;
}
public Builder withTargetPath(final Path targetPath) {
this.targetPath = targetPath;
return this;
}
public Builder withInitialChunk(final DataChunk chunk) {
this.initialChunk = chunk;
return this;
}
Builder withKeepPendingAfterClose(final boolean keepPendingAfterClose) {
this.keepPendingAfterClose = keepPendingAfterClose;
return this;
}
private void validate() {
Preconditions.checkNotNull(this.fileSystem, "fileSystem must not be null");
Preconditions.checkNotNull(this.multipartUploadConfig, "multipartUploadConfig must not be null");
Preconditions.checkNotNull(this.targetPath, "targetPath must not be null");
}
}
}
// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.