com.sap.hana.datalake.files.HdlfsOutputStream Maven / Gradle / Ivy

An implementation of org.apache.hadoop.fs.FileSystem targeting SAP HANA Data Lake Files.
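HdlfsOutputStream is built through its nested Builder; fileSystem, multipartUploadConfig, and targetPath are mandatory (see Builder.validate() at the bottom of the class). The sketch below shows the minimal wiring under the assumption that the caller already holds an HdlfsFileSystem and an HdlfsMultipartUpload.Config instance; how those are constructed is outside the scope of this file, and the example class name is made up for illustration.

import com.sap.hana.datalake.files.HdlfsFileSystem;
import com.sap.hana.datalake.files.HdlfsMultipartUpload;
import com.sap.hana.datalake.files.HdlfsOutputStream;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

// Hypothetical example class; fs and config are assumed to be provided by the caller.
public class HdlfsOutputStreamExample {

  static void writeObject(final HdlfsFileSystem fs,
                          final HdlfsMultipartUpload.Config config,
                          final Path target) throws IOException {
    // try-with-resources works because HdlfsOutputStream extends java.io.OutputStream;
    // close() either issues a single CREATE (small payloads) or completes the multipart upload.
    try (final OutputStream out = new HdlfsOutputStream.Builder()
            .withFileSystem(fs)
            .withMultipartUploadConfig(config)
            .withTargetPath(target)
            .withOverwrite(true)
            .build()) {
      out.write("hello, hdlfs".getBytes(StandardCharsets.UTF_8));
    }
  }
}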
// © 2022 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files;

import com.sap.hana.datalake.files.exception.PayloadTooLargeException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicBoolean;

public class HdlfsOutputStream extends OutputStream {

  private static final Logger LOG = LoggerFactory.getLogger(HdlfsOutputStream.class);

  private final AtomicBoolean closed = new AtomicBoolean(false);
  private final byte[] singleByteBuffer = new byte[1];

  private final int chunkSize;
  private final HdlfsFileSystem fileSystem;
  private final boolean keepPendingAfterClose;
  private final HdlfsMultipartUpload.Config multipartUploadConfig;
  private final FsPermission permission;
  private final Progressable progress;
  private final boolean overwrite;
  private final short replication;
  private final RetryPolicy retryPolicy;
  private final Path targetPath;
  private final String uuid;

  private int chunksCount = 0;
  private HdlfsMultipartUpload.Chunk currentChunk;
  private HdlfsMultipartUpload multipartUpload;
  private long totalBytesWritten = 0;

  private HdlfsOutputStream(final Builder builder) throws IOException {
    builder.validate();

    this.chunkSize = builder.multipartUploadConfig.getChunkSize();
    this.fileSystem = builder.fileSystem;
    this.keepPendingAfterClose = builder.keepPendingAfterClose;
    this.multipartUploadConfig = builder.multipartUploadConfig;
    this.permission = builder.permission;
    this.progress = builder.progress;
    this.overwrite = builder.overwrite;
    this.replication = builder.replication;
    this.retryPolicy = builder.retryPolicy;
    this.targetPath = builder.targetPath;
    this.uuid = UUID.randomUUID().toString();
    this.currentChunk = this.createChunkIfNecessary();
  }
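
  /* Note: the constructor eagerly allocates chunk #0, so a stream that is closed
   * without any writes still holds an active (empty) chunk, and close() creates
   * an empty object at targetPath.
   */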

  public long getTotalBytesWritten() {
    return this.totalBytesWritten;
  }

  public int getChunkSize() {
    return this.chunkSize;
  }

  public HdlfsMultipartUpload.Config getMultipartUploadConfig() {
    return this.multipartUploadConfig;
  }

  public FsPermission getPermission() {
    return this.permission;
  }

  public Progressable getProgress() {
    return this.progress;
  }

  public short getReplication() {
    return this.replication;
  }

  public Path getTargetPath() {
    return this.targetPath;
  }

  public boolean isOverwrite() {
    return this.overwrite;
  }

  @Override
  public synchronized void write(final int byteValue) throws IOException {
    // Synchronized so that concurrent single-byte writes do not clobber the shared buffer
    this.singleByteBuffer[0] = (byte) byteValue;
    this.write(this.singleByteBuffer, /* offset */ 0, /* length */ 1);
  }

  @Override
  public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    LOG.debug("Started write operation into stream with id=[{}] in chunk [{}]", this.uuid, this.chunksCount);

    this.validateWriteArgs(data, offset, length);
    this.checkStreamIsOpen();

    if (length == 0) {
      return;
    }

    final HdlfsMultipartUpload.Chunk chunk = this.createChunkIfNecessary();
    final int writtenBytes = chunk.write(data, offset, length);
    final int chunkRemainingCapacity = chunk.getRemainingCapacity();

    LOG.debug("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunkRemainingCapacity);
    this.totalBytesWritten += writtenBytes;

    if (writtenBytes < length) {
      /* The current chunk ran out of space and there is still data left to be written.
       * So, dispatch the current chunk for upload and process the rest.
       */
      LOG.debug("Current data chunk ran out of space; dispatching it for upload");

      // Note: this call might block depending on the number of Chunks being currently uploaded
      this.uploadCurrentChunkAsync();

      final int leftOverOffset = offset + writtenBytes;
      final int leftOverLength = length - writtenBytes;

      this.write(data, leftOverOffset, leftOverLength);
    }

    LOG.debug("Write operation into stream with id=[{}] in chunk [{}] completed", this.uuid, this.chunksCount);
  }
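
  /* Worked example of the boundary handling above (illustrative numbers, not a
   * real configuration): with chunkSize = 4, a single write(data, 0, 10) puts
   * bytes [0..3] into chunk #0, which fills up and is dispatched; the recursive
   * call write(data, 4, 6) fills chunk #1 with bytes [4..7] and dispatches it;
   * the final call write(data, 8, 2) leaves bytes [8..9] in an open chunk #2,
   * ready for further writes or for close().
   */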

  @Override
  public void close() throws IOException {
    if (this.closed.getAndSet(true)) {
      LOG.debug("Ignoring close() as stream was already closed");
      return;
    }

    final HdlfsMultipartUpload.Chunk activeChunk = this.getCurrentChunk();
    final boolean hasActiveChunk = activeChunk != null;

    LOG.debug("Closing stream: targetPath=[{}], chunksCount=[{}], chunkSize=[{}], bytesWritten=[{}]",
            this.targetPath, this.chunksCount, this.chunkSize, this.totalBytesWritten);

    try {
      if (this.multipartUpload == null && !this.getMultipartUploadConfig().isHdlfsOutputCommitterEnabled()) {
        /* If multipart upload was not used, all data written to the stream fit within a single
         * chunk (at most chunkSize bytes). So, the current active chunk should be uploaded as
         * a normal CREATE request.
         * NOTE: even if zero bytes were written, an empty object should be created.
         */
        if (hasActiveChunk) {
          this.createTargetFromCurrentChunk();
        } else {
          LOG.warn("Stream had no active chunks and no multipart upload initialized");
        }
      } else {
        if (this.chunksCount == 1) {
          LOG.debug("Unnecessary use of Multipart Upload for a stream that contains just a single data chunk");
        }

        /* The multipart upload is in progress.
         * Dispatch the last active chunk (if it holds any data) and wait for completion.
         */
        if (hasActiveChunk && activeChunk.getWrittenBytesCount() > 0) {
          LOG.debug("Dispatching last data chunk with [{}] bytes for upload", activeChunk.getWrittenBytesCount());
          this.uploadCurrentChunkAsync();
        }

        LOG.debug("Waiting for Multipart Upload to complete");
        this.multipartUpload.complete();

        LOG.debug("Multipart Upload complete! ETag of the target object(path={}) is [{}]", this.targetPath, this.multipartUpload.getETag());
      }
    } catch (final Exception ex) {
      /* In case the operation fails, we need to abort the multipart upload
       * in order to clean up the chunk objects that might have been created
       */
      final String errorMessage = "An error occurred while closing the stream";
      LOG.error(errorMessage, ex);

      if (this.multipartUpload != null) {
        LOG.debug("Aborting Multipart Upload");
        this.multipartUpload.abort();
      }

      throw ex;
    } finally {
      // Clean up
      IOUtils.closeQuietly(this.getCurrentChunk());
      this.clearCurrentChunk();
      super.close();
    }
  }

  protected synchronized HdlfsMultipartUpload.Chunk getCurrentChunk() {
    return this.currentChunk;
  }

  protected RetryPolicy getRetryPolicy() {
    return this.retryPolicy;
  }

  protected synchronized void clearCurrentChunk() {
    this.currentChunk = null;
  }

  protected void initializeMultipartUpload() {
    if (this.multipartUpload == null) {
      if (this.keepPendingAfterClose) {
        this.multipartUpload = new HdlfsKeepPendingMultipartUpload(this.targetPath,
                this.fileSystem,
                this.multipartUploadConfig,
                this.progress,
                this.retryPolicy);
      } else {
        this.multipartUpload = new HdlfsMultipartUpload(this.targetPath,
              this.fileSystem,
              this.multipartUploadConfig,
              this.progress,
              this.retryPolicy);
      }
    }
  }
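
  /* Assumption (not verifiable from this file alone): keepPendingAfterClose selects
   * HdlfsKeepPendingMultipartUpload so that finalization of the upload can happen
   * after close(), e.g. in a later commit step; the exact contract of that class
   * is defined elsewhere.
   */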

  protected void uploadCurrentChunkAsync() {
    // Chunk ids are zero-based, so the chunk being dispatched is #(chunksCount - 1)
    LOG.debug("Dispatching chunk #{} for upload", this.chunksCount - 1);
    this.initializeMultipartUpload();

    try {
      this.multipartUpload.uploadChunkAsync(this.getCurrentChunk());
    } finally {
      /* Set the current chunk to null, so that the next write creates a new chunk. */
      this.clearCurrentChunk();
    }

    LOG.debug("Chunk #{} dispatched for upload successfully", this.chunksCount - 1);
  }

  protected AtomicBoolean getClosed() {
    return this.closed;
  }

  protected synchronized HdlfsMultipartUpload.Chunk createChunkIfNecessary() throws PayloadTooLargeException {
    if (this.currentChunk == null) {
      if (this.chunksCount >= HdlfsMultipartUpload.MAX_CHUNKS) {
        final String message = "Number of chunks in the stream exceeds the limit: " + HdlfsMultipartUpload.MAX_CHUNKS;
        LOG.error(message);
        throw new PayloadTooLargeException(message);
      }

      LOG.debug("A new data chunk will be created; the active stream has [{}] chunks", this.chunksCount);
      this.currentChunk = new HdlfsMultipartUpload.Chunk(this.chunksCount, this.chunkSize);
      this.chunksCount++;
    }

    return this.currentChunk;
  }
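
  /* The hard cap on a stream's payload is MAX_CHUNKS * chunkSize bytes. As a
   * purely illustrative calculation (the real constant lives in
   * HdlfsMultipartUpload): with MAX_CHUNKS = 10_000 and chunkSize = 8 MiB, the
   * cap would be 10_000 x 8 MiB, roughly 78 GiB; the first write that needs one
   * chunk beyond MAX_CHUNKS fails with PayloadTooLargeException above.
   */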

  protected void createTargetFromCurrentChunk() throws IOException {
    final HdlfsMultipartUpload.Chunk chunk = this.getCurrentChunk();

    LOG.debug("Creating target(path={}) from current data chunk(writtenBytes={})", this.targetPath, chunk.getWrittenBytesCount());

    final WebHdfsFileSystem delegateFs = (WebHdfsFileSystem) this.fileSystem.getDelegateFs();
    /* Create new target object and write data from the active chunk to the new object output stream */
    try (final OutputStream targetOutputStream = delegateFs.create(this.targetPath, this.permission, this.overwrite, this.chunkSize, this.replication, this.chunkSize, this.progress)) {
      /* Both CREATE output stream and Chunk will be closed and resources will be deallocated! */
      chunk.writeBytesTo(targetOutputStream, /* keepChunkOpen */ false);
    } catch (final Exception ex) {
      final String errorMessage = String.format("Error creating target object(path=%s) from data chunk(writtenBytes=%d)", this.targetPath, chunk.getWrittenBytesCount());
      LOG.error(errorMessage, ex);
      throw new IOException(errorMessage, ex);
    } finally {
      /* Making sure the Chunk is indeed closed! */
      chunk.close();
    }

    this.clearCurrentChunk();
    LOG.debug("Target file [{}] created", this.targetPath);
  }

  private void checkStreamIsOpen() throws IOException {
    if (this.closed.get()) {
      final String errorMessage = "Stream is already closed";
      LOG.error(errorMessage);
      throw new IOException(errorMessage);
    }
  }

  private void validateWriteArgs(final byte[] data, final int offset, final int length) {
    final int lastByteOffset = offset + length;
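    // If offset + length overflows int, lastByteOffset turns negative and is rejected below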

    Preconditions.checkNotNull(data);

    if (offset < 0 || offset > data.length || length < 0 || lastByteOffset > data.length || lastByteOffset < 0) {
      final String errorMessage = String.format("Invalid write arguments: data(length=%d), offset=%d, length=%d", data.length, offset, length);
      LOG.error(errorMessage);
      throw new IndexOutOfBoundsException(errorMessage);
    }
  }

  public static class Builder {

    private HdlfsFileSystem fileSystem;
    private HdlfsMultipartUpload.Config multipartUploadConfig;
    private FsPermission permission;
    private Progressable progress;
    private boolean overwrite;
    private short replication;
    private RetryPolicy retryPolicy;
    private Path targetPath;
    private boolean keepPendingAfterClose;

    public Builder() {
      // no-op
    }

    public Builder(final Builder builder) {
      this.fileSystem = builder.fileSystem;
      this.multipartUploadConfig = builder.multipartUploadConfig;
      this.permission = builder.permission;
      this.progress = builder.progress;
      this.overwrite = builder.overwrite;
      this.replication = builder.replication;
      this.retryPolicy = builder.retryPolicy;
      this.targetPath = builder.targetPath;
      this.keepPendingAfterClose = builder.keepPendingAfterClose;
    }

    public HdlfsOutputStream build() throws IOException {
      return new HdlfsOutputStream(this);
    }

    public Builder withFileSystem(final HdlfsFileSystem fileSystem) {
      this.fileSystem = fileSystem;
      return this;
    }

    public Builder withMultipartUploadConfig(final HdlfsMultipartUpload.Config multipartUploadConfig) {
      this.multipartUploadConfig = multipartUploadConfig;
      return this;
    }

    public Builder withPermission(final FsPermission permission) {
      this.permission = permission;
      return this;
    }

    public Builder withProgress(final Progressable progress) {
      this.progress = progress;
      return this;
    }

    public Builder withOverwrite(final boolean overwrite) {
      this.overwrite = overwrite;
      return this;
    }

    public Builder withReplication(final short replication) {
      this.replication = replication;
      return this;
    }

    public Builder withRetryPolicy(final RetryPolicy retryPolicy) {
      this.retryPolicy = retryPolicy;
      return this;
    }

    public Builder withTargetPath(final Path targetPath) {
      this.targetPath = targetPath;
      return this;
    }

    Builder withKeepPendingAfterClose(final boolean keepPendingAfterClose) {
      this.keepPendingAfterClose = keepPendingAfterClose;
      return this;
    }

    private void validate() {
      Preconditions.checkNotNull(this.fileSystem, "fileSystem must not be null");
      Preconditions.checkNotNull(this.multipartUploadConfig, "multipartUploadConfig must not be null");
      Preconditions.checkNotNull(this.targetPath, "targetPath must not be null");
    }

  }

}
