// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.
package com.sap.hana.datalake.files;

import com.sap.hana.datalake.files.classification.InterfaceAudience;
import com.sap.hana.datalake.files.exception.PayloadTooLargeException;
import com.sap.hana.datalake.files.utils.AbortableResultImpl;
import com.sap.hana.datalake.files.utils.ArrayUtils;
import com.sap.hana.datalake.files.utils.DataChunk;
import com.sap.hana.datalake.files.utils.HdlfsRetryUtils;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Abortable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import com.sap.hana.datalake.files.shaded.org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicBoolean;

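/**
 * An {@link OutputStream} used by the HDLFS {@code FileSystem} implementation targeting
 * SAP HANA Data Lake Files. Written bytes are buffered into fixed-size {@link DataChunk}s:
 * if the whole payload fits into a single chunk, the target object is created with one
 * CREATE request on close; otherwise, full chunks are dispatched to an
 * {@link HdlfsMultipartUpload} as they fill up, and close() completes the upload.
 * The stream is also {@link Abortable}, so a pending upload can be cancelled.
 */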
@InterfaceAudience.Private
public class HdlfsOutputStream extends OutputStream implements Abortable {

  private static final Logger LOG = LoggerFactory.getLogger(HdlfsOutputStream.class);

  private final AtomicBoolean closed = new AtomicBoolean(false);
  private final byte[] singleByteBuffer = new byte[1];

  private final int chunkSize;
  private final HdlfsBaseFileSystem fileSystem;
  private final boolean keepPendingAfterClose;
  private final HdlfsMultipartUpload.Config multipartUploadConfig;
  private final FsPermission permission;
  private final Progressable progress;
  private final boolean overwrite;
  private final short replication;
  private final RetryPolicy retryPolicy;
  private final Path targetPath;
  private final String uuid;

  private int chunksCount = 0;
  private DataChunk currentChunk;
  private HdlfsMultipartUpload multipartUpload;
  private long totalBytesWritten = 0;

  private HdlfsOutputStream(final Builder builder) throws IOException {
    builder.validate();

    this.chunkSize = builder.multipartUploadConfig.getChunkSize();
    this.fileSystem = builder.fileSystem;
    this.keepPendingAfterClose = builder.keepPendingAfterClose;
    this.multipartUploadConfig = builder.multipartUploadConfig;
    this.permission = builder.permission;
    this.progress = builder.progress;
    this.overwrite = builder.overwrite;
    this.replication = builder.replication;
    this.retryPolicy = builder.retryPolicy;
    this.targetPath = builder.targetPath;
    this.uuid = UUID.randomUUID().toString();
    if (builder.initialChunk != null) {
      this.currentChunk = new DataChunk(builder.initialChunk, this.getNextDataChunkId());
    } else {
      this.currentChunk = this.createChunkIfNecessary();
    }

    if (this.multipartUploadConfig.isHdlfsOutputCommitterEnabled() && this.keepPendingAfterClose) {
      this.initializeMultipartUpload();
    }
  }

  public long getTotalBytesWritten() {
    return this.totalBytesWritten;
  }

  public int getChunkSize() {
    return this.chunkSize;
  }

  public HdlfsMultipartUpload.Config getMultipartUploadConfig() {
    return this.multipartUploadConfig;
  }

  public FsPermission getPermission() {
    return this.permission;
  }

  public Progressable getProgress() {
    return this.progress;
  }

  public short getReplication() {
    return this.replication;
  }

  public Path getTargetPath() {
    return this.targetPath;
  }

  public boolean isOverwrite() {
    return this.overwrite;
  }

  @Override
  public synchronized void write(final int byteValue) throws IOException {
    /* Synchronized so that concurrent single-byte writes cannot clobber the shared
     * singleByteBuffer before the (also synchronized) bulk write below consumes it. */
    this.singleByteBuffer[0] = (byte) byteValue;
    this.write(this.singleByteBuffer, /* offset */ 0, /* length */ 1);
  }

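  /**
   * Writes {@code length} bytes into the current chunk, spilling into new chunks as needed.
   *
   * Illustrative example (hypothetical sizes): with a chunk size of 8 MiB and 6 MiB already
   * buffered, a 5 MiB write stores the first 2 MiB in the current chunk, dispatches that
   * chunk for asynchronous upload, and recurses to write the remaining 3 MiB into a fresh chunk.
   */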
  @Override
  public synchronized void write(@Nonnull final byte[] data, final int offset, final int length) throws IOException {
    LOG.debug("Started write operation into stream with id=[{}] in chunk [{}]", this.uuid, this.chunksCount);

    ArrayUtils.validateBoundaries(data, offset, length);
    this.checkStreamIsOpen();

    if (length == 0) {
      return;
    }

    final DataChunk chunk = this.createChunkIfNecessary();
    final int writtenBytes = chunk.write(data, offset, length);
    final int chunkRemainingCapacity = chunk.getRemainingCapacity();

    LOG.debug("Wrote [{}] bytes to the current data chunk; the remaining capacity is [{}] bytes", writtenBytes, chunkRemainingCapacity);
    this.totalBytesWritten += writtenBytes;

    if (writtenBytes < length) {
      /* The current chunk ran out of space and there is still data left to be written.
       * So, dispatch the current chunk for upload and process the rest.
       */
      LOG.debug("Current data chunk ran out of space; dispatching it for upload");

      // Note: this call might block depending on the number of Chunks being currently uploaded
      this.uploadCurrentChunkAsync();

      final int leftOverOffset = offset + writtenBytes;
      final int leftOverLength = length - writtenBytes;

      this.write(data, leftOverOffset, leftOverLength);
    }

    LOG.debug("Write operation into stream with id=[{}] in chunk [{}] completed", this.uuid, this.chunksCount);
  }

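  /**
   * Flushes and finalizes the target object. Two paths are possible: if no multipart upload
   * was started (all written bytes fit in a single chunk), the buffered data is uploaded with
   * a single CREATE request; otherwise, the last chunk is dispatched and the multipart upload
   * is completed. On failure, the multipart upload (if any) is aborted so that already-uploaded
   * chunk objects are cleaned up.
   */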
  @Override
  public void close() throws IOException {
    if (this.closed.getAndSet(true)) {
      LOG.debug("Ignoring close() as stream was already closed");
      return;
    }

    final DataChunk activeChunk = this.getCurrentChunk();
    final boolean hasActiveChunk = activeChunk != null;

    LOG.debug("Closing stream: targetPath=[{}], chunksCount=[{}], chunkSize=[{}], bytesWritten=[{}]",
        this.targetPath, this.chunksCount, this.chunkSize, this.totalBytesWritten);

    try {
      if (this.multipartUpload == null) {
        /* If multipart upload was not used, the amount of data written to the stream was less than the chunkSize.
         * So, the currently active chunk should be uploaded as a normal CREATE request.
         * NOTE: even if zero bytes were written, an empty object should be created.
         */
        if (hasActiveChunk) {
          this.createTargetFromCurrentChunk();
        } else {
          LOG.warn("Stream had no active chunks and no multipart upload initialized");
        }
      } else {
        if (this.chunksCount == 1) {
          LOG.debug("Unnecessary use of Multipart Upload for a stream that contains just a single data chunk");
        }

        /* The multipart upload is in progress.
         * Dispatch the last active chunk and wait for completion.
         */
        if (hasActiveChunk && activeChunk.getDataSize() > 0) {
          LOG.debug("Dispatching last data chunk with [{}] bytes for upload", activeChunk.getDataSize());
          this.uploadCurrentChunkAsync();
        }

        LOG.debug("Waiting for Multipart Upload to complete");
        this.multipartUpload.complete();

        LOG.debug("Multipart Upload complete! ETag of the target object(path={}) is [{}]", this.targetPath, this.multipartUpload.getETag());
      }
    } catch (final Exception ex) {
      /* In case the operation fails, we need to abort the multipart upload
       * in order to clean up the chunk objects that might have been created
       */
      final String errorMessage = "An error occurred while closing the stream";
      LOG.error(errorMessage, ex);

      if (this.multipartUpload != null) {
        LOG.debug("Aborting Multipart Upload");
        this.multipartUpload.abort();
      }

      throw ex;
    } finally {
      this.cleanUpChunk();
    }
  }

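  /**
   * Aborts the stream: cancels the in-flight multipart upload (if any) and releases the
   * current chunk. The returned result carries {@code alreadyClosed = true} when the stream
   * had been closed or aborted before this call.
   */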
  @Override
  public AbortableResult abort() {
    if (this.closed.getAndSet(true)) {
      LOG.debug("Ignoring abort() as stream was already closed");
      return new AbortableResultImpl(true, null);
    }

    LOG.debug("Aborting");

    this.multipartUpload.abort();

    try {
      this.cleanUpChunk();
    } catch (final IOException ex) {
      LOG.warn("An error occurred during the chunks cleanup inside abort(): {}", ex);
    }

    return new AbortableResultImpl(false,  null);
  }

  protected synchronized DataChunk getCurrentChunk() {
    return this.currentChunk;
  }

  protected RetryPolicy getRetryPolicy() {
    return this.retryPolicy;
  }

  protected synchronized void clearCurrentChunk() {
    this.currentChunk = null;
  }

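  /**
   * Lazily creates the multipart upload on first use. When {@code keepPendingAfterClose} is
   * set (e.g. when the HDLFS output committer is enabled), an
   * {@link HdlfsKeepPendingMultipartUpload} is used so the upload remains pending after the
   * stream is closed; otherwise, a regular {@link HdlfsMultipartUpload} is used.
   */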
  protected void initializeMultipartUpload() {
    if (this.multipartUpload == null) {
      if (this.keepPendingAfterClose) {
        this.multipartUpload = new HdlfsKeepPendingMultipartUpload(this.targetPath,
                this.fileSystem,
                this.multipartUploadConfig,
                this.progress,
                this.retryPolicy);
      } else {
        this.multipartUpload = new HdlfsMultipartUpload(this.targetPath,
              this.fileSystem,
              this.multipartUploadConfig,
              this.progress,
              this.retryPolicy);
      }
    }
  }

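  /**
   * Dispatches the current chunk to the multipart upload (initializing it on first use) and
   * clears the reference so that the next write allocates a new chunk. May block depending on
   * the number of chunks currently being uploaded (see the note in write()).
   */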
  protected void uploadCurrentChunkAsync() {
    LOG.debug("Dispatching chunk #{} for upload", this.chunksCount);
    this.initializeMultipartUpload();

    try {
      this.multipartUpload.uploadChunkAsync(this.getCurrentChunk(), this.chunksCount - 1);
    } finally {
      /* Set the current chunk to null, so that the next write creates a new chunk. */
      this.clearCurrentChunk();
    }

    LOG.debug("Chunk #{} dispatched for upload successfully", this.chunksCount);
  }

  protected AtomicBoolean getClosed() {
    return this.closed;
  }

  protected DataChunk createNewDataChunk() {
    return new DataChunk(this.chunkSize, this.getNextDataChunkId());
  }

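  /**
   * Returns the current chunk, allocating a new one if none is active. The number of chunks
   * per stream is capped at {@link HdlfsMultipartUpload#MAX_CHUNKS}, so the maximum payload a
   * stream can accept is roughly {@code MAX_CHUNKS * chunkSize} bytes; exceeding it raises a
   * {@link PayloadTooLargeException}.
   */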
  protected synchronized DataChunk createChunkIfNecessary() throws PayloadTooLargeException {
    if (this.currentChunk == null) {
      if (this.chunksCount >= HdlfsMultipartUpload.MAX_CHUNKS) {
        final String message = "Number of chunks in the stream exceeds the limit: " + HdlfsMultipartUpload.MAX_CHUNKS;
        LOG.error(message);
        throw new PayloadTooLargeException(message);
      }

      LOG.debug("A new data chunk will be created; the active stream has [{}] chunks", this.chunksCount);
      this.currentChunk = this.createNewDataChunk();
    }

    return this.currentChunk;
  }

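  /**
   * Uploads the buffered data with a single WebHDFS CREATE request instead of a multipart
   * upload; used when the whole payload fits in one chunk. The CREATE is retried according to
   * the configured {@link RetryPolicy}; the chunk's input stream supports mark/reset, so
   * retries re-read the same buffered bytes.
   */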
  protected void createTargetFromCurrentChunk() throws IOException {
    final DataChunk chunk = this.getCurrentChunk();
    LOG.debug("Creating target(path={}) from current data Chunk(size={})", this.targetPath, chunk.getDataSize());
    final String operationTitle = String.format("Upload Chunk(path=%s, size=%d)", this.targetPath, chunk.getDataSize());

    // no more writing to this chunk
    final InputStream chunkInputStream = chunk.getInputStream();

    try {
      HdlfsRetryUtils.execWithRetry(operationTitle, this.retryPolicy, /* idempotent */ true, () -> {
        /* Create new target object and write data from the active chunk to the new object output stream */
        final WebHdfsFileSystem webHdfsFileSystem = this.fileSystem.getWebHdfsFileSystem();

        /* Try to write Chunk data into CREATE output stream */
        try (final OutputStream createOutputStream = webHdfsFileSystem.create(this.targetPath, this.permission, this.overwrite, this.chunkSize, this.replication, this.chunkSize, this.progress)) {
          /* In case of failures, chunkInputStream is kept open for retries - IOUtils.copyBytesTo marks and resets the stream */
          com.sap.hana.datalake.files.utils.IOUtils.copyBytesTo(chunkInputStream, createOutputStream, chunk.getDataSize());
        }

        return null;
      });
    } catch (final Exception ex) {
      final String errorMessage = String.format("Error creating target object(path=%s) from data Chunk(size=%d)", this.targetPath, chunk.getDataSize());
      LOG.error(errorMessage, ex);
      throw new IOException(errorMessage, ex);
    } finally {
      chunk.close();
    }

    this.clearCurrentChunk();
    LOG.debug("Target file [{}] created", this.targetPath);
  }

  private void checkStreamIsOpen() throws IOException {
    if (this.closed.get()) {
      final String errorMessage = "Stream is already closed";
      LOG.error(errorMessage);
      throw new IOException(errorMessage);
    }
  }

  private void cleanUpChunk() throws IOException {
    IOUtils.closeQuietly(this.getCurrentChunk());
    this.clearCurrentChunk();
    super.close();
  }

  private String getNextDataChunkId() {
    Preconditions.checkArgument(this.chunksCount >= 0, "chunk count cannot be < 0");

    final String chunkId = String.valueOf(this.chunksCount); // chunk ids are 0-indexed
    this.chunksCount++;

    return chunkId;
  }

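  /**
   * Fluent builder for {@link HdlfsOutputStream}. A minimal usage sketch (hypothetical
   * caller; {@code fs}, {@code mpuConfig} and {@code retryPolicy} are assumed to be
   * initialized elsewhere):
   *
   * <pre>{@code
   * try (final HdlfsOutputStream out = new HdlfsOutputStream.Builder()
   *     .withFileSystem(fs)
   *     .withMultipartUploadConfig(mpuConfig)
   *     .withTargetPath(new Path("/my/target/object"))
   *     .withOverwrite(true)
   *     .withRetryPolicy(retryPolicy)
   *     .build()) {
   *   out.write(payload, 0, payload.length);
   * }
   * }</pre>
   *
   * fileSystem, multipartUploadConfig and targetPath are mandatory (see {@link #validate()}).
   */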
  public static class Builder {

    private HdlfsBaseFileSystem fileSystem;
    private HdlfsMultipartUpload.Config multipartUploadConfig;
    private FsPermission permission;
    private Progressable progress;
    private boolean overwrite;
    private short replication;
    private RetryPolicy retryPolicy;
    private Path targetPath;
    private boolean keepPendingAfterClose;
    private DataChunk initialChunk;

    public Builder() {
      // no-op
    }

    public Builder(final Builder builder) {
      this.fileSystem = builder.fileSystem;
      this.multipartUploadConfig = builder.multipartUploadConfig;
      this.permission = builder.permission;
      this.progress = builder.progress;
      this.overwrite = builder.overwrite;
      this.replication = builder.replication;
      this.retryPolicy = builder.retryPolicy;
      this.targetPath = builder.targetPath;
      this.keepPendingAfterClose = builder.keepPendingAfterClose;
      this.initialChunk = builder.initialChunk;
    }

    public HdlfsOutputStream build() throws IOException {
      return new HdlfsOutputStream(this);
    }

    public Builder withFileSystem(final HdlfsBaseFileSystem fileSystem) {
      this.fileSystem = fileSystem;
      return this;
    }

    public Builder withMultipartUploadConfig(final HdlfsMultipartUpload.Config multipartUploadConfig) {
      this.multipartUploadConfig = multipartUploadConfig;
      return this;
    }

    public Builder withPermission(final FsPermission permission) {
      this.permission = permission;
      return this;
    }

    public Builder withProgress(final Progressable progress) {
      this.progress = progress;
      return this;
    }

    public Builder withOverwrite(final boolean overwrite) {
      this.overwrite = overwrite;
      return this;
    }

    public Builder withReplication(final short replication) {
      this.replication = replication;
      return this;
    }

    public Builder withRetryPolicy(final RetryPolicy retryPolicy) {
      this.retryPolicy = retryPolicy;
      return this;
    }

    public Builder withTargetPath(final Path targetPath) {
      this.targetPath = targetPath;
      return this;
    }

    public Builder withInitialChunk(final DataChunk chunk) {
      this.initialChunk = chunk;
      return this;
    }

    Builder withKeepPendingAfterClose(final boolean keepPendingAfterClose) {
      this.keepPendingAfterClose = keepPendingAfterClose;
      return this;
    }

    private void validate() {
      Preconditions.checkNotNull(this.fileSystem, "fileSystem must not be null");
      Preconditions.checkNotNull(this.multipartUploadConfig, "multipartUploadConfig must not be null");
      Preconditions.checkNotNull(this.targetPath, "targetPath must not be null");
    }

  }

}

// © 2022-2024 SAP SE or an SAP affiliate company. All rights reserved.
