/*
 * Copyright 2021 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.s3;

import com.hazelcast.function.FunctionEx;
import com.hazelcast.function.SupplierEx;
import com.hazelcast.internal.util.ExceptionUtil;
import com.hazelcast.internal.util.StringUtil;
import com.hazelcast.jet.pipeline.Sink;
import com.hazelcast.jet.pipeline.SinkBuilder;
import com.hazelcast.memory.MemoryUnit;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest;
import software.amazon.awssdk.services.s3.model.CompletedPart;
import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest;
import software.amazon.awssdk.services.s3.model.UploadPartRequest;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import static com.hazelcast.internal.util.JVMUtil.upcast;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Contains factory methods for creating AWS S3 sinks.
 */
public final class S3Sinks {

    private S3Sinks() {
    }

    /**
 * Convenience for {@link #s3(String, String, Charset, SupplierEx, FunctionEx)}.
     * Uses {@link Object#toString()} to convert the items to lines.
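     * <p>
     * A minimal usage sketch (the bucket and list names below are placeholder
     * values, not part of this API):
     * <pre>{@code
     * Pipeline p = Pipeline.create();
     * p.readFrom(Sources.list("input-list"))
     *  .writeTo(S3Sinks.s3("my-bucket", () -> S3Client.create()));
     * }</pre>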
     */
    @Nonnull
    public static <T> Sink<? super T> s3(
            @Nonnull String bucketName,
            @Nonnull SupplierEx<? extends S3Client> clientSupplier
    ) {
        return s3(bucketName, null, StandardCharsets.UTF_8, clientSupplier, Object::toString);
    }

    /**
 * Creates an AWS S3 {@link Sink} which writes items to files in the
 * given bucket. The sink converts each item to a string using the given
 * {@code toStringFn} and writes it as a line. The sink creates a file
 * in the bucket for each processor instance. The name of the file includes
 * the user-provided prefix (if defined) and the processor's global index;
 * for example, the processor with index 2 and prefix {@code my-object-}
 * will create the object {@code my-object-2}.
 * <p>
 * No state is saved to snapshot for this sink. If the job is restarted,
 * previously written files will be overwritten.
 * <p>
 * The default local parallelism for this sink is 1.
 * <p>
 * Here is an example which reads from a map and writes the entries
 * to the given bucket, using {@link Object#toString()} to convert the
 * values to a line.
 *
 * <pre>{@code
     * Pipeline p = Pipeline.create();
     * p.readFrom(Sources.map("map"))
     *  .writeTo(S3Sinks.s3("bucket", "my-map-", StandardCharsets.UTF_8,
     *      () -> S3Client.create(),
     *      Object::toString
     * ));
     * }</pre>
     *
     * @param <T>            type of the items the sink accepts
     * @param bucketName     the name of the bucket
     * @param prefix         the prefix to be included in the file name
     * @param charset        the charset to be used when encoding the strings
     * @param clientSupplier S3 client supplier
     * @param toStringFn     the function which converts each item to its
     *                       string representation
     */
    @Nonnull
    public static <T> Sink<? super T> s3(
            @Nonnull String bucketName,
            @Nullable String prefix,
            @Nonnull Charset charset,
            @Nonnull SupplierEx<? extends S3Client> clientSupplier,
            @Nonnull FunctionEx<? super T, String> toStringFn
    ) {
        String charsetName = charset.name();
        return SinkBuilder
                .sinkBuilder("s3Sink", context -> new S3SinkContext<>(bucketName, prefix, charsetName,
                        context.globalProcessorIndex(), toStringFn, clientSupplier))
                .<T>receiveFn(S3SinkContext::receive)
                .flushFn(S3SinkContext::flush)
                .destroyFn(S3SinkContext::close)
                .build();
    }

    static final class S3SinkContext<T> {

        // AWS multipart uploads allow at most 10,000 parts per upload
        static final int DEFAULT_MAXIMUM_PART_NUMBER = 10000;
        static final int MINIMUM_PART_NUMBER = 1;
        // visible for testing
        static int maximumPartNumber = DEFAULT_MAXIMUM_PART_NUMBER;

        // the minimum size required for each part in AWS multipart
        static final int DEFAULT_MINIMUM_UPLOAD_PART_SIZE = (int) MemoryUnit.MEGABYTES.toBytes(5);
        static final double BUFFER_SCALE = 1.2d;

        private final String bucketName;
        private final String prefix;
        private final int processorIndex;
        private final S3Client s3Client;
        private final FunctionEx<? super T, String> toStringFn;
        private final Charset charset;
        private final byte[] lineSeparatorBytes;
        private final List<CompletedPart> completedParts = new ArrayList<>();

        private ByteBuffer buffer;
        private int partNumber = MINIMUM_PART_NUMBER; // must be between 1 and maximumPartNumber
        private int fileNumber;
        private String uploadId;

        private S3SinkContext(
                String bucketName,
                @Nullable String prefix,
                String charsetName,
                int processorIndex,
                FunctionEx<? super T, String> toStringFn,
                SupplierEx<? extends S3Client> clientSupplier
        ) {
            this.bucketName = bucketName;
            String trimmedPrefix = StringUtil.trim(prefix);
            this.prefix = StringUtil.isNullOrEmpty(trimmedPrefix) ? "" : trimmedPrefix;
            this.processorIndex = processorIndex;
            this.s3Client = clientSupplier.get();
            this.toStringFn = toStringFn;
            this.charset = Charset.forName(charsetName);
            this.lineSeparatorBytes = System.lineSeparator().getBytes(charset);

            checkIfBucketExists();
            resizeBuffer(DEFAULT_MINIMUM_UPLOAD_PART_SIZE);
        }

        private void initiateUpload() {
            CreateMultipartUploadRequest req = CreateMultipartUploadRequest
                    .builder()
                    .bucket(bucketName)
                    .key(key())
                    .build();
            uploadId = s3Client.createMultipartUpload(req).uploadId();
        }

        // fails fast if the bucket does not exist or is not accessible
        private void checkIfBucketExists() {
            s3Client.getBucketLocation(b -> b.bucket(bucketName));
        }

        private void receive(T item) {
            byte[] bytes = toStringFn.apply(item).getBytes(charset);
            int length = bytes.length + lineSeparatorBytes.length;
            // not enough space in buffer to write
            if (buffer.remaining() < length) {
                // we try to flush the current buffer first
                flush();
                // this might not be enough - either item is bigger than current
                // buffer size or there was not enough data in the buffer to upload
                // in this case we have to resize the buffer to hold more data
                if (buffer.remaining() < length) {
                    resizeBuffer(length + buffer.position());
                }
            }
            buffer.put(bytes);
            buffer.put(lineSeparatorBytes);
        }

        private void resizeBuffer(int minimumLength) {
            assert buffer == null || buffer.position() < minimumLength;

            int newCapacity = (int) (minimumLength * BUFFER_SCALE);
            ByteBuffer newBuffer = ByteBuffer.allocateDirect(newCapacity);
            if (buffer != null) {
                upcast(buffer).flip();
                newBuffer.put(buffer);
            }
            buffer = newBuffer;
        }

        private void flush() {
            if (uploadId == null) {
                initiateUpload();
            }
            // S3 requires every part except the last to be at least 5 MB,
            // so upload only once the buffer exceeds that threshold
            if (buffer.position() > DEFAULT_MINIMUM_UPLOAD_PART_SIZE) {
                boolean isLastPart = partNumber == maximumPartNumber;
                flushBuffer(isLastPart);
            }
        }

        private void close() {
            try {
                flushBuffer(true);
            } finally {
                s3Client.close();
            }
        }

        private void flushBuffer(boolean isLastPart) {
            if (buffer.position() > 0) {
                upcast(buffer).flip();
                UploadPartRequest req = UploadPartRequest
                        .builder()
                        .bucket(bucketName)
                        .key(key())
                        .uploadId(uploadId)
                        .partNumber(partNumber)
                        .build();
                String eTag = s3Client.uploadPart(req, RequestBody.fromByteBuffer(buffer)).eTag();
                completedParts.add(CompletedPart.builder().partNumber(partNumber).eTag(eTag).build());
                partNumber++;
                upcast(buffer).clear();
            }

            if (isLastPart) {
                completeUpload();
            }
        }

        private void completeUpload() {
            try {
                if (completedParts.isEmpty()) {
                    abortUpload();
                } else {
                    CompleteMultipartUploadRequest req = CompleteMultipartUploadRequest
                            .builder()
                            .bucket(bucketName)
                            .key(key())
                            .uploadId(uploadId)
                            .multipartUpload(b -> b.parts(completedParts))
                            .build();
                    s3Client.completeMultipartUpload(req);
                    completedParts.clear();
                    partNumber = MINIMUM_PART_NUMBER;
                    uploadId = null;
                    fileNumber++;
                }
            } catch (Exception e) {
                abortUpload();
                ExceptionUtil.rethrow(e);
            }
        }

        private void abortUpload() {
            s3Client.abortMultipartUpload(b -> b.uploadId(uploadId).bucket(bucketName).key(key()));
        }

        // the object key: prefix + processor index, with a numeric suffix for
        // each additional object the processor had to start
        private String key() {
            return prefix + processorIndex + (fileNumber == 0 ? "" : "." + fileNumber);
        }
    }
}