// io.github.nejckorasa.s3.unzip.strategy.SplitTextUnzipStrategy (Maven / Gradle / Ivy)
//
// Go to download
package io.github.nejckorasa.s3.unzip.strategy;

import com.amazonaws.services.s3.AmazonS3;
import io.github.nejckorasa.s3.unzip.S3UnzipException;
import io.github.nejckorasa.s3.unzip.S3ZipFile;
import io.github.nejckorasa.s3.upload.S3MultipartUpload;
import lombok.*;
import lombok.extern.slf4j.Slf4j;

import java.io.ByteArrayOutputStream;
import java.util.Scanner;

import static com.amazonaws.services.s3.internal.Constants.MB;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Unzips and uploads a text file with splitting (sharding) - it creates a 1:n mapping between zipped and unzipped files.
 * <p>
 * It reads the file as a UTF-8 text file split into lines. Every line written to a shard is terminated with
 * {@link #LINE_BREAK}, regardless of the line terminator used in the source.
 * <p>
 * Set {@link #header} to 'true' if zipped file contains a header that needs to be included with every split file/shard
 * (e.g. csv files). Defaults to false.
 * <p>
 * This strategy is suitable for larger files as it splits them into smaller, more manageable unzipped files (shards).
 * Shards are uploaded under the zip file's output prefix as {@code <shard number>-<filename>}, numbered from 1.
 * <p>
 * Utilizes multipart upload - unzipping is achieved without keeping all data in memory or writing to disk.
 */
@Slf4j
@NoArgsConstructor
@AllArgsConstructor(access = AccessLevel.PRIVATE)
public class SplitTextUnzipStrategy implements UnzipStrategy {

    /**
     * Line terminator appended to every line written to a shard.
     */
    public static final String LINE_BREAK = "\n";

    /**
     * S3 multipart upload part limit in bytes.
     * <p>
     * The limit is checked once per line: the buffered data is uploaded as a part on the first line read after the
     * buffered bytes have reached this limit, so uploaded parts are slightly larger than the limit.
     *
     * @see S3MultipartUpload
     */
    @With
    private int uploadPartBytesLimit = 20 * MB;

    /**
     * Add a header line to all files, i.e. first line of the source zip entry will be replicated to all output files
     */
    @With
    private boolean header = false;

    /**
     * File (shard) size limit, i.e. 100 MB will split the source zip entry into files with size limit of 100 MB.
     * <p>
     * The limit is only evaluated when a part is about to be uploaded, so a shard can exceed it by up to roughly one
     * upload part.
     */
    @With
    private long fileBytesLimit = 100 * MB;

    /**
     * Configuration for S3 multipart upload. Configures {@link S3MultipartUpload}.
     */
    @NonNull
    private S3MultipartUpload.Config config = S3MultipartUpload.Config.DEFAULT;

    /**
     * Creates SplitTextUnzipStrategy with provided configuration for {@link S3MultipartUpload}
     *
     * @param config Multipart upload configuration for {@link S3MultipartUpload}
     */
    public SplitTextUnzipStrategy(@NonNull S3MultipartUpload.Config config) {
        this.config = config;
    }

    /**
     * Reads the zip entry line by line and uploads it as one or more shards using multipart uploads.
     * <p>
     * The first shard is always created, even for an empty entry. Follow-up shards are only created once there is a
     * line to put into them, so no empty trailing shard is produced when the shard limit is reached on the last line.
     *
     * @param zipFile  zip entry to unzip, provides the input stream and the target bucket/prefix
     * @param s3Client S3 client used for the multipart uploads
     * @throws S3UnzipException if reading the entry or uploading a shard fails; the shard being uploaded at that
     *                          moment is aborted, shards completed earlier are left in place
     */
    @Override
    public void unzip(S3ZipFile zipFile, AmazonS3 s3Client) {
        String filename = zipFile.filename();
        long compressedSize = zipFile.compressedSize();
        long size = zipFile.size();
        String key = zipFile.key();

        log.info("Unzipping {}, compressed: {} bytes, extracted: {} bytes to {}", filename, compressedSize, size, key);

        // NOTE(review): the scanner is intentionally left open - closing it would also close the stream returned by
        // zipFile.getInputStream(), which presumably belongs to the caller. Confirm before adding try-with-resources.
        Scanner scanner = new Scanner(zipFile.getInputStream(), UTF_8);

        int fileNumber = 1;
        var s3MultipartUpload = initializeS3MultipartUpload(s3Client, zipFile, fileNumber);
        // true while s3MultipartUpload refers to an upload that has been initialized but not yet completed
        boolean uploadOpen = true;

        try {
            var outputStream = new ByteArrayOutputStream();

            long allBytesRead = 0;
            long uploadPartBytes = 0;
            long fileBytes = 0;

            String headerLine = null;

            long partNumber = 0;

            while (scanner.hasNextLine()) {
                String line = scanner.nextLine() + LINE_BREAK;

                // remember the very first line so it can be replicated into every following shard
                if (header && headerLine == null) {
                    headerLine = line;
                }

                long bytesRead = 0;

                // previous shard was completed: open the next one now that there is content for it
                if (!uploadOpen) {
                    fileNumber += 1;
                    s3MultipartUpload = initializeS3MultipartUpload(s3Client, zipFile, fileNumber);
                    uploadOpen = true;

                    if (header) {
                        bytesRead += writeLine(headerLine, outputStream);
                    }
                }

                // write line
                bytesRead += writeLine(line, outputStream);
                fileBytes += bytesRead;
                allBytesRead += bytesRead;

                // keep buffering until the part limit has been reached
                if (uploadPartBytes < uploadPartBytesLimit) {
                    uploadPartBytes += bytesRead;
                    continue;
                }

                // upload new part
                partNumber += 1;

                if (fileBytes > fileBytesLimit) {
                    // have reached file bytes limit, finalize upload of the current shard
                    log.debug("Uploading final part [{}] for file: {} and shard file number: {} - Read {} bytes out of {} bytes", partNumber, filename, fileNumber, allBytesRead, size);

                    s3MultipartUpload.uploadFinalPart(outputStream.toByteArray());
                    uploadOpen = false;

                    // log before the counters are reset so the completed shard is reported, not the next one
                    log.info("Unzipped and uploaded file: {} shard file number {} in {} parts", filename, fileNumber, partNumber);

                    partNumber = 0;
                    fileBytes = 0;
                } else {
                    log.debug("Uploading part [{}] for file: {} and shard file number: {} - Read {} bytes out of {} bytes", partNumber, filename, fileNumber, allBytesRead, size);
                    s3MultipartUpload.uploadPart(outputStream.toByteArray());
                }

                outputStream.reset();
                uploadPartBytes = 0;
            }

            // Scanner treats an IOException of the underlying stream as end of input; surface it instead of
            // completing the upload with silently truncated content
            var readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }

            // upload remaining part of output stream as final part
            if (uploadOpen) {
                s3MultipartUpload.uploadFinalPart(outputStream.toByteArray());
                uploadOpen = false;
            }
            log.info("Unzipped and uploaded file: {} sharded into {} files", filename, fileNumber);

        } catch (Throwable t) {
            // only an upload that is still in progress can be aborted
            if (uploadOpen) {
                abortKeepingCause(s3MultipartUpload, t);
            }
            throw new S3UnzipException("Failed to unzip " + filename, t);
        }
    }

    /**
     * Creates and initializes the multipart upload for the shard with the given number.
     *
     * @param s3Client   S3 client used by the upload
     * @param s3ZipFile  zip entry being unzipped, provides bucket name, output prefix and filename
     * @param fileNumber shard number, used as prefix of the uploaded file name
     * @return initialized multipart upload targeting {@code <output prefix><fileNumber>-<filename>}
     */
    private S3MultipartUpload initializeS3MultipartUpload(AmazonS3 s3Client, S3ZipFile s3ZipFile, int fileNumber) {
        String filenameWithNumber = fileNumber + "-" + s3ZipFile.filename();
        log.debug("Initializing upload for file: {}", filenameWithNumber);

        String key = s3ZipFile.getOutputPrefix() + filenameWithNumber;
        var multipartUpload = new S3MultipartUpload(s3ZipFile.getBucketName(), key, s3Client, config);
        multipartUpload.initialize();

        return multipartUpload;
    }

    /**
     * Aborts the upload; a failure to abort is attached to {@code failure} as suppressed so it cannot mask the
     * original cause.
     */
    private void abortKeepingCause(S3MultipartUpload upload, Throwable failure) {
        try {
            upload.abort();
        } catch (RuntimeException abortFailure) {
            failure.addSuppressed(abortFailure);
        }
    }

    /**
     * Appends the line, encoded as UTF-8, to the output stream.
     *
     * @param line         line to write, including its line terminator
     * @param outputStream buffer collecting the bytes of the current upload part
     * @return number of bytes written
     */
    private long writeLine(String line, ByteArrayOutputStream outputStream) {
        byte[] bytes = line.getBytes(UTF_8);
        outputStream.write(bytes, 0, bytes.length);
        return bytes.length;
    }
}