
io.github.nejckorasa.s3.unzip.strategy.SplitTextUnzipStrategy

package io.github.nejckorasa.s3.unzip.strategy;

import com.amazonaws.services.s3.AmazonS3;
import io.github.nejckorasa.s3.unzip.S3UnzipException;
import io.github.nejckorasa.s3.unzip.S3ZipFile;
import io.github.nejckorasa.s3.upload.S3MultipartUpload;
import lombok.*;
import lombok.extern.slf4j.Slf4j;

import java.io.ByteArrayOutputStream;
import java.util.Scanner;

import static com.amazonaws.services.s3.internal.Constants.MB;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Unzips and uploads a text file with splitting (sharding) - it creates a 1:n mapping between zipped and unzipped files.
 *
 * <p>It reads the file as a UTF-8 text file split into lines.
 *
 * <p>Set {@link #header} to 'true' if the zipped file contains a header that needs to be included in every split file/shard (e.g. CSV files). Defaults to false.
 *
 * <p>This strategy is suitable for larger files as it splits them into smaller, more manageable unzipped files (shards).
 *
 * <p>Utilizes multipart upload - unzipping is achieved without keeping all data in memory or writing to disk.
 */
@Slf4j
@NoArgsConstructor
@AllArgsConstructor(access = AccessLevel.PRIVATE)
public class SplitTextUnzipStrategy implements UnzipStrategy {

    public static final String LINE_BREAK = "\n";

    /**
     * S3 multipart upload part limit in bytes.
     *
     * @see S3MultipartUpload
     */
    @NonNull
    @With
    private int uploadPartBytesLimit = 20 * MB;

    /**
     * Add a header line to all files, i.e. the first line of the source zip entry will be replicated to all output files.
     */
    @With
    private boolean header = false;

    /**
     * File (shard) size limit, i.e. 100 MB will split the source zip entry into files with a size limit of 100 MB.
     */
    @NonNull
    @With
    private long fileBytesLimit = 100 * MB;

    /**
     * Configuration for S3 multipart upload. Configures {@link S3MultipartUpload}.
     */
    @NonNull
    private S3MultipartUpload.Config config = S3MultipartUpload.Config.DEFAULT;

    /**
     * Creates SplitTextUnzipStrategy with the provided configuration for {@link S3MultipartUpload}.
     *
     * @param config multipart upload configuration for {@link S3MultipartUpload}
     */
    public SplitTextUnzipStrategy(@NonNull S3MultipartUpload.Config config) {
        this.config = config;
    }

    @Override
    public void unzip(S3ZipFile zipFile, AmazonS3 s3Client) {
        String filename = zipFile.filename();
        long compressedSize = zipFile.compressedSize();
        long size = zipFile.size();
        String key = zipFile.key();

        log.info("Unzipping {}, compressed: {} bytes, extracted: {} bytes to {}", filename, compressedSize, size, key);

        int fileNumber = 1;
        Scanner scanner = new Scanner(zipFile.getInputStream(), UTF_8);
        var s3MultipartUpload = initializeS3MultipartUpload(s3Client, zipFile, fileNumber);

        try {
            var outputStream = new ByteArrayOutputStream();

            long allBytesRead = 0;
            long uploadPartBytes = 0;
            long fileBytes = 0;
            String headerLine = null;
            long partNumber = 0;
            boolean newFile = false;

            while (scanner.hasNextLine()) {
                String line = scanner.nextLine() + LINE_BREAK;

                if (header && headerLine == null) {
                    headerLine = line;
                }

                long bytesRead = 0;

                // write header line if new file
                if (header && newFile) {
                    bytesRead += writeLine(headerLine, outputStream);
                    newFile = false;
                }

                // write line
                bytesRead += writeLine(line, outputStream);

                fileBytes += bytesRead;
                allBytesRead += bytesRead;

                if (uploadPartBytes < uploadPartBytesLimit) {
                    uploadPartBytes += bytesRead;
                    continue;
                }

                // upload new part
                partNumber += 1;

                // have reached file bytes limit
                if (fileBytes > fileBytesLimit) {
                    log.debug("Uploading final part [{}] for file: {} and shard file number: {} - Read {} bytes out of {} bytes",
                            partNumber, filename, fileNumber, allBytesRead, size);

                    // finalize upload with current file
                    s3MultipartUpload.uploadFinalPart(outputStream.toByteArray());

                    partNumber = 1;
                    fileNumber += 1;
                    fileBytes = 0;
                    newFile = true;

                    log.info("Unzipped and uploaded file: {} shard file number {} in {} parts", filename, fileNumber, partNumber);

                    // initialize new multipart upload
                    s3MultipartUpload = initializeS3MultipartUpload(s3Client, zipFile, fileNumber);
                } else {
                    log.debug("Uploading part [{}] for file: {} and shard file number: {} - Read {} bytes out of {} bytes",
                            partNumber, filename, fileNumber, allBytesRead, size);
                    s3MultipartUpload.uploadPart(outputStream.toByteArray());
                }

                outputStream.reset();
                uploadPartBytes = 0;
            }

            // upload remaining part of output stream as final part
            s3MultipartUpload.uploadFinalPart(outputStream.toByteArray());
            log.info("Unzipped and uploaded file: {} sharded into {} files", filename, fileNumber);
        } catch (Throwable t) {
            s3MultipartUpload.abort();
            throw new S3UnzipException("Failed to unzip " + filename, t);
        }
    }

    private S3MultipartUpload initializeS3MultipartUpload(AmazonS3 s3Client, S3ZipFile s3ZipFile, int fileNumber) {
        String filenameWithNumber = fileNumber + "-" + s3ZipFile.filename();
        log.debug("Initializing upload for file: {}", filenameWithNumber);

        String key = s3ZipFile.getOutputPrefix() + filenameWithNumber;
        var multipartUpload = new S3MultipartUpload(s3ZipFile.getBucketName(), key, s3Client, config);
        multipartUpload.initialize();
        return multipartUpload;
    }

    private long writeLine(String line, ByteArrayOutputStream outputStream) {
        byte[] bytes = line.getBytes(UTF_8);
        int bytesRead = bytes.length;
        outputStream.write(bytes, 0, bytesRead);
        return bytesRead;
    }
}
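
For context, a minimal usage sketch follows. The S3UnzipManager entry point and the unzipObjects signature are assumptions based on the surrounding library rather than this file, and the client setup, bucket name, and prefixes are illustrative placeholders.

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import io.github.nejckorasa.s3.unzip.S3UnzipManager;
import io.github.nejckorasa.s3.unzip.strategy.SplitTextUnzipStrategy;

import static com.amazonaws.services.s3.internal.Constants.MB;

public class SplitTextUnzipExample {
    public static void main(String[] args) {
        // Default client; region and credentials come from the environment
        AmazonS3 s3Client = AmazonS3ClientBuilder.defaultClient();

        // Lombok's @With generates immutable "wither" copies of the strategy:
        // replicate the header line into every shard and cap shards at ~100 MB
        var strategy = new SplitTextUnzipStrategy()
                .withHeader(true)
                .withFileBytesLimit(100 * MB);

        // S3UnzipManager and unzipObjects(bucket, inputPrefix, outputPrefix)
        // are assumed entry points; bucket and prefixes are placeholders
        var unzipManager = new S3UnzipManager(s3Client, strategy);
        unzipManager.unzipObjects("my-bucket", "input/", "output/");
    }
}

With the defaults above, each shard of roughly 100 MB is streamed to S3 in ~20 MB multipart parts, so peak memory is bounded by the part buffer rather than by the size of the zipped or unzipped file.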




