package com.spredfast.kafka.connect.s3.sink;
import static java.util.stream.Collectors.toList;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import org.apache.kafka.connect.errors.RetriableException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.spredfast.kafka.connect.s3.json.ChunkDescriptor;
import com.spredfast.kafka.connect.s3.json.ChunksIndex;
/**
* BlockGZIPFileWriter accumulates newline-delimited UTF-8 records and writes them to an
* output file that any standard GZIP decoder can read.
*
* In fact this file is the concatenation of possibly many separate GZIP files, each corresponding to a smaller chunk
* of the input. Alongside the output filename.gz file, a filename.index.json file is written containing JSON
* metadata about the size and location of each block.
*
* This allows a reading class to skip to a particular line/record without decompressing the whole file, by looking up
* the offset of the containing block, seeking to it and beginning a GZIP read from there.
*
* This is especially useful when the file is an archive in HTTP storage like Amazon S3, where a GET request with
* range headers can pull a small segment from the overall compressed file.
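*
* For example, a reader might do something like the following. This is only a sketch: it assumes
* {@code dataFile} and the index entry {@code chunk} (a {@link ChunkDescriptor}) for the wanted record
* have already been located, and it reads from a local file rather than S3.
* <pre>{@code
* try (RandomAccessFile raf = new RandomAccessFile(dataFile, "r")) {
*   raf.seek(chunk.byte_offset);                       // jump to the start of the containing block
*   byte[] block = new byte[(int) chunk.byte_length];  // an S3 ranged GET could fetch the same bytes
*   raf.readFully(block);
*   InputStream records = new GZIPInputStream(new ByteArrayInputStream(block));
*   // scan forward from chunk.first_record_offset to the wanted record
* }
* }</pre>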
*
* Note that thanks to the GZIP spec, the overall file is perfectly valid and will decompress as if it were a single
* stream with any regular GZIP decoding library or program.
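*
* A minimal usage sketch (the base name, directory and chunk size below are illustrative):
* <pre>{@code
* BlockGZIPFileWriter w = new BlockGZIPFileWriter("topic-0", "/tmp/connect-s3", 0, 64 * 1024 * 1024);
* w.write(Collections.singletonList("one record\n".getBytes(StandardCharsets.UTF_8)), 1);
* w.close(); // finishes the last GZIP block and writes the accompanying index JSON
* }</pre>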
*/
public class BlockGZIPFileWriter implements Closeable {
private String filenameBase;
private String path;
private GZIPOutputStream gzipStream;
private CountingOutputStream fileStream;
private final ObjectMapper objectMapper = new ObjectMapper();
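// Bookkeeping for one GZIP block: where it starts in the output file, how large it is
// compressed and uncompressed, and which record offsets it covers.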
private class Chunk {
public long rawBytes = 0;
public long byteOffset = 0;
public long compressedByteLength = 0;
public long firstOffset = 0;
public long numRecords = 0;
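/**
* Maps this chunk to its entry in the index file. Assuming Jackson's default public-field
* serialization, one entry looks roughly like (values are illustrative):
* {@code {"first_record_offset": 0, "num_records": 1000, "byte_offset": 0,
* "byte_length": 12345, "byte_length_uncompressed": 65536}}
*/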
ChunkDescriptor toJson() {
ChunkDescriptor chunkObj = new ChunkDescriptor();
chunkObj.first_record_offset = firstOffset;
chunkObj.num_records = numRecords;
chunkObj.byte_offset = byteOffset;
chunkObj.byte_length = compressedByteLength;
chunkObj.byte_length_uncompressed = rawBytes;
return chunkObj;
}
}
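// Counts the bytes that actually reach the underlying file, so each chunk's compressed
// byte offset and length can be recorded in the index.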
private class CountingOutputStream extends FilterOutputStream {
private long numBytes = 0;
CountingOutputStream(OutputStream out) throws IOException {
super(out);
}
@Override
public void write(int b) throws IOException {
out.write(b);
numBytes++;
}
@Override
public void write(byte[] b) throws IOException {
out.write(b);
numBytes += b.length;
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
out.write(b, off, len);
numBytes += len;
}
public long getNumBytesWritten() {
return numBytes;
}
}
private ArrayList<Chunk> chunks;
// Default each chunk is 64MB of uncompressed data
private long chunkThreshold;
// Offset to the first record.
// Set to non-zero if this file is part of a larger stream and you want
// record offsets in the index to reflect the global offset rather than local
private long firstRecordOffset;
public BlockGZIPFileWriter(String filenameBase, String path) throws IOException {
this(filenameBase, path, 0, 67108864);
}
public BlockGZIPFileWriter(String filenameBase, String path, long firstRecordOffset) throws IOException {
this(filenameBase, path, firstRecordOffset, 67108864);
}
public BlockGZIPFileWriter(String filenameBase, String path, long firstRecordOffset, long chunkThreshold) throws IOException {
this(filenameBase, path, firstRecordOffset, chunkThreshold, new byte[0]);
}
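/**
* @param filenameBase base name used to build the .gz and .index.json file names
* @param path directory the files are written into
* @param firstRecordOffset offset of the first record, used in the file names and the index
* @param chunkThreshold approximate maximum uncompressed bytes per GZIP block; once a block would exceed this, a new block is started
* @param header optional bytes written as their own GZIP block at the start of the file, so a
* reader knows how many compressed bytes to skip before the first data chunk
*/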
public BlockGZIPFileWriter(String filenameBase, String path, long firstRecordOffset, long chunkThreshold, byte[] header)
throws IOException {
this.filenameBase = filenameBase;
this.path = path;
this.firstRecordOffset = firstRecordOffset;
this.chunkThreshold = chunkThreshold;
chunks = new ArrayList<>();
// Initialize first chunk
Chunk ch = new Chunk();
ch.firstOffset = firstRecordOffset;
chunks.add(ch);
// Explicitly truncate the file. On Linux and OS X this appears to happen
// anyway when opening with FileOutputStream, but that behavior is not actually documented
// or specified anywhere, so let's be rigorous about it.
File file = new File(getDataFilePath());
if (!file.getParentFile().exists() && !file.getParentFile().mkdirs()) {
throw new RetriableException("could not create file " + file);
}
FileOutputStream fos = new FileOutputStream(file);
fos.getChannel().truncate(0);
// Open file for writing and setup
this.fileStream = new CountingOutputStream(fos);
initChunkWriter();
if (header.length > 0) {
// if there is a header, write it as its own gzip chunk
// so we know how many bytes to skip
gzipStream.write(header);
gzipStream.finish();
gzipStream = new GZIPOutputStream(fileStream);
// the first data chunk starts after the header block, so record its byte offset now
ch.byteOffset = fileStream.getNumBytesWritten();
}
}
private void initChunkWriter() throws IOException {
gzipStream = new GZIPOutputStream(fileStream);
}
private Chunk currentChunk() {
return chunks.get(chunks.size() - 1);
}
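// File names embed the first record offset, zero-padded to 12 digits: e.g. a base of "topic-0" and
// offset 42 give "topic-0-000000000042.gz" plus "topic-0-000000000042.index.json" (example values).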
public String getDataFileName() {
return String.format("%s-%012d.gz", filenameBase, firstRecordOffset);
}
public String getIndexFileName() {
return String.format("%s-%012d.index.json", filenameBase, firstRecordOffset);
}
public String getDataFilePath() {
return String.format("%s/%s", path, this.getDataFileName());
}
public String getIndexFilePath() {
return String.format("%s/%s", path, this.getIndexFileName());
}
/**
* Appends the given serialized records to the current chunk, starting a new GZIP block first if
* appending would push the current chunk past the configured uncompressed threshold.
*
* @param toWrite the bytes to write.
* @param recordCount how many records these bytes represent.
*/
public void write(List<byte[]> toWrite, long recordCount) throws IOException {
Chunk ch = currentChunk();
int rawBytesToWrite = 0;
for (byte[] bytes : toWrite) {
rawBytesToWrite += bytes.length;
}
if ((ch.rawBytes + rawBytesToWrite) > chunkThreshold) {
finishChunk();
initChunkWriter();
Chunk newCh = new Chunk();
newCh.firstOffset = ch.firstOffset + ch.numRecords;
newCh.byteOffset = ch.byteOffset + ch.compressedByteLength;
chunks.add(newCh);
ch = newCh;
}
for (byte[] bytes : toWrite) {
gzipStream.write(bytes);
}
ch.rawBytes += rawBytesToWrite;
ch.numRecords += recordCount;
}
public void delete() {
deleteIfExists(getDataFilePath());
deleteIfExists(getIndexFilePath());
}
private void deleteIfExists(String path) {
File f = new File(path);
if (f.exists() && !f.isDirectory()) {
//noinspection ResultOfMethodCallIgnored
f.delete();
}
}
private void finishChunk() throws IOException {
Chunk ch = currentChunk();
// Complete GZIP block without closing stream
gzipStream.finish();
// We can now find out how long this chunk is once compressed
long bytesWritten = fileStream.getNumBytesWritten();
ch.compressedByteLength = bytesWritten - ch.byteOffset;
}
public void close() throws IOException {
// Finish the last chunk and record its compressed length
finishChunk();
// Now close the writer (and the whole stream stack)
gzipStream.close();
// Write the index describing every chunk
writeIndex();
}
private void writeIndex() throws IOException {
File indexFile = new File(getIndexFilePath());
if (!indexFile.getParentFile().exists() && !indexFile.getParentFile().mkdirs()) {
throw new IOException("Cannot create index " + indexFile);
}
objectMapper.writer().writeValue(indexFile, ChunksIndex.of(chunks.stream()
.map(Chunk::toJson).collect(toList())));
}
public int getTotalUncompressedSize() {
int totalBytes = 0;
for (Chunk ch : chunks) {
totalBytes += ch.rawBytes;
}
return totalBytes;
}
public int getNumChunks() {
return chunks.size();
}
public int getNumRecords() {
int totalRecords = 0;
for (Chunk ch : chunks) {
totalRecords += ch.numRecords;
}
return totalRecords;
}
}