package com.spredfast.kafka.connect.s3.source;
import static java.util.stream.Collectors.toList;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.BiPredicate;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.amazonaws.AmazonClientException;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.spredfast.kafka.connect.s3.LazyString;
import com.spredfast.kafka.connect.s3.S3RecordsReader;
import com.spredfast.kafka.connect.s3.json.ChunkDescriptor;
import com.spredfast.kafka.connect.s3.json.ChunksIndex;
/**
* Helpers for reading records out of S3. Not thread safe.
* Records should be in order since S3 lists files in lexicographic order.
* It is strongly recommended that you use a unique key prefix per topic as
* there is no option to restrict this reader by topic.
*
* NOTE: hasNext() on the returned iterators may throw AmazonClientException if there
* was a problem communicating with S3 or reading an object. Your code should
* catch AmazonClientException and implement back-off and retry as desired.
*
* Any other exception should be considered a permanent failure.
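 *
 * A minimal usage sketch (the config, client, offsets map, and reader supplier are
 * assumed to be built elsewhere; the names below are illustrative only):
 * <pre>{@code
 * S3FilesReader reader = new S3FilesReader(config, s3Client, committedOffsets, recordsReaderSupplier);
 * Iterator<S3SourceRecord> records = reader.readAll();
 * while (true) {
 *   try {
 *     if (!records.hasNext()) {
 *       break;
 *     }
 *     S3SourceRecord record = records.next();
 *     // hand the record off to the caller
 *   } catch (AmazonClientException e) {
 *     // transient S3 failure: back off, then loop to retry hasNext()
 *   }
 * }
 * }</pre>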
*/
public class S3FilesReader implements Iterable<S3SourceRecord> {
private static final Logger log = LoggerFactory.getLogger(S3FilesReader.class);
public static final Pattern DEFAULT_PATTERN = Pattern.compile(
"(\\/|^)" // match the / or the start of the key so we shouldn't have to worry about prefix
+ "(?[^/]+?)-" // assuming no / in topic names
+ "(?\\d{5})-"
+ "(?\\d{12})\\.gz$"
);
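// Example of a key the default pattern matches (illustrative key only):
//   "some/prefix/my-topic-00003-000000000042.gz" -> topic "my-topic", partition 3, starting offset 42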
private final AmazonS3 s3Client;
private final Supplier<S3RecordsReader> makeReader;
private final Map<S3Partition, S3Offset> offsets;
private final ObjectReader indexParser = new ObjectMapper().reader(ChunksIndex.class);
private final S3SourceConfig config;
public S3FilesReader(S3SourceConfig config, AmazonS3 s3Client, Map<S3Partition, S3Offset> offsets, Supplier<S3RecordsReader> recordReader) {
this.config = config;
this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new);
this.s3Client = s3Client;
this.makeReader = recordReader;
}
public Iterator<S3SourceRecord> iterator() {
return readAll();
}
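/**
 * Restricts which topic/partitions this reader will pick up. For example, a sketch
 * that keeps only even-numbered partitions of any topic:
 * <pre>{@code
 * PartitionFilter evenOnly = PartitionFilter.from((topic, partition) -> partition % 2 == 0);
 * }</pre>
 */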
public interface PartitionFilter {
// Convenience for simple filters. The reader only ever calls the two-argument version, which delegates here by default.
boolean matches(int partition);
default boolean matches(String topic, int partition) {
return matches(partition);
}
static PartitionFilter from(BiPredicate<String, Integer> filter) {
return new PartitionFilter() {
@Override
public boolean matches(int partition) {
throw new UnsupportedOperationException();
}
@Override
public boolean matches(String topic, int partition) {
return filter.test(topic, partition);
}
};
}
PartitionFilter MATCH_ALL = p -> true;
}
private static final Pattern DATA_SUFFIX = Pattern.compile("\\.gz$");
private int partition(String key) {
final Matcher matcher = config.keyPattern.matcher(key);
if (!matcher.find()) {
throw new IllegalArgumentException("Not a valid chunk filename! " + key);
}
return Integer.parseInt(matcher.group("partition"));
}
private String topic(String key) {
final Matcher matcher = config.keyPattern.matcher(key);
if (!matcher.find()) {
throw new IllegalArgumentException("Not a valid chunk filename! " + key);
}
return matcher.group("topic");
}
public Iterator<S3SourceRecord> readAll() {
return new Iterator<S3SourceRecord>() {
String currentKey;
ObjectListing objectListing;
Iterator<S3ObjectSummary> nextFile = Collections.emptyIterator();
Iterator<ConsumerRecord<byte[], byte[]>> iterator = Collections.emptyIterator();
private void nextObject() {
while (!nextFile.hasNext() && hasMoreObjects()) {
// partitions will be read completely for each prefix (e.g., a day) in order,
// i.e., all of partition 0 will be read before partition 1. That can hurt throughput if
// there is an active, multi-partition consumer on the other end.
// To mitigate that, run as many tasks as there are partitions.
if (objectListing == null) {
objectListing = s3Client.listObjects(new ListObjectsRequest(
config.bucket,
config.keyPrefix,
config.startMarker,
null,
// we have to filter out chunk indexes on this end, so
// whatever the requested page size is, we'll need twice that
config.pageSize * 2
));
log.debug("aws ls {}/{} after:{} = {}", config.bucket, config.keyPrefix, config.startMarker,
LazyString.of(() -> objectListing.getObjectSummaries().stream().map(S3ObjectSummary::getKey).collect(toList())));
} else {
String marker = objectListing.getNextMarker();
objectListing = s3Client.listNextBatchOfObjects(objectListing);
log.debug("aws ls {}/{} after:{} = {}", config.bucket, config.keyPrefix, marker,
LazyString.of(() -> objectListing.getObjectSummaries().stream().map(S3ObjectSummary::getKey).collect(toList())));
}
List<S3ObjectSummary> chunks = new ArrayList<>(objectListing.getObjectSummaries().size() / 2);
for (S3ObjectSummary chunk : objectListing.getObjectSummaries()) {
if (DATA_SUFFIX.matcher(chunk.getKey()).find() && parseKeyUnchecked(chunk.getKey(),
(t, p, o) -> config.partitionFilter.matches(t, p))) {
S3Offset offset = offset(chunk);
if (offset != null) {
// if our offset for this partition is beyond this chunk, ignore it
// this relies on filename lexicographic order being correct
if (offset.getS3key().compareTo(chunk.getKey()) > 0) {
log.debug("Skipping {} because < current offset of {}", chunk.getKey(), offset);
continue;
}
}
chunks.add(chunk);
}
}
log.debug("Next Chunks: {}", LazyString.of(() -> chunks.stream().map(S3ObjectSummary::getKey).collect(toList())));
nextFile = chunks.iterator();
}
if (!nextFile.hasNext()) {
iterator = Collections.emptyIterator();
return;
}
try {
S3ObjectSummary file = nextFile.next();
currentKey = file.getKey();
S3Offset offset = offset(file);
if (offset != null && offset.getS3key().equals(currentKey)) {
resumeFromOffset(offset);
} else {
log.debug("Now reading from {}", currentKey);
S3RecordsReader reader = makeReader.get();
InputStream content = getContent(s3Client.getObject(config.bucket, currentKey));
iterator = parseKey(currentKey, (topic, partition, startOffset) -> {
reader.init(topic, partition, content, startOffset);
return reader.readAll(topic, partition, content, startOffset);
});
}
} catch (IOException e) {
throw new AmazonClientException(e);
}
}
private InputStream getContent(S3Object object) throws IOException {
return config.inputFilter.filter(object.getObjectContent());
}
private S3Offset offset(S3ObjectSummary chunk) {
return offsets.get(S3Partition.from(config.bucket, config.keyPrefix, topic(chunk.getKey()), partition(chunk.getKey())));
}
/**
* If we have a non-null offset to resume from, then our marker is the current file, not the next file,
* so we need to load the marker and find the offset to start from.
*/
private void resumeFromOffset(S3Offset offset) throws IOException {
log.debug("resumeFromOffset {}", offset);
S3RecordsReader reader = makeReader.get();
ChunksIndex index = getChunksIndex(offset.getS3key());
ChunkDescriptor chunkDescriptor = index.chunkContaining(offset.getOffset() + 1)
.orElse(null);
if (chunkDescriptor == null) {
log.warn("Missing chunk descriptor for requested offset {} (max:{}). Moving on to next file.",
offset, index.lastOffset());
// it's possible we were at the end of this file,
// so move on to the next one
nextObject();
return;
}
// if we got here, it is a real object and contains
// the offset we want to start at
// if the reader needs the start of the file to initialize, let it read it
if (reader.isInitRequired() && chunkDescriptor.byte_offset > 0) {
try (S3Object object = s3Client.getObject(new GetObjectRequest(config.bucket, offset.getS3key()))) {
parseKey(object.getKey(), (topic, partition, startOffset) -> {
reader.init(topic, partition, getContent(object), startOffset);
return null;
});
}
}
GetObjectRequest request = new GetObjectRequest(config.bucket, offset.getS3key());
request.setRange(chunkDescriptor.byte_offset, index.totalSize());
S3Object object = s3Client.getObject(request);
currentKey = object.getKey();
log.debug("Resume {}: Now reading from {}, reading {}-{}", offset, currentKey, chunkDescriptor.byte_offset, index.totalSize());
iterator = parseKey(object.getKey(), (topic, partition, startOffset) ->
reader.readAll(topic, partition, getContent(object), chunkDescriptor.first_record_offset));
// skip records before the given offset
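// e.g., with a committed offset of 10 and a chunk whose first record offset is 8,
// skip 10 - 8 + 1 = 3 records (offsets 8, 9, 10) so the next record returned is offset 11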
long recordSkipCount = offset.getOffset() - chunkDescriptor.first_record_offset + 1;
for (int i = 0; i < recordSkipCount; i++) {
iterator.next();
}
}
@Override
public boolean hasNext() {
while (!iterator.hasNext() && hasMoreObjects()) {
nextObject();
}
return iterator.hasNext();
}
boolean hasMoreObjects() {
return objectListing == null || objectListing.isTruncated() || nextFile.hasNext();
}
@Override
public S3SourceRecord next() {
ConsumerRecord<byte[], byte[]> record = iterator.next();
return new S3SourceRecord(
S3Partition.from(config.bucket, config.keyPrefix, record.topic(), record.partition()),
S3Offset.from(currentKey, record.offset()),
record.topic(),
record.partition(),
record.key(),
record.value()
);
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
private <T> T parseKeyUnchecked(String key, QuietKeyConsumer<T> consumer) {
try {
return parseKey(key, consumer::consume);
} catch (IOException never) {
throw new RuntimeException(never);
}
}
private <T> T parseKey(String key, KeyConsumer<T> consumer) throws IOException {
final Matcher matcher = config.keyPattern.matcher(key);
if (!matcher.find()) {
throw new IllegalArgumentException("Not a valid chunk filename! " + key);
}
final String topic = matcher.group("topic");
final int partition = Integer.parseInt(matcher.group("partition"));
final long startOffset = Long.parseLong(matcher.group("offset"));
return consumer.consume(topic, partition, startOffset);
}
private interface QuietKeyConsumer<T> {
T consume(String topic, int partition, long startOffset);
}
private interface KeyConsumer<T> {
T consume(String topic, int partition, long startOffset) throws IOException;
}
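// The chunk index lives next to the data file, named by swapping the ".gz" suffix for ".index.json"
// (e.g. "my-topic-00000-000000000000.gz" -> "my-topic-00000-000000000000.index.json").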
private ChunksIndex getChunksIndex(String key) throws IOException {
return indexParser.readValue(new InputStreamReader(s3Client.getObject(config.bucket, DATA_SUFFIX.matcher(key)
.replaceAll(".index.json")).getObjectContent()));
}
/**
* Filtering applied to the S3InputStream. Will almost always start
* with GUNZIP, but could also include things like decryption.
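 *
 * A composition sketch (hypothetical; assumes a javax.crypto.Cipher configured elsewhere,
 * and that objects were gzipped first and then encrypted):
 * <pre>{@code
 * InputFilter decryptThenGunzip = in -> InputFilter.GUNZIP.filter(new javax.crypto.CipherInputStream(in, cipher));
 * }</pre>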
*/
public interface InputFilter {
InputStream filter(InputStream inputStream) throws IOException;
InputFilter GUNZIP = GZIPInputStream::new;
}
}