All downloads are free. Search and download functionality uses the official Maven repository.

io.descoped.rawdata.avro.AvroRawdataProducer Maven / Gradle / Ivy

The newest version!
package io.descoped.rawdata.avro;

import de.huxhorn.sulky.ulid.ULID;
import io.descoped.rawdata.api.RawdataClosedException;
import io.descoped.rawdata.api.RawdataMessage;
import io.descoped.rawdata.api.RawdataProducer;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.file.SeekableFileInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

import static java.util.Optional.ofNullable;

class AvroRawdataProducer implements RawdataProducer {

    static final Logger LOG = LoggerFactory.getLogger(AvroRawdataProducer.class);

    static final Schema schema = SchemaBuilder.record("RawdataMessage")
            .fields()
            .name("id").type().fixed("ulid").size(16).noDefault()
            .name("orderingGroup").type().nullable().stringType().noDefault()
            .name("sequenceNumber").type().longType().longDefault(0)
            .name("position").type().stringType().noDefault()
            .name("data").type().map().values().bytesType().noDefault()
            .endRecord();

    final AtomicBoolean closed = new AtomicBoolean(false);

    final ULID ulid = new ULID();
    final AtomicReference prevUlid = new AtomicReference<>(ulid.nextValue());

    final AvroRawdataUtils gcsRawdataUtils;
    final Path tmpFolder;
    final long avroMaxSeconds;
    final long avroMaxBytes;
    final int avroSyncInterval;
    final String topic;

    final AtomicReference> dataFileWriterRef = new AtomicReference<>();
    final Path topicFolder;
    final AtomicReference pathRef = new AtomicReference<>();

    final AtomicLong timestampOfFirstMessageInWindow = new AtomicLong(-1);
    final AvroFileMetadata activeAvrofileMetadata;
    final AtomicLong avroBytesWrittenInBlock = new AtomicLong(0);

    final ReentrantLock lock = new ReentrantLock();

    final Thread uploadThread;
    final BlockingQueue uploadQueue = new LinkedBlockingQueue<>();

    static class Upload {
        final Path source;
        final RawdataAvroFile target;

        Upload(Path source, RawdataAvroFile target) {
            this.source = source;
            this.target = target;
        }
    }

    AvroRawdataProducer(AvroRawdataUtils gcsRawdataUtils, Path tmpFolder, long avroMaxSeconds, long avroMaxBytes, int avroSyncInterval, String topic) {
        this.gcsRawdataUtils = gcsRawdataUtils;
        this.tmpFolder = tmpFolder;
        this.avroMaxSeconds = avroMaxSeconds;
        this.avroMaxBytes = avroMaxBytes;
        this.avroSyncInterval = avroSyncInterval;
        this.topic = topic;
        this.activeAvrofileMetadata = gcsRawdataUtils.newAvrofileMetadata();
        this.topicFolder = tmpFolder.resolve(topic);
        try {
            Files.createDirectories(topicFolder);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        createOrOverwriteLocalAvroFile();
        this.uploadThread = new Thread(() -> {
            for (; ; ) {
                final Upload upload;
                try {
                    upload = uploadQueue.take(); // wait for upload task
                } catch (InterruptedException e) {
                    LOG.warn("Closing producer topic {}", topic);
                    close();
                    LOG.warn("Upload thread interrupted. Upload thread for producer of topic {} will now die.", topic);
                    return;
                }
                try {
                    if (upload.source == null) {
                        LOG.info("Upload thread for producer of topic {} received close signal and will now die.", topic);
                        return;
                    }
                    verifySeekableToLastBlockOffsetAsGivenByFilename(upload.source, upload.target.getOffsetOfLastBlock());
                    String fileSize = AvroRawdataUtils.humanReadableByteCount(upload.source.toFile().length(), false);
                    LOG.info("Copying Avro file {} ({}) to target: {}", upload.source.getFileName(), fileSize, upload.target);
                    upload.target.copyFrom(upload.source);
                    Files.delete(upload.source);
                    LOG.info("Copy COMPLETE! Deleted Avro file {}", upload.source.getFileName());
                } catch (Throwable t) {
                    LOG.error(String.format("While uploading file %s to target %s", upload.source.getFileName(), upload.target), t);
                    LOG.warn("Closing producer topic {}", topic);
                    close();
                    LOG.warn("Upload thread for producer of topic {} will now die.", topic);
                    return;
                }
            }
        });
        this.uploadThread.start();
    }

    private void createOrOverwriteLocalAvroFile() {
        try {
            if (!lock.tryLock(5, TimeUnit.MINUTES)) {
                throw new IllegalStateException("Unable to acquire lock within 5 minutes");
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        try {
            Path path = Files.createTempFile(topicFolder, "", ".avro");
            pathRef.set(path);
            activeAvrofileMetadata.clear();
            DatumWriter datumWriter = new GenericDatumWriter<>(schema);
            DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter);
            dataFileWriter.setSyncInterval(2 * avroSyncInterval);
            dataFileWriter.setFlushOnEveryBlock(true);
            dataFileWriterRef.set(dataFileWriter);
            dataFileWriter.create(schema, path.toFile());
            long lastSyncPosition = dataFileWriter.sync(); // position of first block
            activeAvrofileMetadata.setSyncOfLastBlock(lastSyncPosition);
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            lock.unlock();
        }
    }

    private void closeAvroFileAndTriggerAsyncUploadToGCS() {
        try {
            if (!lock.tryLock(5, TimeUnit.MINUTES)) {
                throw new IllegalStateException("Unable to acquire lock within 5 minutes");
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        try {
            DataFileWriter dataFileWriter = dataFileWriterRef.getAndSet(null);
            if (dataFileWriter != null) {
                dataFileWriter.flush();
                dataFileWriter.close();
            }
            Path path = pathRef.get();
            if (path != null) {
                if (activeAvrofileMetadata.getCount() > 0) {
                    RawdataAvroFile rawdataAvroFile = activeAvrofileMetadata.toRawdataAvroFile(topic);
                    uploadQueue.add(new Upload(path, rawdataAvroFile)); // schedule upload asynchronously
                } else {
                    // no records, no need to write file to GCS
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            lock.unlock();
        }
    }

    static void verifySeekableToLastBlockOffsetAsGivenByFilename(Path path, long offsetOfLastBlock) throws IOException {
        DatumReader datumReader = new GenericDatumReader<>(AvroRawdataProducer.schema);
        try (DataFileReader dataFileReader = new DataFileReader<>(new SeekableFileInput(path.toFile()), datumReader)) {
            dataFileReader.seek(offsetOfLastBlock);
            dataFileReader.hasNext(); // will throw an exception if offset is wrong
        }
    }

    @Override
    public String topic() {
        return topic;
    }

    @Override
    public void publish(RawdataMessage... messages) throws RawdataClosedException {
        if (isClosed()) {
            throw new RawdataClosedException();
        }
        try {
            if (!lock.tryLock(5, TimeUnit.MINUTES)) {
                throw new IllegalStateException("Unable to acquire lock within 5 minutes");
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        try {
            for (RawdataMessage message : messages) {
                long now = System.currentTimeMillis();
                timestampOfFirstMessageInWindow.compareAndSet(-1, now);

                boolean timeLimitExceeded = timestampOfFirstMessageInWindow.get() + 1000 * avroMaxSeconds < now;
                if (timeLimitExceeded) {
                    closeAvroFileAndTriggerAsyncUploadToGCS();
                    createOrOverwriteLocalAvroFile();
                    timestampOfFirstMessageInWindow.set(now);
                }

                ULID.Value ulidValue = message.ulid();
                if (ulidValue == null) {
                    ulidValue = RawdataProducer.nextMonotonicUlid(ulid, prevUlid.get());
                }
                prevUlid.set(ulidValue);

                activeAvrofileMetadata.setIdOfFirstRecord(ulidValue);
                activeAvrofileMetadata.setPositionOfFirstRecord(message.position());

                try {
                    GenericRecord record = new GenericData.Record(schema);
                    record.put("id", new GenericData.Fixed(schema.getField("id").schema(), ulidValue.toBytes()));
                    record.put("orderingGroup", message.orderingGroup());
                    record.put("sequenceNumber", message.sequenceNumber());
                    record.put("position", message.position());
                    record.put("data", message.data().entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> ByteBuffer.wrap(e.getValue()))));

                    if (avroBytesWrittenInBlock.get() >= avroSyncInterval) {
                        // start new block in avro file
                        long lastSyncPosition = dataFileWriterRef.get().sync();
                        activeAvrofileMetadata.setSyncOfLastBlock(lastSyncPosition);
                        avroBytesWrittenInBlock.set(0);
                    }
                    dataFileWriterRef.get().append(record);
                    activeAvrofileMetadata.incrementCounter(1);
                    avroBytesWrittenInBlock.addAndGet(estimateAvroSizeOfRawdataMessage(message));
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }

                boolean sizeLimitExceeded = pathRef.get().toFile().length() > avroMaxBytes;
                if (sizeLimitExceeded) {
                    closeAvroFileAndTriggerAsyncUploadToGCS();
                    createOrOverwriteLocalAvroFile();
                }
            }
        } finally {
            lock.unlock();
        }
    }

    static long estimateAvroSizeOfRawdataMessage(RawdataMessage message) {
        return 16 + // ulid
                2 + ofNullable(message.orderingGroup()).map(String::length).orElse(0) + // orderingGroup
                6 + // sequenceNumber
                2 + message.position().length() // position
                + message.data().entrySet().stream()
                .map(e -> 2L + e.getKey().length() + 4 + e.getValue().length)
                .reduce(0L, Long::sum);
    }

    @Override
    public CompletableFuture publishAsync(RawdataMessage... messages) {
        if (isClosed()) {
            throw new RawdataClosedException();
        }
        return CompletableFuture.runAsync(() -> publish(messages));
    }

    @Override
    public boolean isClosed() {
        return closed.get();
    }

    @Override
    public void close() {
        if (closed.compareAndSet(false, true)) {
            closeAvroFileAndTriggerAsyncUploadToGCS();
            uploadQueue.add(new Upload(null, null)); // send close signal to upload-thread.
        }
        try {
            // all callers must wait for all uploads to complete
            uploadThread.join();
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy