
io.github.cloudchacho.Firehose Maven / Gradle / Ivy

package io.github.cloudchacho;

import io.github.cloudchacho.hedwig.Container;
import io.github.cloudchacho.hedwig.Options;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.protobuf.*;
import com.google.protobuf.util.JsonFormat;
import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.*;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.transforms.windowing.*;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * This pipeline ingests incoming data from a Cloud Pub/Sub topic and
 * outputs the proto JSON data into windowed files at the specified output
 * directory.
 *
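 * Files are grouped per source project and topic and partitioned by window date;
 * with GZIP compression a finished file lands at a path shaped like (illustrative,
 * derived from DatePartitionedName below):
 *
 *   [outputDirectory]/[projectId]/[topic]/yyyy/MM/dd/[topic]-HH:mm:ss-HH:mm:ss-[shard]-of-[numShards].gz
 *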
 * Example Usage:
 *
 * <pre>
 * mvn compile exec:java \
 *   -Dexec.mainClass=[MAIN CLASS] \
 *   -Dexec.cleanupDaemonThreads=false \
 *   -Dexec.args=" \
 *     --project=${PROJECT_ID} \
 *     --stagingLocation=gs://${PROJECT_ID}/dataflow/pipelines/${PIPELINE_FOLDER}/staging \
 *     --tempLocation=gs://${PROJECT_ID}/dataflow/pipelines/${PIPELINE_FOLDER}/temp \
 *     --runner=DataflowRunner \
 *     --windowDuration=2m \
 *     --numShards=1 \
 *     --inputSubscriptions=hedwig-firehose-${TOPIC_ID} \
 *     --inputSubscriptionsCrossProject=hedwig-firehose-${PROJECT_ID}-${TOPIC_ID};${PROJECT_ID} \
 *     --userTempLocation=gs://${PROJECT_ID}/tmp/ \
 *     --outputDirectory=gs://${PROJECT_ID}/output/ \
 *     --region=${REGION} \
 *     --zone=${ZONE} \
 *     --workerLogLevelOverrides='{\"io.github.cloudchacho.Firehose\":\"DEBUG\"}'"
 * </pre>
 */
public class Firehose {
    private static class DatePartitionedName implements FileIO.Write.FileNaming {
        private static final DateTimeFormatter DATE_FORMAT = DateTimeFormat.forPattern("yyyy/MM/dd");
        private static final DateTimeFormatter TIME_FORMAT = DateTimeFormat.forPattern("HH:mm:ss");

        // ProtobufDecoder sets the group id as <projectId>/<topic>
        private static final Pattern FILENAME_PATTERN = Pattern.compile("([^/]+)/(.+)");

        private final String projectId;
        private final String topic;

        DatePartitionedName(String name) {
            Matcher matcher = FILENAME_PATTERN.matcher(name);
            if (!matcher.matches()) {
                throw new RuntimeException(String.format(
                        "filename: %s doesn't match pattern: %s", name, FILENAME_PATTERN.toString()));
            }
            projectId = matcher.group(1);
            topic = matcher.group(2);
        }

        @Override
        public String getFilename(
                BoundedWindow window, PaneInfo pane, int numShards, int shardIndex, Compression compression) {
            IntervalWindow intervalWindow = (IntervalWindow) window;
            return String.format(
                    "%s/%s/%s/%s-%s-%s-%s-of-%s%s",
                    projectId,
                    topic,
                    DATE_FORMAT.print(intervalWindow.start()),
                    topic,
                    TIME_FORMAT.print(intervalWindow.start()),
                    TIME_FORMAT.print(intervalWindow.end()),
                    shardIndex,
                    numShards,
                    compression.getSuggestedSuffix());
        }
    }

    private static final Logger LOG = LoggerFactory.getLogger(Firehose.class);

    // reverse of https://github.com/google/gson/blob/3958b1f78dc2b12da3ffd7426d3fc90550d46758/gson/src/main/java/com/google/gson/stream/JsonWriter.java#L157
    // (JsonWriter escapes the apostrophe as \u0027, so the reverse maps it back to ')
    private static final Map<String, String> HTML_SAFE_REPLACEMENT_CHARS = Map.of(
            "\\\\u003c", "<",
            "\\\\u003e", ">",
            "\\\\u0026", "&",
            "\\\\u003d", "=",
            "\\\\u0027", "'"
    );

    private static final String unknownMessageGroupId = "unknown";
    private static final Pattern schemaPattern = Pattern.compile("^(.*)/(\\d*)\\.(\\d*)$");

    // names look like: hedwig-firehose-user-created-v1
    private static final Pattern SUBSCRIPTION_REGEXP = Pattern.compile("hedwig-firehose-(.+)");

    // names look like: hedwig-firehose-other-project-user-created-v1;other-project
    private static final Pattern SUBSCRIPTION_CROSS_PROJECT_REGEXP =
            Pattern.compile("hedwig-firehose-(.+)-(.+);\\1");

    private static final Map<Integer, Message> emptyMap = new HashMap<>();
    private static final Base64.Encoder encoder = Base64.getEncoder();
    private static final ObjectMapper mapper = new ObjectMapper();

    private static class ProtobufDecoder extends SimpleFunction<PubsubMessage, KV<String, String>> {
        private final ValueProvider<String> projectId;
        private final ValueProvider<String> schemaFileDescriptorSetFile;
        transient private static JsonFormat.TypeRegistry typeRegistry;
        transient private static final Map<String, Map<Integer, Message>> schemaClasses = new HashMap<>();

        private ProtobufDecoder(String projectId, ValueProvider<String> schemaFileDescriptorSetFile) {
            this.projectId = ValueProvider.StaticValueProvider.of(projectId);
            this.schemaFileDescriptorSetFile = schemaFileDescriptorSetFile;
        }

        private ProtobufDecoder(ValueProvider<String> projectId, ValueProvider<String> schemaFileDescriptorSet) {
            this.projectId = projectId;
            this.schemaFileDescriptorSetFile = schemaFileDescriptorSet;
        }

        private void ensureSchema() {
            if (typeRegistry != null) {
                return;
            }
            LOG.debug("schema not read yet, reading now");
            JsonFormat.TypeRegistry.Builder builder = JsonFormat.TypeRegistry.newBuilder()
                    .add(Value.getDescriptor());
            ExtensionRegistry extensionRegistry = ExtensionRegistry.newInstance();
            Options.registerAllExtensions(extensionRegistry);
            int messagesCount = 0;
            try {
                String file = schemaFileDescriptorSetFile.get();
                if (file == null) {
                    LOG.error("schemaFileDescriptorSetFile can't be null");
                    throw new IllegalArgumentException("schemaFileDescriptorSetFile can't be null");
                }
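                // Read the compiled FileDescriptorSet (e.g. the output of
                // `protoc --descriptor_set_out`) through Beam's FileSystems
                // abstraction, so gs:// paths resolve on Dataflow workers.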
                ResourceId resourceId = FileSystems.matchNewResource(file, false);
                ReadableByteChannel channel = FileSystems.open(resourceId);
                InputStream stream = Channels.newInputStream(channel);
                byte[] fileDescriptorSetBytes = stream.readAllBytes();
                DescriptorProtos.FileDescriptorSet fileDescriptorSet =
                        DescriptorProtos.FileDescriptorSet.parseFrom(fileDescriptorSetBytes, extensionRegistry);
                List<Descriptors.FileDescriptor> dependencies = new ArrayList<>();
                for (DescriptorProtos.FileDescriptorProto fileDescriptorProto : fileDescriptorSet.getFileList()) {
                    Descriptors.FileDescriptor fileDescriptor = Descriptors.FileDescriptor.buildFrom(
                            fileDescriptorProto, dependencies.toArray(new Descriptors.FileDescriptor[0]), false);
                    dependencies.add(fileDescriptor);
                    for (Descriptors.Descriptor messageDescriptor : fileDescriptor.getMessageTypes()) {
                        // filter to only Hedwig messages
                        if (messageDescriptor.getOptions().hasExtension(Options.messageOptions)) {
                            DynamicMessage.Builder msg = DynamicMessage.newBuilder(messageDescriptor);
                            builder.add(messageDescriptor);
                            Options.MessageOptions msgOptions =
                                    messageDescriptor.getOptions().getExtension(Options.messageOptions);
                            schemaClasses.putIfAbsent(msgOptions.getMessageType(), new HashMap<>());
                            schemaClasses.get(msgOptions.getMessageType())
                                    .put(msgOptions.getMajorVersion(), msg.getDefaultInstanceForType());
                            ++messagesCount;
                        }
                    }
                }
                typeRegistry = builder.build();
            } catch (IOException | Descriptors.DescriptorValidationException e) {
                String msg = String.format("Unable to read schemaFileDescriptorSet at %s",
                        schemaFileDescriptorSetFile.get());
                LOG.error(msg, e);
                throw new IllegalArgumentException(msg);
            }
            LOG.debug(String.format("Read %d Hedwig message types from schema", messagesCount));
        }
        /**
         * Pack an input message into container format and serialize for appropriate output into files. If the
         * message type is unknown, the data is packed with an unknown type, and thus may not decode unless you
         * know the correct type.
         *
         * @param data If the message could be decoded, this must be set to the Message object.
         * @param groupId Group id for this message. If null, defaults to unknownMessageGroupId.
         * @param input The input message
         * @return A tuple of group id and serialized message
         */
        private KV<String, String> packInContainer(Message data, String groupId, PubsubMessage input) {
            if (data == null) {
                // Type of data is unknown, so encode as base64
                Value.Builder dataBuilder = Value.newBuilder();
                dataBuilder.setStringValue(encoder.encodeToString(input.getPayload()));
                data = dataBuilder.build();
            }
            if (groupId == null) {
                groupId = unknownMessageGroupId;
            }
            Container.PayloadV1 msg = null;
            if (input.getAttributeMap() != null) {
                Map<String, String> attributes = new HashMap<>(input.getAttributeMap());
                Container.MetadataV1.Builder metadataBuilder = Container.MetadataV1.newBuilder();
                String value = attributes.remove("hedwig_publisher");
                if (value != null) {
                    metadataBuilder.setPublisher(value);
                }
                value = attributes.remove("hedwig_message_timestamp");
                if (value != null) {
                    try {
                        long millis = Long.parseLong(value);
                        Timestamp timestamp = Timestamp.newBuilder().setSeconds(millis / 1000)
                                .setNanos((int) ((millis % 1000) * 1000000)).build();
                        metadataBuilder.setTimestamp(timestamp);
                    } catch (NumberFormatException ignored) {}
                }
                Container.PayloadV1.Builder builder = Container.PayloadV1.newBuilder();
                value = attributes.remove("hedwig_format_version");
                if (value != null) {
                    builder.setFormatVersion(value);
                }
                value = attributes.remove("hedwig_id");
                if (value != null) {
                    builder.setId(value);
                }
                value = attributes.remove("hedwig_schema");
                if (value != null) {
                    builder.setSchema(value);
                }
                // any attributes left over are carried along as headers
                metadataBuilder.putAllHeaders(attributes);
                builder.setMetadata(metadataBuilder);
                builder.setData(Any.pack(data));
                msg = builder.build();
            }
            String output;
            if (msg != null) {
                try {
                    output = JsonFormat.printer()
                            .omittingInsignificantWhitespace()
                            .preservingProtoFieldNames()
                            .usingTypeRegistry(typeRegistry)
                            .print(msg);
                    // XXX: workaround for https://github.com/protocolbuffers/protobuf/issues/7273
                    for (Map.Entry<String, String> replacement : HTML_SAFE_REPLACEMENT_CHARS.entrySet()) {
                        output = output.replaceAll(replacement.getKey(), replacement.getValue());
                    }
                } catch (InvalidProtocolBufferException e) {
                    // should never happen?
                    LOG.warn("Failed to convert to JSON for: {}", groupId);
                    if (data instanceof Value) {
                        // data is already an unknown value, serialization still failed:
                        // try with plain JSON serializer
                        try {
                            output = mapper.writeValueAsString(msg);
                        } catch (JsonProcessingException jsonProcessingException) {
                            // still failed! booo..
                            // last fallback, hand-written JSON:
                            output = String.format("{"
                                            + "\"pubsub_message_id\":\"%s\","
                                            + "\"message_type\":\"%s\","
                                            + "\"payload\":\"%s\","
                                            + "\"error\":\"Failed to serialize\""
                                            + "}",
                                    input.getMessageId(),
                                    groupId,
                                    encoder.encodeToString(input.getPayload())
                            );
                        }
                    } else {
                        return this.packInContainer(null, groupId, input);
                    }
                }
            } else {
                output = String.format("{"
                                + "\"pubsub_message_id\":\"%s\","
                                + "\"message_type\":\"%s\","
                                + "\"payload\":\"%s\","
                                + "\"error\":\"No attributes found, can't decode data\""
                                + "}",
                        input.getMessageId(),
                        groupId,
                        encoder.encodeToString(input.getPayload())
                );
            }
            return KV.of(String.format("%s/%s", projectId.get(), groupId), output);
        }

        private KV<String, String> packInContainer(PubsubMessage input) {
            return this.packInContainer(null, null, input);
        }

        private KV<String, String> packInContainer(String groupId, PubsubMessage input) {
            return this.packInContainer(null, groupId, input);
        }

        private String groupId(String messageType, int majorVersion) {
            // this value is also used as file prefix later in the pipeline
            return String
                    .format("%s-v%s", messageType, majorVersion)
                    .replaceAll("[._]", "-");
        }

        @Override
        public KV<String, String> apply(PubsubMessage input) {
            // assume that transport message attributes are in use;
            // the required attributes must already be set, otherwise decoding fails
            ensureSchema();
            String schema = input.getAttribute("hedwig_schema");
            if (schema == null || schema.equals("")) {
                LOG.warn("No schema found, fallback to binary encoding with unknown type");
                return this.packInContainer(input);
            }
            Matcher matcher = schemaPattern.matcher(schema);
            if (!matcher.matches()) {
                LOG.warn("Invalid schema found: {}, fallback to binary encoding with unknown type", schema);
                return this.packInContainer(input);
            }
            String messageType = matcher.group(1);
            int majorVersion = Integer.parseInt(matcher.group(2));
            int minorVersion = Integer.parseInt(matcher.group(3));
            String groupId = this.groupId(messageType, majorVersion);
            Message protoMessage = schemaClasses.getOrDefault(messageType, emptyMap).get(majorVersion);
            if (protoMessage == null) {
                LOG.warn(
                        "Proto message not found in map: {} v{}, fallback to binary encoding",
                        messageType, majorVersion);
                return this.packInContainer(groupId, input);
            }
            Message msg;
            try {
                msg = protoMessage.getParserForType().parseFrom(input.getPayload());
            } catch (InvalidProtocolBufferException e) {
                LOG.warn("Failed to parse payload for: {} v{}", messageType, majorVersion);
                return this.packInContainer(groupId, input);
            }
            Options.MessageOptions msgOptions =
                    msg.getDescriptorForType().getOptions().getExtension(Options.messageOptions);
            if (msgOptions.getMinorVersion() < minorVersion) {
                LOG.warn(
                        "Known proto class but unknown minor version: {} v{}.{}",
                        messageType, majorVersion, minorVersion);
                // encode without decoding payload because we don't want to lose
                // unknown fields added by minor versions
                return this.packInContainer(groupId, input);
            }
            KV<String, String> output = this.packInContainer(msg, groupId, input);
            LOG.debug("Successfully parsed and encoded as JSON: {} v{}", messageType, majorVersion);
            return output;
        }
    }

    /**
     * Runs the pipeline with the supplied options.
     *
     * @param options runtime options
     */
    private static PipelineResult run(RuntimeOptions options) {
        // Create the pipeline
        Pipeline pipeline = Pipeline.create(options);

        /*
         * Steps:
         *  1) For each subscription:
         *     a) Read proto messages from PubSub
         *     b) Decode proto messages and re-encode in human readable format
         *  2) Window the messages into minute intervals specified by the executor.
         *  3) Output the windowed files to GCS
         */
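        // Each subscription below becomes its own PCollection of
        // (<projectId>/<topic>, json) pairs; all of them are flattened into a
        // single stream before windowing and file writes.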
        // Read PubSub subscriptions and map messages with group id
        List<PCollection<KV<String, String>>> pCollections = new ArrayList<>();
        for (String subName : options.getInputSubscriptions()) {
            Matcher matcher = SUBSCRIPTION_REGEXP.matcher(subName);
            if (!matcher.matches()) {
                throw new RuntimeException(String.format(
                        "Invalid subscription: %s, should have matched: %s%n", subName, SUBSCRIPTION_REGEXP));
            }
            String topicName = matcher.group(1);
            String subId = String.format(
                    "projects/%s/subscriptions/hedwig-firehose-%s", options.getProject(), topicName);
            PCollection<KV<String, String>> messages = pipeline
                    .apply("Read " + topicName,
                            PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subId))
                    .apply("Decode " + topicName,
                            MapElements.via(new ProtobufDecoder(
                                    options.getProject(), options.getSchemaFileDescriptorSetFile())));
            pCollections.add(messages);
        }

        // Read cross-project PubSub subscriptions and map messages with group id
        for (String subIdCrossProject : options.getInputSubscriptionsCrossProject()) {
            Matcher matcher = SUBSCRIPTION_CROSS_PROJECT_REGEXP.matcher(subIdCrossProject);
            if (!matcher.matches()) {
                throw new RuntimeException(String.format(
                        "Invalid subscription: %s, should have matched: %s%n",
                        subIdCrossProject, SUBSCRIPTION_CROSS_PROJECT_REGEXP));
            }
            String otherProjectId = matcher.group(1);
            String topicName = matcher.group(2);
            String subId = String.format(
                    "projects/%s/subscriptions/hedwig-firehose-%s-%s",
                    options.getProject(), otherProjectId, topicName);
            String subDisplayName = String.format("%s-%s", otherProjectId, topicName);
            PCollection<KV<String, String>> messages = pipeline
                    .apply("Read " + subDisplayName,
                            PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subId))
                    .apply("Decode " + subDisplayName,
                            MapElements.via(new ProtobufDecoder(
                                    otherProjectId, options.getSchemaFileDescriptorSetFile())));
            pCollections.add(messages);
        }

        PCollectionList.of(pCollections)
                .apply("Flatten", Flatten.pCollections())
                .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
                .apply(
                        options.getWindowDuration() + " Window",
                        Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))

                // Apply windowed file writes. Use a NestedValueProvider because the filename
                // policy requires a resourceId generated from the input value at runtime.
                .apply(
                        "Write File(s)",
                        FileIO.<String, KV<String, String>>writeDynamic()
                                .withDestinationCoder(StringUtf8Coder.of())
                                .by(KV::getKey)
                                .via(Contextful.fn(KV::getValue), TextIO.sink())
                                .withNumShards(options.getNumShards())
                                .to(options.getOutputDirectory())
                                .withNaming(DatePartitionedName::new)
                                .withTempDirectory(ValueProvider.NestedValueProvider.of(
                                        maybeUseUserTempLocation(
                                                options.getUserTempLocation(),
                                                options.getOutputDirectory()),
                                        input -> FileBasedSink.convertToFileResourceIfPossible(input).toString()))
                                .withCompression(Compression.GZIP));

        // Execute the pipeline and return the result.
        return pipeline.run();
    }

    public static void main(String[] args) throws IllegalArgumentException {
        PipelineOptionsFactory.register(RuntimeOptions.class);
        RuntimeOptions options = PipelineOptionsFactory
                .fromArgs(args)
                .withValidation()
                .as(RuntimeOptions.class);
        options.setJobName("hedwig-firehose");
        options.setStreaming(true);
        options.setRunner(DataflowRunner.class);
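        // Note: with DataflowRunner, run() returns once the job has been
        // submitted, so exiting immediately below does not stop the streaming job.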
        PipelineResult result = Firehose.run(options);
        // ignore result?
        System.exit(0);
    }

    /**
     * Utility method for using the optional parameter userTempLocation as TempDirectory.
     * This is useful when the output bucket is locked and temporary data cannot be deleted.
     *
     * @param userTempLocation user provided temp location
     * @param outputLocation user provided outputDirectory to be used as the default temp location
     * @return userTempLocation if available, otherwise outputLocation
     */
    private static ValueProvider<String> maybeUseUserTempLocation(
            ValueProvider<String> userTempLocation,
            ValueProvider<String> outputLocation) {
        return DualInputNestedValueProvider.of(
                userTempLocation,
                outputLocation,
                (SerializableFunction<DualInputNestedValueProvider.TranslatorInput<String, String>, String>)
                        input -> (input.getX() != null) ? input.getX() : input.getY());
    }
}
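Downstream consumers read the windowed files line by line; when transport attributes were present, each line is one Container.PayloadV1 printed with proto field names preserved. Below is a minimal read-back sketch, assuming the hedwig Container protos are on the classpath and a JsonFormat.TypeRegistry has been populated with the same message descriptors the pipeline used (the ParseLine class and parse method names are illustrative, not part of this artifact):

import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.JsonFormat;
import io.github.cloudchacho.hedwig.Container;

public class ParseLine {
    /** Parses one JSON line written by the firehose back into a container payload. */
    public static Container.PayloadV1 parse(String jsonLine, JsonFormat.TypeRegistry registry)
            throws InvalidProtocolBufferException {
        Container.PayloadV1.Builder builder = Container.PayloadV1.newBuilder();
        // The registry must contain the message type packed into the Any `data`
        // field, otherwise JsonFormat cannot resolve its @type and parsing fails.
        JsonFormat.parser()
                .usingTypeRegistry(registry)
                .merge(jsonLine, builder);
        return builder.build();
    }
}

Note that lines emitted by the hand-written JSON fallbacks (no attributes, or a serialization failure) do not follow the container schema and need separate handling.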



