All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.bullet.storm.drpc.DRPCQuerySubscriber Maven / Gradle / Ivy

Go to download

This is the implementation of Bullet - a real-time query engine - in Apache Storm.

The newest version!
/*
 *  Copyright 2017, Yahoo Inc.
 *  Licensed under the terms of the Apache License, Version 2.0.
 *  See the LICENSE file associated with the project for terms.
 */
package com.yahoo.bullet.storm.drpc;

import com.yahoo.bullet.common.BulletConfig;
import com.yahoo.bullet.pubsub.BufferingSubscriber;
import com.yahoo.bullet.pubsub.Metadata;
import com.yahoo.bullet.pubsub.PubSubMessage;
import com.yahoo.bullet.storm.drpc.utils.DRPCOutputCollector;
import lombok.extern.slf4j.Slf4j;
import org.apache.storm.drpc.DRPCSpout;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class wraps a {@link DRPCSpout} and uses it to read messages from Storm DRPC. It needs all the Storm config to
 * be able to connect to and read from the DRPC servers using Thrift.
 *
 * It buffers read queries in memory up to a specified limit (and stops till further commits are received) and can
 * re-emit failed queries. However, it is not resilient if the Subscriber is closed or reinitialized elsewhere.
 */
@Slf4j
public class DRPCQuerySubscriber extends BufferingSubscriber {
    private final DRPCSpout spout;
    private final DRPCOutputCollector collector;

    // PubSubMessage id to DRPC messageIDs. For failing requests if the subscriber is closed.
    private final Map<String, Object> emittedIDs;

    /**
     * Creates and initializes a Subscriber that reads from the DRPC servers. Intended to be used inside a Storm
     * spout in a Storm topology.
     *
     * @param config The config containing the String function in {@link DRPCConfig#DRPC_FUNCTION}, the Storm configuration
     *               {@link Map} as {@link com.yahoo.bullet.storm.BulletStormConfig#STORM_CONFIG} and the Storm
     *               {@link TopologyContext} as {@link com.yahoo.bullet.storm.BulletStormConfig#STORM_CONTEXT}.
     * @param maxUnCommittedQueries The maximum number of queries that can be read without committing them.
     */
    public DRPCQuerySubscriber(BulletConfig config, int maxUnCommittedQueries) {
        // Get the DRPC function we should subscribe to
        this(config, maxUnCommittedQueries, new DRPCOutputCollector(),
             new DRPCSpout(config.getRequiredConfigAs(DRPCConfig.DRPC_FUNCTION, String.class)));
    }

    /**
     * Exposed for testing.
     *
     * @param config The config containing the String function in {@link DRPCConfig#DRPC_FUNCTION}, the Storm configuration
     *               {@link Map} as {@link com.yahoo.bullet.storm.BulletStormConfig#STORM_CONFIG} and the Storm
     *               {@link TopologyContext} as {@link com.yahoo.bullet.storm.BulletStormConfig#STORM_CONTEXT}.
     * @param maxUnCommittedQueries The maximum number of queries that can be read without committing them.
     * @param collector The {@link DRPCOutputCollector} to use.
     * @param spout The {@link DRPCSpout} to use.
     */
    DRPCQuerySubscriber(BulletConfig config, int maxUnCommittedQueries, DRPCOutputCollector collector, DRPCSpout spout) {
        super(maxUnCommittedQueries);

        this.collector = collector;
        this.spout = spout;
        emittedIDs = new HashMap<>();

        // Get the Storm Config that has all the relevant cluster settings and properties
        Map stormConfig = config.getRequiredConfigAs(DRPCConfig.STORM_CONFIG, Map.class);

        // Get the TopologyContext
        TopologyContext context = config.getRequiredConfigAs(DRPCConfig.STORM_CONTEXT, TopologyContext.class);

        // Wrap the collector in a SpoutOutputCollector (it just delegates to the underlying DRPCOutputCollector)
        SpoutOutputCollector spoutOutputCollector = new SpoutOutputCollector(collector);

        spout.open(stormConfig, context, spoutOutputCollector);
    }

    /**
     * Polls the wrapped {@link DRPCSpout} once and converts any emitted DRPC request into a {@link PubSubMessage},
     * stashing the DRPC return information into the message {@link Metadata} so it can be used to reply later.
     *
     * @return A singleton {@link List} with the read {@link PubSubMessage}, or null if nothing was available.
     */
    @Override
    public List<PubSubMessage> getMessages() {
        // Try and read from DRPC. The DRPCSpout does a sleep for 1 ms if there are no tuples, so we don't have to do it.
        spout.nextTuple();

        if (!collector.haveOutput()) {
            return null;
        }

        // The DRPCSpout only should have emitted one tuple
        List<List<Object>> tuples = collector.reset();

        log.debug("Have a message through DRPC {}", tuples);
        List<Object> tupleAndID = tuples.get(0);

        // The first object is the actual DRPCSpout tuple and the second is the DRPC messageID.
        @SuppressWarnings("unchecked")
        List<Object> tuple = (List<Object>) tupleAndID.get(0);
        Object drpcID = tupleAndID.get(1);

        // The first object in the tuple is our PubSubMessage as JSON
        String pubSubMessageJSON = (String) tuple.get(0);
        // The second object in the tuple is the serialized returnInfo added by the DRPCSpout
        String returnInfo = (String) tuple.get(1);

        log.debug("Read message\n{}\nfrom DRPC with return information {}", pubSubMessageJSON, returnInfo);

        PubSubMessage pubSubMessage = PubSubMessage.fromJSON(pubSubMessageJSON);
        // Add returnInfo as metadata. Cannot add it to pubSubMessage
        String id = pubSubMessage.getId();
        String content = pubSubMessage.getContentAsString();
        PubSubMessage message = new PubSubMessage(id, content, new Metadata(null, returnInfo));

        // Remember the DRPC messageID so close() can fail the outstanding request if we never commit it.
        emittedIDs.put(id, drpcID);
        return Collections.singletonList(message);
    }

    @Override
    public void commit(String id) {
        super.commit(id);
        // The request has been answered; no need to fail it on close anymore.
        emittedIDs.remove(id);
    }

    /*
     * Not overriding fail to remove the (id, sequence) from emittedIDs because the message will be re-emitted by
     * super.fail and re-added back to emittedIDs. It is actually a bug to remove the id from emittedIDs in fail because
     * if the request is failed, then close is called before receive is called to re-add it back, the request will
     * NOT be closed and will remain on the DRPC servers till the global timeout.
     */

    @Override
    public void close() {
        log.warn("Failing all pending requests: {}", emittedIDs);
        // Fail every uncommitted DRPC request so the DRPC servers release them instead of waiting for the timeout.
        emittedIDs.values().forEach(spout::fail);
        log.info("Closing spout...");
        spout.close();
    }
}