// com.bazaarvoice.emodb.databus.core.DefaultFanout Maven / Gradle / Ivy
package com.bazaarvoice.emodb.databus.core;
import com.bazaarvoice.emodb.common.dropwizard.lifecycle.ServiceFailureListener;
import com.bazaarvoice.emodb.common.dropwizard.log.RateLimitedLog;
import com.bazaarvoice.emodb.common.dropwizard.log.RateLimitedLogFactory;
import com.bazaarvoice.emodb.common.dropwizard.time.ClockTicker;
import com.bazaarvoice.emodb.databus.ChannelNames;
import com.bazaarvoice.emodb.databus.model.OwnedSubscription;
import com.bazaarvoice.emodb.datacenter.api.DataCenter;
import com.bazaarvoice.emodb.event.api.EventData;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Clock;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import static java.util.Objects.requireNonNull;
/**
 * Copies events from the "__system_bus:master" event channel or an inbound replication channel to the individual
 * subscription event channels.
 * <p>
 * Each source channel is handled by a single process in the EmoDB cluster. Generally this copy process is fast enough
 * (and I/O bound) that it's not necessary to spread the work across different servers, but if that changes we can
 * spread writes across multiple source channels (e.g. __system_bus:master1, __system_bus:master2, etc.).
 */
public class DefaultFanout extends AbstractScheduledService {
// Class logger; the constructor also wraps it in the rate-limited log used by runOneIteration().
private static final Logger _log = LoggerFactory.getLogger(DefaultFanout.class);
// Once a fanout task has buffered this many channel/event pairs it flushes mid-batch to cap memory use.
private static final int FLUSH_EVENTS_THRESHOLD = 500;
// Name of this fanout; appears in log messages and as a component of every metric name.
private final String _name;
// Source that raw events are read from in batches.
private final EventSource _eventSource;
// Destination for fanned-out events.
// NOTE(review): the generic type arguments of this declaration look truncated in this listing - confirm against
// the original source.
private final Function, Void> _eventSink;
// True when an outbound partition selector was supplied, i.e. events are also queued for remote data centers.
private final boolean _replicateOutbound;
// Chooses the replication fanout partition for an event key; null when outbound replication is disabled.
private final PartitionSelector _outboundPartitionSelector;
// Delay between scheduled iterations (see scheduler()).
private final Duration _sleepWhenIdle;
// Supplies the current subscriptions each time a batch of events is copied.
// NOTE(review): the generic type arguments of this declaration look truncated in this listing - confirm against
// the original source.
private final Supplier> _subscriptionsSupplier;
// Data center this process runs in; replication skips it when choosing remote destinations.
private final DataCenter _currentDataCenter;
// Rate-limited wrapper around _log, used for errors raised inside the fanout loop.
private final RateLimitedLog _rateLimitedLog;
// Resolves match data for raw events and decides which subscriptions an event matches.
private final SubscriptionEvaluator _subscriptionEvaluator;
// Event-count meters, registered in the constructor under "read", "written-local",
// "written-outbound-replication" and "subscription-match-evaluations".
private final Meter _eventsRead;
private final Meter _eventsWrittenLocal;
private final Meter _eventsWrittenOutboundReplication;
private final Meter _subscriptionMatchEvaluations;
// Timers for the individual phases of a copy; each is registered in the constructor via metricName().
private final Timer _totalCopyTimer;
private final Timer _fetchEventsTimer;
private final Timer _fetchSubscriptionsTimer;
private final Timer _fanoutTimer;
private final Timer _e2eFanoutTimer;
private final Timer _matchSubscriptionsTimer;
private final Timer _replicateTimer;
private final Timer _fetchMatchEventDataTimer;
private final Timer _eventFlushTimer;
// Time source for orphaned-event age checks and lag calculations.
private final Clock _clock;
// Stopwatch consulted by updateLagMetrics() to limit how often the lag value is refreshed.
private final Stopwatch _lastLagStopwatch;
// Lag gauge obtained from the FanoutLagMonitor for this fanout name and partition.
private final FanoutLagMonitor.Lag _lagGauge;
// Lag value most recently recorded by updateLagMetrics(); starts at -1.
private int _lastLagSeconds = -1;
// Fixed-size pool (8 threads) that runs the per-partition fanout tasks.
private final ExecutorService _fanoutPool;
/**
 * Creates a fanout service instance.
 * <p>
 * Null-checks the required collaborators, registers this fanout's meters and timers with the metric registry,
 * creates the lag gauge and its stopwatch, attaches a {@link ServiceFailureListener}, and creates the
 * 8-thread pool that runs fanout tasks.
 * <p>
 * NOTE(review): this listing appears to have lost text during extraction (generic type arguments on
 * {@code eventSink} and {@code subscriptionsSupplier}, the rest of the thread factory builder chain, and the
 * closing brace) - confirm against the original source.
 *
 * @param name                      name of this fanout; used in metric names, log messages and the lag gauge
 * @param partitionName             partition name passed to the lag monitor; must not be null
 * @param eventSource               source of raw events; must not be null
 * @param eventSink                 destination for fanned-out events; must not be null
 * @param outboundPartitionSelector partition selector for outbound replication, or null to disable
 *                                  outbound replication
 * @param sleepWhenIdle             delay between scheduled iterations; must not be null
 * @param subscriptionsSupplier     supplier of the current subscriptions; must not be null
 * @param currentDataCenter         the local data center; must not be null
 * @param logFactory                factory used to wrap the class logger in a rate-limited log
 * @param subscriptionEvaluator     evaluator used to match events to subscriptions; must not be null
 * @param fanoutLagMonitor          monitor that creates the lag gauge for this fanout; must not be null
 * @param metricRegistry            registry that the meters and timers are registered with
 * @param clock                     time source for lag and event-age calculations
 */
public DefaultFanout(String name,
String partitionName,
EventSource eventSource,
Function, Void> eventSink,
@Nullable PartitionSelector outboundPartitionSelector,
Duration sleepWhenIdle,
Supplier> subscriptionsSupplier,
DataCenter currentDataCenter,
RateLimitedLogFactory logFactory,
SubscriptionEvaluator subscriptionEvaluator,
FanoutLagMonitor fanoutLagMonitor,
MetricRegistry metricRegistry, Clock clock) {
_name = requireNonNull(name, "name");
// partitionName is validated here but only used further down, when the lag gauge is created.
requireNonNull(partitionName, "partitionName");
_eventSource = requireNonNull(eventSource, "eventSource");
_eventSink = requireNonNull(eventSink, "eventSink");
// Outbound replication is enabled exactly when a partition selector was supplied.
_replicateOutbound = outboundPartitionSelector != null;
_outboundPartitionSelector = outboundPartitionSelector;
_sleepWhenIdle = requireNonNull(sleepWhenIdle, "sleepWhenIdle");
_subscriptionsSupplier = requireNonNull(subscriptionsSupplier, "subscriptionsSupplier");
_currentDataCenter = requireNonNull(currentDataCenter, "currentDataCenter");
_subscriptionEvaluator = requireNonNull(subscriptionEvaluator, "subscriptionEvaluator");
_rateLimitedLog = logFactory.from(_log);
// Metric names depend on _name (see metricName()), so _name must already be assigned at this point.
_eventsRead = newEventMeter("read", metricRegistry);
_eventsWrittenLocal = newEventMeter("written-local", metricRegistry);
_eventsWrittenOutboundReplication = newEventMeter("written-outbound-replication", metricRegistry);
_subscriptionMatchEvaluations = newEventMeter("subscription-match-evaluations", metricRegistry);
_totalCopyTimer = metricRegistry.timer(metricName("total-copy"));
_fetchEventsTimer = metricRegistry.timer(metricName("fetch-events"));
_fetchSubscriptionsTimer = metricRegistry.timer(metricName("fetch-subscriptions"));
_fanoutTimer = metricRegistry.timer(metricName("fanout"));
_e2eFanoutTimer = metricRegistry.timer(metricName("e2e-fanout"));
_matchSubscriptionsTimer = metricRegistry.timer(metricName("match-subscriptions"));
_replicateTimer = metricRegistry.timer(metricName("replicate"));
_fetchMatchEventDataTimer = metricRegistry.timer(metricName("fetch-match-event-data"));
_eventFlushTimer = metricRegistry.timer(metricName("flush-events"));
_lagGauge = requireNonNull(fanoutLagMonitor, "fanoutLagMonitor").createForFanout(name, partitionName);
// The stopwatch runs off the injected clock, so lag throttling follows the same time source as _clock.
_lastLagStopwatch = Stopwatch.createStarted(ClockTicker.getTicker(clock));
_clock = clock;
ServiceFailureListener.listenTo(this, metricRegistry);
// NOTE(review): the builder chain on the next line is cut off in this listing - confirm the thread factory
// configuration against the original source.
final ThreadFactory fanoutThreadFactory = new ThreadFactoryBuilder()
// The pool size of 8 matches the divisor used to partition each event batch in copyEvents(List).
_fanoutPool = Executors.newFixedThreadPool(8, fanoutThreadFactory);
/**
 * Obtains the meter for one of this fanout's event counters.
 *
 * @param name           short counter name, e.g. "read"; expanded to a full metric name by metricName()
 * @param metricRegistry registry the meter is requested from
 * @return the meter the registry returns for the expanded metric name
 */
private Meter newEventMeter(String name, MetricRegistry metricRegistry) {
return metricRegistry.meter(metricName(name));
private String metricName(String name) {
return"bv.emodb.databus", "DefaultFanout", name, _name);
/**
 * Defines how often the service runs an iteration: no initial delay, then a fixed delay of
 * {@code _sleepWhenIdle} (in milliseconds) between iterations.
 *
 * @return the fixed-delay schedule
 */
protected Scheduler scheduler() {
return Scheduler.newFixedDelaySchedule(0, _sleepWhenIdle.toMillis(), TimeUnit.MILLISECONDS);
/**
 * Runs one scheduled iteration: keeps calling {@code copyEvents()} while the service is running and each
 * call reports that events were found. Any {@link Throwable} is reported through the rate-limited log,
 * after which the service is asked to stop and the call waits for termination.
 */
protected void runOneIteration() {
try {
//noinspection StatementWithEmptyBody
while (isRunning() && copyEvents()) {
// Loop w/o sleeping as long as we keep finding events.
} catch (Throwable t) {
// Fanout runs in a continuous loop. If we get into a bad state, use the rate limited log to avoid
// flooding the logs with a continuous stream of error messages. Include the event source name in the
// message template so we rate limit each event source independently.
_rateLimitedLog.error(t, "Unexpected fanout exception copying from " + _name + ": {}", t);
stopAsync().awaitTerminated(); // Give up leadership temporarily. Maybe another server will have more success.
/**
 * Shutdown hook for the service.
 * <p>
 * NOTE(review): only the comment below survives in this listing; the statement(s) that stop posting the
 * fanout lag appear to be missing - confirm against the original source.
 *
 * @throws Exception declared by the method signature
 */
protected void shutDown() throws Exception {
// Leadership lost, stop posting fanout lag
/**
 * Reads one batch of up to 1000 raw events from the event source and fans it out.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction (closing braces, and probably the
 * statements that stop {@code peekTime} and reset the lag metrics for an empty batch) - confirm against the
 * original source.
 *
 * @return {@code false} when the batch is empty or the service is no longer running; otherwise the result
 *         of {@code copyEvents(List)}
 */
private boolean copyEvents() {
try (Timer.Context ignored = _totalCopyTimer.time()) {
// Use peek() not poll() since LeaderSelector ensures we're not competing with other processes for claims.
// NOTE(review): the comment above refers to peek()/poll(), but the call below is _eventSource.get(1000) -
// confirm which EventSource operation is intended.
final Timer.Context peekTime = _fetchEventsTimer.time();
List rawEvents = _eventSource.get(1000);
// If no events, sleep a little while before doing any more work to allow new events to arrive.
if (rawEvents.isEmpty()) {
// Update the lag metrics to indicate there is no lag
return false;
// Last chance to check that we are the leader before doing anything that would be bad if we aren't.
return isRunning() && copyEvents(rawEvents);
/**
 * Fans a batch of raw events out to the matching subscription channels and, when outbound replication is
 * enabled, to the replication fanout channels of the other data centers.
 * <p>
 * The batch is split into at most 8 partitions and each partition is submitted to {@code _fanoutPool}.
 * Each task buffers its output in its own multimap and flushes it whenever the buffer reaches
 * {@code FLUSH_EVENTS_THRESHOLD} entries, plus once more at the end.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction (closing braces, generic type
 * arguments, and several statements such as the body of the futures loop and the right-hand side of
 * {@code lastMatchEventTime}), so the exact control flow must be confirmed against the original source.
 *
 * @param rawEvents the events to copy; the visible caller only passes a non-empty list
 * @return {@code true} on the visible return path
 */
boolean copyEvents(List rawEvents) {
// Read the list of subscriptions *after* reading events from the event store to avoid race conditions with
// creating a new subscription.
final Timer.Context subTime = _fetchSubscriptionsTimer.time();
Iterable subscriptions = _subscriptionsSupplier.get();
// Synchronized because it is created outside the tasks submitted to the fanout pool.
List lastMatchEventBatchTimes = Collections.synchronizedList(Lists.newArrayList());
try (final Timer.Context ignored = _e2eFanoutTimer.time()) {
final List> futures = new LinkedList<>();
// Copy the events to all the destination channels.
// The partition size is ceil(size / 8), which yields at most 8 partitions - one per thread in _fanoutPool.
// NOTE(review): an empty rawEvents list would make the partition size 0; the visible caller checks
// isEmpty() before calling this method.
for (final List rawEventPartition : Lists.partition(rawEvents, (int) Math.ceil(1.0 * rawEvents.size() / 8))) {
futures.add(_fanoutPool.submit(() -> {
try {
// multimap is not threadsafe
final List eventKeys = new LinkedList<>();
final ListMultimap eventsByChannel = LinkedListMultimap.create();
SubscriptionEvaluator.MatchEventData lastMatchEventData = null;
int numOutboundReplicationEvents = 0;
try (Timer.Context ignored1 = _fanoutTimer.time()) {
for (EventData rawEvent : rawEventPartition) {
ByteBuffer eventData = rawEvent.getData();
SubscriptionEvaluator.MatchEventData matchEventData;
try (Timer.Context ignored2 = _fetchMatchEventDataTimer.time()) {
matchEventData = _subscriptionEvaluator.getMatchEventData(eventData);
} catch (OrphanedEventException e) {
// There's a 2 second window where a race condition exists such that a newly created
// table may exist but due to caching the table may be cached as unknown. To allow
// plenty of room for error wait until over 30 seconds after the event was written
// before dropping the event. After this the event must be orphaned because
// the associated table was dropped.
if (e.getEventTime().until(_clock.instant(), ChronoUnit.SECONDS) > 30) {
// Copy to subscriptions in the current data center.
Timer.Context matchTime = _matchSubscriptionsTimer.time();
int subscriptionCount = 0;
for (OwnedSubscription subscription : subscriptions) {
subscriptionCount += 1;
if (_subscriptionEvaluator.matches(subscription, matchEventData)) {
// The destination channel for a matching subscription is the subscription's name.
eventsByChannel.put(subscription.getName(), eventData);
// Copy to queues for eventual delivery to remote data centers.
try (Timer.Context ignored4 = _replicateTimer.time()) {
if (_replicateOutbound) {
for (DataCenter dataCenter : matchEventData.getTable().getDataCenters()) {
// Skip the local data center; only the table's other data centers get a replicated copy.
if (!dataCenter.equals(_currentDataCenter)) {
int partition = _outboundPartitionSelector.getPartition(matchEventData.getKey());
String channel = ChannelNames.getReplicationFanoutChannel(dataCenter, partition);
eventsByChannel.put(channel, eventData);
// Flush to cap the amount of memory used to buffer events.
if (eventsByChannel.size() >= FLUSH_EVENTS_THRESHOLD) {
flush(eventKeys, eventsByChannel, numOutboundReplicationEvents);
numOutboundReplicationEvents = 0;
// Track the final match event data record returned
lastMatchEventData = matchEventData;
// Final flush.
flush(eventKeys, eventsByChannel, numOutboundReplicationEvents);
if (lastMatchEventData != null) {
} catch (Throwable t) {
// Logged here before rethrowing so a failure inside the pool thread shows up in the log.
_log.error("Uncaught exception in fanout pool. Thread will die, but it should be replaced.", t);
throw t;
for (final Future> future : futures) {
// Update the lag metrics based on the last event returned. This isn't perfect for several reasons:
// 1. In-order delivery is not guaranteed
// 2. The event time is based on the change ID which is close-to but not precisely the time the update occurred
// 3. Injected events have artificial change IDs which don't correspond to any clock-based time
// However, this is still a useful metric because:
// 1. Delivery is in-order the majority of the time
// 2. Change IDs are typically within milliseconds of update times
// 3. Injected events are extremely rare and should be avoided outside of testing anyway
// 4. The lag only becomes a concern on the scale of minutes, far above the uncertainty introduced by the above
Date lastMatchEventTime =;
if (lastMatchEventTime != null) {
return true;
/**
 * Recomputes the fanout lag from the given event time and records it if it is due for an update.
 * <p>
 * The lag is the whole number of seconds between {@code eventTime} and the current time of {@code _clock};
 * a null event time counts as zero lag.
 * <p>
 * NOTE(review): only the assignment to {@code _lastLagSeconds} survives inside the condition in this
 * listing; any statements that publish the value to the gauge or restart the stopwatch must be confirmed
 * against the original source.
 *
 * @param eventTime time of the most recent event, or null to indicate that there is no lag
 */
private void updateLagMetrics(@Nullable Date eventTime) {
int lagSeconds = eventTime == null ? 0 : (int) TimeUnit.MILLISECONDS.toSeconds(_clock.millis() - eventTime.getTime());
// As a performance savings only update the metric if both of the following are true:
// 1. It has been more than 5 seconds since the last time the metric was updated
// 2. The lag changed since the last posting
if (lagSeconds != _lastLagSeconds && _lastLagStopwatch.elapsed(TimeUnit.SECONDS) >= 5) {
_lastLagSeconds = lagSeconds;
private void flush(List eventKeys, Multimap eventsByChannel,
int numOutboundReplicationEvents) {
try (Timer.Context ignore = _eventFlushTimer.time()) {
if (!eventsByChannel.isEmpty()) {
_eventsWrittenLocal.mark(eventsByChannel.size() - numOutboundReplicationEvents);
if (!eventKeys.isEmpty()) {