org.wikidata.query.rdf.tool.change.KafkaPoller

Tools to sync Wikibase to RDF stores. Also contains overall integration tests that rely on everything else.

package org.wikidata.query.rdf.tool.change;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.util.function.Function.identity;
import static java.util.stream.Collectors.toMap;

import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;

import javax.annotation.Nonnull;

import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.InterruptException;
import org.apache.kafka.common.errors.WakeupException;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.query.rdf.tool.Utils;
import org.wikidata.query.rdf.tool.change.events.ChangeEvent;
import org.wikidata.query.rdf.tool.change.events.EventWithChronology;
import org.wikidata.query.rdf.tool.change.events.PageDeleteEvent;
import org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent;
import org.wikidata.query.rdf.tool.exception.RetryableException;
import org.wikidata.query.rdf.tool.wikibase.WikibaseRepository.Uris;

import com.codahale.metrics.Counter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.Timer.Context;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.AtomicLongMap;

/**
 * This poller connects to a Kafka event source and gets changes from
 * one or more topics there.
 *
 * After each block of changes is processed, we store current offsets in the DB as:
 * {@code
 *  wikibase:kafka ( "topic:0" 1234 )
 * }
 *
 * When the poller is started, the offsets are initialized from one of two sources:
 * a) if the stored offsets described above are found, they are used for initialization;
 * b) if not, the given timestamp is used instead.
 *
 * This also means that if the Kafka offsets are ever reset, the old stored offsets need to be
 * cleaned up manually. See the KafkaOffsetsRepository.UPDATE_OFFSETS query for how to do it.
 *
 * See https://wikitech.wikimedia.org/wiki/EventBus for more docs on events in Kafka.
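 *
 * A minimal usage sketch (illustrative only, error handling omitted; the server list,
 * consumer id and cluster name below are assumptions, and uris, kafkaOffsetsRepository
 * and metricRegistry are assumed to already exist):
 * {@code
 *  KafkaPoller poller = KafkaPoller.buildKafkaPoller(
 *      "localhost:9092",           // Kafka bootstrap servers (example value)
 *      "wdqs-updater-example",     // consumer group id (example value)
 *      ImmutableList.of("eqiad"),  // optional cluster name prefixes (example value)
 *      uris, 1000, Instant.now(),
 *      false, kafkaOffsetsRepository, metricRegistry);
 *  KafkaPoller.Batch batch = poller.firstBatch();
 *  // process the changes in the batch, then:
 *  batch = poller.nextBatch(batch);
 * }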
 */
@SuppressWarnings("checkstyle:classfanoutcomplexity") // TODO: refactoring required!
public class KafkaPoller implements Change.Source {

    private static final Logger log = LoggerFactory.getLogger(KafkaPoller.class);

    private static final String MAX_POLL_PROPERTY = KafkaPoller.class.getName() + ".maxPoll";
    private static final String MAX_FETCH_PROPERTY = KafkaPoller.class.getName() + ".maxFetch";
    /**
     * Name of the system property used to override the topic which offset reporting is based on.
     */
    private static final String REPORTING_TOPIC_PROP = KafkaPoller.class.getName() + ".reportingTopic";

    /**
     * List of topics to listen to.
     * mediawiki.revision-create
     * mediawiki.page-delete
     * mediawiki.page-undelete
     * mediawiki.page-properties-change
     * Note that these may not be the real topic names if we have a
     * cluster configuration.
     * FIXME: should this be configurable?
     */
    private static final Map<String, Class<? extends ChangeEvent>> defaultTopics = ImmutableMap.of(
            // Not using this for now since revision-create should cover it
//                  "mediawiki.recentchange", RecentChangeEvent.class,
            "mediawiki.revision-create", RevisionCreateEvent.class,
            "mediawiki.page-delete", PageDeleteEvent.class,
            // Same class as revision-create since relevant data part looks the same
            "mediawiki.page-undelete", RevisionCreateEvent.class
//            "mediawiki.page-properties-change", PropertiesChangeEvent.class
    );

    /**
     * Default name of the topic which offset reporting will be based on.
     */
    private static final String DEFAULT_REPORTING_TOPIC =  "mediawiki.revision-create";
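    // Note: the reporting topic is matched by suffix (topic.endsWith(reportingTopic) in fetch()),
    // so cluster-prefixed topic names such as "eqiad.mediawiki.revision-create" (the prefix here
    // is only an example) still count towards the reported timestamp.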

    /**
     * Topic which offset reporting will be based on.
     */
    private final String reportingTopic;

    /**
     * The first start time to poll.
     */
    private final Instant firstStartTime;
    /**
     * Size of the batches to poll against wikibase.
     */
    private final int batchSize;
    /**
     * Kafka consumer.
     */
    private final Consumer<String, ChangeEvent> consumer;
    /**
     * Used to store and retrieve Kafka Offsets.
     */
    private final KafkaOffsetsRepository kafkaOffsetsRepository;
    /**
     * Wikibase URIs setup.
     */
    private Uris uris;
    /**
     * List of topics/partitions we're listening to.
     */
    @Nonnull
    private final ImmutableList<TopicPartition> topicPartitions;
    /**
     * Should we ignore stored offsets?
     */
    private final boolean ignoreStoredOffsets;

    /**
     * Counts the total number of changes polled from Kafka.
     */
    private final Counter changesCounter;

    /**
     * Counts the time spent polling Kafka.
     *
     * Note that this includes time spent waiting for a timeout if not enough
     * changes are available in Kafka for a poll to complete immediately.
     */
    private final Timer pollingTimer;

    public KafkaPoller(Consumer<String, ChangeEvent> consumer, Uris uris,
                       Instant firstStartTime, int batchSize, Collection<String> topics,
                       KafkaOffsetsRepository kafkaOffsetsRepository,
                       boolean ignoreStoredOffsets,
                       MetricRegistry metricRegistry) {
        this.consumer = consumer;
        this.uris = uris;
        this.firstStartTime = firstStartTime;
        this.batchSize = batchSize;
        this.changesCounter = metricRegistry.counter("kafka-changes-counter");
        this.pollingTimer = metricRegistry.timer("kafka-changes-timer");
        this.topicPartitions = topicsToPartitions(topics, consumer);
        this.kafkaOffsetsRepository = kafkaOffsetsRepository;
        this.ignoreStoredOffsets = ignoreStoredOffsets;
        this.reportingTopic = System.getProperty(REPORTING_TOPIC_PROP, DEFAULT_REPORTING_TOPIC);
    }

    /**
     * Create the map of topics with cluster name prefixes applied.
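     *
     * Example (illustrative, the cluster names here are made up): with
     * clusterNames = ["eqiad", "codfw"], the topic "mediawiki.revision-create" expands to
     * "eqiad.mediawiki.revision-create" and "codfw.mediawiki.revision-create",
     * i.e. one entry per cluster per topic.
     *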
     * @param clusterNames Cluster names (if empty, the original topic map is returned)
     * @return Map of topics keyed as %cluster%.%topic%
     */
    private static Map<String, Class<? extends ChangeEvent>> clusterNamesAwareTopics(Collection<String> clusterNames) {
        if (clusterNames == null || clusterNames.isEmpty()) {
            // No cluster - use topic names as is
            return defaultTopics;
        } else {
            return defaultTopics.entrySet().stream().flatMap(entry ->
                // Prepend cluster names to keys, e.g.:
                // page.revision => eqiad.page.revision
                clusterNames.stream().map(
                    cluster -> Maps.immutableEntry(cluster + "." + entry.getKey(), entry.getValue())
                )
            ).collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
        }
    }

    // Suppressing resource warnings so Java doesn't complain about KafkaConsumer not being closed
    @SuppressWarnings("resource")
    private static KafkaConsumer<String, ChangeEvent> buildKafkaConsumer(
            String servers, String consumerId,
            Map<String, Class<? extends ChangeEvent>> topicToClass,
            int batchSize) {
        // See http://kafka.apache.org/documentation.html#consumerconfigs
        Properties props = new Properties();
        props.put("bootstrap.servers", servers);
        props.put("group.id", consumerId);
        // This is the interval between polls after which the broker decides the client is dead.
        // 10 minutes (600000 ms) seems to be long enough.
        props.put("max.poll.interval.ms", "600000");
        // We will manually commit after the batch is processed
        props.put("enable.auto.commit", "false");
        // See https://cwiki.apache.org/confluence/display/KAFKA/KIP-41%3A+KafkaConsumer+Max+Records
        // Basically it works this way: Kafka fetches N records from each partition, where N is max.poll.records,
        // or as many as are available if there are fewer.
        // It then returns them to poll() in a round-robin fashion. The next fetch is not initiated
        // until the prefetched data dips below N.
        // TODO: Should we set it to half batch size, so in each batch we will have several topics?
        props.put("max.poll.records", System.getProperty(MAX_POLL_PROPERTY, String.valueOf(batchSize)));
        // This is about one batch of messages, since message sizes in the event queue are about 1 KB
        props.put("max.partition.fetch.bytes", System.getProperty(MAX_FETCH_PROPERTY, String.valueOf(batchSize * 1024)));
        log.info("Creating consumer {}", consumerId);
        return new KafkaConsumer<>(props, new StringDeserializer(), new JsonDeserializer<>(topicToClass));
    }

    @Nonnull
    public static KafkaPoller buildKafkaPoller(
            String kafkaServers, String consumerId, Collection<String> clusterNames,
            Uris uris, int batchSize, Instant startTime,
            boolean ignoreStoredOffsets, KafkaOffsetsRepository kafkaOffsetsRepository,
            MetricRegistry metricRegistry) {
        if (consumerId == null) {
            throw new IllegalArgumentException("Consumer ID (--consumer) must be set");
        }
        Map<String, Class<? extends ChangeEvent>> topicsToClass = clusterNamesAwareTopics(clusterNames);
        ImmutableSet<String> topics = ImmutableSet.copyOf(topicsToClass.keySet());

        return new KafkaPoller(
                buildKafkaConsumer(kafkaServers, consumerId, topicsToClass, batchSize),
                uris, startTime, batchSize, topics,
                kafkaOffsetsRepository,
                ignoreStoredOffsets,
                metricRegistry);
    }

    @Override
    public Batch firstBatch() throws RetryableException {
        Map<TopicPartition, OffsetAndTimestamp> kafkaOffsets = fetchOffsets();
        // assign ourselves to all partitions of the topics we want
        consumer.assign(kafkaOffsets.keySet());
        log.info("Subscribed to {} topics", kafkaOffsets.size());
        // Seek each topic to proper offset
        kafkaOffsets.forEach(
                (topic, offset) -> {
                    if (offset == null) {
                        log.info("No offset for {}, starting at the end", topic);
                        consumer.seekToEnd(Collections.singletonList(topic));
                        return;
                    }
                    consumer.seek(topic, offset.offset());
                    log.info("Set topic {} to {}", topic, offset);
                }
         );

        return fetch(firstStartTime);
    }

    /**
     * Fetch current offsets for all topics.
     * The offsets come either from persistent offset storage or
     * from offsetsForTimes() API.
     * @return Map TopicPartition -> offset
     */
    private Map<TopicPartition, OffsetAndTimestamp> fetchOffsets() {
        // Create a map of offsets from storage
        Map<TopicPartition, OffsetAndTimestamp> storedOffsets;
        if (ignoreStoredOffsets) {
            storedOffsets = ImmutableMap.of();
        } else {
            storedOffsets = kafkaOffsetsRepository.load(firstStartTime);
        }

        // Make a map (topic, partition) -> timestamp for those not in loaded map
        Map<TopicPartition, Long> topicParts = topicPartitions.stream()
                .filter(tp -> !storedOffsets.containsKey(tp))
                .collect(toMap(o -> o, o -> firstStartTime.toEpochMilli()));

        // Remove topics that are not supported anymore
        Map<TopicPartition, OffsetAndTimestamp> results = storedOffsets
                .entrySet().stream()
                .filter(e -> topicPartitions.contains(e.getKey()))
                .collect(Collectors.toMap(Map.Entry::getKey,
                        Map.Entry::getValue));

        // Fill up missing offsets from timestamp
        if (topicParts.size() > 0) {
            results.putAll(consumer.offsetsForTimes(topicParts));
        }

        return results;
    }

    public ImmutableMap<TopicPartition, Long> currentOffsets() {
        return topicPartitions.stream()
                    .collect(toImmutableMap(identity(), consumer::position));
    }

    @Override
    public Batch nextBatch(Batch lastBatch) throws RetryableException {
        consumer.commitSync();
        kafkaOffsetsRepository.store(currentOffsets());
        return fetch(lastBatch.leftOffDate());
    }

    /**
     * Fetch changes from Kafka.
     * @param lastNextStartTime where the last fetch left off.
     * @return Batch of changes.
     * @throws RetryableException
     */
    @SuppressWarnings({"checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity"})
    private Batch fetch(Instant lastNextStartTime) throws RetryableException {
        Map<String, Change> changesByTitle = new LinkedHashMap<>();
        ConsumerRecords<String, ChangeEvent> records;
        Instant nextInstant = Instant.EPOCH;
        AtomicLongMap<String> topicCounts = AtomicLongMap.create();
        while (true) {
            try (Context timerContext = pollingTimer.time()) {
                // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
                records = consumer.poll(1000);
            } catch (InterruptException | WakeupException e) {
                throw new RetryableException("Error fetching recent changes", e);
            }
            int count = records.count();
            log.debug("Fetched {} records from Kafka", count);
            changesCounter.inc(count);
            if (count == 0) {
                // If we got nothing from Kafka, get out of the loop and return what we have
                break;
            }
            boolean foundSomething = false;
            for (ConsumerRecord<String, ChangeEvent> record: records) {
                ChangeEvent event = record.value();
                String topic = record.topic();
                log.trace("Got event t:{} o:{}", record.topic(), record.offset());
                if (!event.domain().equals(uris.getHost())) {
                    // wrong domain, ignore
                    continue;
                }
                // Not checking the timestamp here, since different channels can have different
                // ideas about what the latest timestamp is.
                // check namespace
                if (!uris.isEntityNamespace(event.namespace())) {
                    continue;
                }
                if (event.isRedundant()) {
                    // This is a redundant event, we can skip it.
                    continue;
                }
                // Now we have event that we want to process
                foundSomething = true;
                topicCounts.getAndIncrement(record.topic());
                // Keep max time for the reporting topic
                // We use only one topic (or set of topics) here because otherwise when catching up we
                // could get messages from different topics with different times and the tracking becomes
                // very chaotic, jumping back and forth.
                if (topic.endsWith(reportingTopic)) {
                    nextInstant = Utils.max(nextInstant, Instant.ofEpochMilli(record.timestamp()));
                }
                // Using the offset here as the RC id since we do not have a real RC id (this not being an RC poller),
                // but the offset serves the same function in Kafka and is also useful for debugging.
                Change change = makeChange(event, record.offset());
                Change dupe = changesByTitle.put(change.entityId(), change);
                // If we have a duplicate - i.e. an event with the same title - we
                // keep the newest one by revision number, or, in the case of a deletion, we keep
                // the delete, since deletes don't have their own revision number.
                // Note that rev numbers are used only to weed out repeated changes,
                // so the worst case is that we do some extra work. This can happen if a delete is
                // duplicated or combined with another event, since deletes will always be included.
                // This is not a big deal since deletes are relatively rare.
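                // Example (hypothetical revision numbers): if revision-create events for the same
                // title arrive with revisions 7 and then 5, we keep revision 7; if a page-delete
                // (NO_REVISION) is involved, the delete is kept regardless of arrival order.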
                if (dupe != null && change.revision() > Change.NO_REVISION && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                    // need to remove so that order will be correct
                    changesByTitle.remove(change.entityId());
                    changesByTitle.put(change.entityId(), dupe);
                }

            }
            log.debug("{} records left after filtering", changesByTitle.size());
            if (changesByTitle.size() >= batchSize) {
                // We have enough for the batch
                break;
            }
            if (changesByTitle.size() > 0 && !foundSomething) {
                log.info("Did not find anything useful in this batch, returning existing data");
                // We have changes and last poll didn't find anything new - return these ones, don't
                // wait for more.
                break;
            }
            // TODO: if we already have something and we've spent more than X seconds in the loop,
            // we probably should return without waiting for more
        }

        // If we didn't get anything useful in the reporting topic, keep the old value
        if (nextInstant.equals(Instant.EPOCH)) {
            nextInstant = lastNextStartTime;
        }

        final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
        log.info("Found {} changes", changes.size());
        if (log.isDebugEnabled()) {
            topicCounts.asMap().forEach((k, v) -> log.debug("Topic {}: {} records", k, v));
        }
        long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
        // Show the user the polled time minus one second, because we can't
        // be sure we got the whole second
        return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant);
    }

    /**
     * Create change object from event.
     */
    private Change makeChange(ChangeEvent event, long position) {
        if (event instanceof EventWithChronology) {
            return new Change(event.title(), event.revision(), event.timestamp(), position, ((EventWithChronology) event).chronologyId());
        } else {
            return new Change(event.title(), event.revision(), event.timestamp(), position);
        }
    }

    /**
     * Set up the list of partitions for topics we're interested in.
     */
    private static ImmutableList<TopicPartition> topicsToPartitions(
            Collection<String> topics,
            Consumer<String, ChangeEvent> consumer) {
        return topics.stream()
                // Get partitions
                .flatMap((String topic) -> consumer.partitionsFor(topic).stream())
                // Create TopicPartition objects
                .map(p -> new TopicPartition(p.topic(), p.partition()))
                .collect(toImmutableList());
    }

    public static final class Batch extends Change.Batch.AbstractDefaultImplementation {
        /**
         * The date where we last left off.
         */
        private final Instant leftOffDate;

        public Batch(ImmutableList<Change> changes, long advanced, String leftOff, Instant nextStartTime) {
            super(changes, advanced, leftOff);
            leftOffDate = nextStartTime;
        }

        @Override
        public String advancedUnits() {
            return "milliseconds";
        }

        @Override
        public Instant leftOffDate() {
            return leftOffDate;
        }
    }

    @Override
    public void close() {
        consumer.close();
    }

}