org.wikidata.query.rdf.tool.change.RecentChangesPoller Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tools Show documentation
Show all versions of tools Show documentation
Tools to sync Wikibase to RDF stores. Also contains overall integration tests that rely on everything else.
The newest version!
package org.wikidata.query.rdf.tool.change;
import static java.lang.Boolean.TRUE;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.query.rdf.tool.Utils;
import org.wikidata.query.rdf.tool.exception.RetryableException;
import org.wikidata.query.rdf.tool.wikibase.Continue;
import org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse;
import org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange;
import org.wikidata.query.rdf.tool.wikibase.WikibaseRepository;
import com.codahale.metrics.Counter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.Timer.Context;
import com.google.common.collect.ImmutableList;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
/**
* Polls recent changes. The first batch polls the recent changes api starting
* at the firstStartTime. Subsequent batches start from the time at which the
* previous batch left off and, if the previous poll returned a continue,
* pass that continue along with the request. When backoff is enabled and
* that time is recent, the poll instead restarts slightly before it, by
* timestamp only.
*/
@SuppressFBWarnings("FCCD_FIND_CLASS_CIRCULAR_DEPENDENCY")
@SuppressWarnings("checkstyle:classfanoutcomplexity") // TODO: refactoring required!
public class RecentChangesPoller implements Change.Source {
private static final Logger log = LoggerFactory.getLogger(RecentChangesPoller.class);
/**
 * How many IDs we keep in seen IDs map.
 * Should be enough for 1 hour for speeds up to 100 updates/s.
 * If we ever get faster updates, we'd have to bump this.
 */
private static final int MAX_SEEN_IDS = 100 * 60 * 60;
/**
 * Wikibase repository to poll.
 */
private final WikibaseRepository wikibase;
/**
 * The first start time to poll.
 */
private final Instant firstStartTime;
/**
 * Size of the batches to poll against wikibase.
 */
private final int batchSize;
/**
 * Set of the recent change IDs we've seen before.
 * We have to use map here because LinkedHashMap has removeEldestEntry
 * and it does not implement Set. So we're using
 * Map with boolean values as Set.
 * The same map instance is handed to the tailing poller, which runs in
 * a different thread.
 */
private final Map<Long, Boolean> seenIDs;
/**
 * How far back should we tail with secondary tailing poller.
 * The value is in seconds.
 * 0 or negative means no tailing poller.
 */
private final int tailSeconds;
/**
 * How far before the last seen timestamp a recent fetch restarts.
 */
private static final Duration BACKOFF_TIME = Duration.ofSeconds(10);
/**
 * How old should the change be to not apply backoff.
 */
private static final Duration BACKOFF_THRESHOLD = Duration.ofMinutes(2);
/**
 * Queue for communicating with tailing updater.
 * The tailing poller is given this queue on creation; this poller drains
 * it with a non-blocking poll.
 */
private final BlockingQueue<Batch> queue = new ArrayBlockingQueue<>(100);
/**
 * Optional tailing poller.
 * This will be instantiated only if we were asked for it (tailSeconds > 0)
 * and when the poller catches up enough so that it is beyond the tailing gap
 * itself.
 * Note that tailing poller runs in different thread.
 */
private TailingChangesPoller tailPoller;
/**
 * Whether to use backoff.
 */
private boolean useBackoff = true;
/**
 * Timer measuring the time taken by each fetch of recent changes.
 */
private final Timer recentChangesTimer;
/**
 * Counts the number of changes fetched.
 */
private final Counter recentChangesCounter;
/**
 * Build a poller with every collaborator supplied explicitly.
 *
 * @param wikibase repository to poll
 * @param firstStartTime time at which the first batch starts polling
 * @param batchSize number of changes requested per poll
 * @param seenIDs map used as a set of already processed recent change IDs
 * @param tailSeconds gap of the tailing poller in seconds; 0 or negative disables it
 * @param recentChangesTimer timer wrapped around each fetch
 * @param recentChangesCounter counter incremented by the number of fetched changes
 */
public RecentChangesPoller(
        WikibaseRepository wikibase, Instant firstStartTime,
        int batchSize, Map<Long, Boolean> seenIDs, int tailSeconds,
        Timer recentChangesTimer, Counter recentChangesCounter) {
    this.wikibase = wikibase;
    this.firstStartTime = firstStartTime;
    this.batchSize = batchSize;
    this.seenIDs = seenIDs;
    this.tailSeconds = tailSeconds;
    this.recentChangesTimer = recentChangesTimer;
    this.recentChangesCounter = recentChangesCounter;
}
/**
 * Build a poller without a tailing poller, taking its metrics from the
 * given registry.
 *
 * @param wikibase repository to poll
 * @param firstStartTime time at which the first batch starts polling
 * @param batchSize number of changes requested per poll
 * @param metricRegistry registry providing the fetch timer and counter
 */
public RecentChangesPoller(
        WikibaseRepository wikibase,
        Instant firstStartTime,
        int batchSize,
        MetricRegistry metricRegistry) {
    // A negative tail gap means no tailing poller.
    this(wikibase, firstStartTime, batchSize, -1, metricRegistry);
}
/**
 * Build a poller with a fresh seen-IDs map, taking its metrics from the
 * given registry.
 *
 * @param wikibase repository to poll
 * @param firstStartTime time at which the first batch starts polling
 * @param batchSize number of changes requested per poll
 * @param tailSeconds gap of the tailing poller in seconds; 0 or negative disables it
 * @param metricRegistry registry providing the fetch timer and counter
 */
public RecentChangesPoller(
        WikibaseRepository wikibase,
        Instant firstStartTime,
        int batchSize,
        int tailSeconds,
        MetricRegistry metricRegistry) {
    this(
        wikibase,
        firstStartTime,
        batchSize,
        createSeenMap(),
        tailSeconds,
        metricRegistry.timer("recent-changes-timer"),
        metricRegistry.counter("recent-changes-counter"));
}
/**
 * Switch backoff on or off for subsequent fetches.
 *
 * @param useBackoff true to restart recent fetches slightly before the
 *        last seen timestamp, false to always fetch from it
 */
public void setBackoff(boolean useBackoff) {
    this.useBackoff = useBackoff;
}
/**
 * Create map of seen IDs.
 * The map holds at most MAX_SEEN_IDS entries; once it grows beyond that,
 * the entry that was inserted first is evicted. The result is wrapped in a
 * synchronized view.
 *
 * @return empty, size-bounded, synchronized map
 */
private static Map<Long, Boolean> createSeenMap() {
    // Insertion-ordered (accessOrder = false) map with max size, evicting oldest ID
    final Map<Long, Boolean> bounded = new LinkedHashMap<Long, Boolean>(MAX_SEEN_IDS, .75F, false) {
        @Override
        protected boolean removeEldestEntry(Map.Entry<Long, Boolean> eldest) {
            return size() > MAX_SEEN_IDS;
        }
    };
    return Collections.synchronizedMap(bounded);
}
/**
 * Poll the very first batch, starting at the configured first start time
 * and with no previous batch to continue from.
 *
 * @throws RetryableException on fetch failure
 */
@Override
public Batch firstBatch() throws RetryableException {
    final Batch noPreviousBatch = null;
    return batch(firstStartTime, noPreviousBatch);
}
/**
 * Poll the batch following the given one, starting where it left off.
 * When tailing is enabled the result is additionally passed through the
 * tail poller check.
 *
 * @param lastBatch the batch returned by the previous poll
 * @throws RetryableException on fetch failure
 */
@Override
public Batch nextBatch(Batch lastBatch) throws RetryableException {
    final Batch polled = batch(lastBatch.leftOffDate(), lastBatch);
    // Check if tail poller has something to say.
    return tailSeconds > 0 ? checkTailPoller(polled) : polled;
}
/**
 * Check if we need to poll something from secondary poller queue.
 * Starts the tailing poller once the given batch is within the tailing gap
 * of the current time; on later calls reports the batch position to it and
 * merges in one queued batch, if any is waiting.
 *
 * @param lastBatch the batch that was just polled
 * @return the given batch, or that batch merged with a queued tail batch
 */
private Batch checkTailPoller(Batch lastBatch) {
    if (tailSeconds <= 0) {
        // not enabled, we're done here
        return lastBatch;
    }
    if (tailPoller != null) {
        // Poller is already running: tell it where we are and drain one batch.
        tailPoller.setPollerTs(lastBatch.leftOffDate());
        final Batch queuedBatch = queue.poll();
        if (queuedBatch == null) {
            return lastBatch;
        }
        log.info("Merging {} changes from trailing queue", queuedBatch.changes().size());
        return lastBatch.merge(queuedBatch);
    }
    final Instant gapStart = Instant.now().minusSeconds(tailSeconds);
    if (lastBatch.leftOffDate().isBefore(gapStart)) {
        // still not caught up, do nothing
        return lastBatch;
    }
    // We don't have poller yet - start it
    log.info("Started trailing poller with gap of {} seconds", tailSeconds);
    // Create new poller starting back tailSeconds and same IDs map.
    final RecentChangesPoller secondary = new RecentChangesPoller(
            wikibase,
            Instant.now().minusSeconds(tailSeconds),
            batchSize,
            seenIDs,
            -1,
            recentChangesTimer,
            recentChangesCounter);
    secondary.setBackoff(false);
    tailPoller = new TailingChangesPoller(secondary, queue, tailSeconds);
    tailPoller.setDaemon(true);
    tailPoller.start();
    return lastBatch;
}
/**
 * Batch implementation for this poller.
 * Besides the changes it records the time at which polling left off and
 * the continue returned by the request that produced it.
 */
public static final class Batch extends Change.Batch.AbstractDefaultImplementation {
    /**
     * The date where we last left off.
     */
    private final Instant leftOffDate;
    /**
     * Continue from last request. Can be null.
     */
    private final Continue lastContinue;
    /**
     * Flag that states we have had changes, even though we didn't return them.
     */
    private boolean hasChanges;

    /**
     * A batch that will next continue using the continue parameter.
     *
     * @param changes changes carried by this batch
     * @param advanced how far this batch advanced, in milliseconds
     * @param leftOff textual form of the position handed to the superclass
     * @param nextStartTime time at which the next poll should start
     * @param lastContinue continue of the request, may be null
     */
    private Batch(ImmutableList<Change> changes, long advanced,
            String leftOff, Instant nextStartTime, Continue lastContinue) {
        super(changes, advanced, leftOff);
        leftOffDate = nextStartTime;
        this.lastContinue = lastContinue;
    }

    /**
     * Set changes status.
     * @param changes Whether we really had changes or not.
     */
    public void hasChanges(boolean changes) {
        hasChanges = changes;
    }

    /**
     * Report changes when the flag was set explicitly, otherwise defer to
     * the superclass.
     */
    @Override
    public boolean hasAnyChanges() {
        return hasChanges || super.hasAnyChanges();
    }

    @Override
    public String advancedUnits() {
        return "milliseconds";
    }

    @Override
    public Instant leftOffDate() {
        return leftOffDate;
    }

    /**
     * Human readable position: the left off date, followed by the continue
     * value when there is one.
     */
    @Override
    public String leftOffHuman() {
        if (lastContinue != null) {
            return leftOffDate + " (next: " + lastContinue.getRcContinue() + ")";
        } else {
            return leftOffDate.toString();
        }
    }

    /**
     * Merge this batch with another batch.
     * The other batch's changes come first, followed by this batch's
     * changes. Position, continue, advanced value and the explicit
     * "has changes" flag are all taken from this batch.
     *
     * @param another batch whose changes are prepended
     * @return a new batch; neither input is modified
     */
    @SuppressFBWarnings(value = "OCP_OVERLY_CONCRETE_PARAMETER", justification = "Type seems semantically correct")
    public Batch merge(Batch another) {
        final ImmutableList<Change> newChanges = new ImmutableList.Builder<Change>()
                .addAll(another.changes())
                .addAll(changes())
                .build();
        final Batch merged = new Batch(newChanges, advanced(), leftOffDate.toString(), leftOffDate, lastContinue);
        // Keep the flag: without it a flagged batch merged with an empty one
        // would report that it has no changes.
        merged.hasChanges(hasChanges);
        return merged;
    }

    /**
     * Get continue object.
     *
     * @return continue of the request that produced this batch, may be null
     */
    public Continue getLastContinue() {
        return lastContinue;
    }
}
/**
 * Tell whether the given time falls within the backoff threshold of now.
 * Such recent times call for backoff so that unsettled DB changes can
 * still be captured.
 *
 * @param nextStartTime time to test
 * @return true if the time is later than now minus the backoff threshold
 */
private boolean changeIsRecent(Instant nextStartTime) {
    final Instant oldestRecent = Instant.now().minus(BACKOFF_THRESHOLD);
    return nextStartTime.isAfter(oldestRecent);
}
/**
 * Fetch recent changes from Wikibase, with metrics.
 * The request itself is delegated to doFetchRecentChanges; this method
 * wraps it in the fetch timer and adds the number of returned changes to
 * the counter. The timer context is closed whether the fetch succeeds or
 * throws.
 *
 * @param lastNextStartTime time at which the previous poll left off
 * @param lastBatch previous batch, or null for the first poll
 * @throws RetryableException on fetch failure
 */
private RecentChangeResponse fetchRecentChanges(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    try (Context ignored = recentChangesTimer.time()) {
        final RecentChangeResponse response = doFetchRecentChanges(lastNextStartTime, lastBatch);
        final int fetched = response.getQuery().getRecentChanges().size();
        recentChangesCounter.inc(fetched);
        return response;
    }
}
/**
 * Issue the actual recent changes request.
 * If backoff is enabled and the start time is recent, we back off a bit
 * from it and fetch by timestamp only. Otherwise we fetch from the start
 * time, passing the continue of the previous batch when there is one.
 *
 * @param lastNextStartTime time at which the previous poll left off
 * @param lastBatch previous batch, or null for the first poll
 * @throws RetryableException on fetch failure
 */
private RecentChangeResponse doFetchRecentChanges(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    final boolean backOff = useBackoff && changeIsRecent(lastNextStartTime);
    if (!backOff) {
        Continue previousContinue = null;
        if (lastBatch != null) {
            previousContinue = lastBatch.getLastContinue();
        }
        return wikibase.fetchRecentChanges(lastNextStartTime, previousContinue, batchSize);
    }
    final Instant backedOffStart = lastNextStartTime.minus(BACKOFF_TIME);
    return wikibase.fetchRecentChangesByTime(backedOffStart, batchSize);
}
/**
 * Parse a batch from the api result.
 * Fetches one page of recent changes, skips entries outside entity
 * namespaces, entries with invalid titles and entries whose ID was already
 * seen, and keeps one change per entity. The next start time is the latest
 * timestamp among all returned entries, skipped or not.
 *
 * @param lastNextStartTime time at which the previous poll left off
 * @param lastBatch previous batch, or null for the first poll
 * @return batch of new changes, possibly empty
 * @throws RetryableException on parse failure
 */
@SuppressWarnings({"checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity"})
private Batch batch(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    RecentChangeResponse recentChanges = fetchRecentChanges(lastNextStartTime, lastBatch);
    // Using LinkedHashMap here so that changes came out sorted by order of arrival
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    Continue nextContinue = recentChanges.getContinue();
    Instant nextStartTime = lastNextStartTime;
    List<RecentChange> result = recentChanges.getQuery().getRecentChanges();
    for (RecentChange rc : result) {
        // Does not matter if the change matters for us or not, it
        // still advances the time since we've seen it.
        nextStartTime = Utils.max(nextStartTime, rc.getTimestamp());
        if (!wikibase.isEntityNamespace(rc.getNs())) {
            log.info("Skipping change in irrelevant namespace: {}", rc);
            continue;
        }
        if (!wikibase.isValidEntity(rc.getTitle())) {
            log.info("Skipping change with bogus title: {}", rc.getTitle());
            continue;
        }
        if (seenIDs.containsKey(rc.getRcId())) {
            // This change was in the last batch
            log.debug("Skipping repeated change with rcid {}", rc.getRcId());
            continue;
        }
        seenIDs.put(rc.getRcId(), TRUE);
        // Looks like we can not rely on changes appearing in order in RecentChanges,
        // so we have to take them all and let SPARQL sort out the dupes.
        Change change = toChange(rc);
        /*
         * Remove duplicate changes by title keeping the latest
         * revision. Note that negative revision means always update, so those
         * are kept.
         */
        Change dupe = changesByTitle.put(change.entityId(), change);
        if (dupe != null && (dupe.revision() > change.revision() || dupe.revision() < 0)) {
            // need to remove so that order will be correct
            changesByTitle.remove(change.entityId());
            changesByTitle.put(change.entityId(), dupe);
        }
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    // Backoff overflow is when:
    // a. We use backoff
    // b. We got full batch of changes.
    // c. None of those were new changes.
    // In this case, sleeping and trying again is obviously useless.
    final boolean backoffOverflow = useBackoff && changes.isEmpty() && result.size() >= batchSize;
    if (backoffOverflow) {
        // We have a problem here - due to backoff, we did not fetch any new items
        // Try to advance one second, even though we risk to lose a change - in hope
        // that trailing poller will pick them up.
        nextStartTime = nextStartTime.plusSeconds(1);
        log.info("Backoff overflow, advancing next time to {}", nextStartTime);
    }
    if (!changes.isEmpty()) {
        log.info("Got {} changes, from {} to {}", changes.size(),
                changes.get(0),
                changes.get(changes.size() - 1));
    } else {
        log.info("Got no real changes");
    }
    // Show the user the polled time - one second because we can't
    // be sure we got the whole second
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextStartTime);
    Batch batch = new Batch(changes, advanced, nextStartTime.minusSeconds(1).toString(), nextStartTime, nextContinue);
    if (backoffOverflow && nextContinue != null) {
        // We will not sleep if continue is provided.
        log.info("Got only old changes, next is: {}", nextContinue);
        batch.hasChanges(true);
    }
    return batch;
}

/**
 * Convert one recent change entry into a change.
 * Log entries without a revision get revision -1 so they are always
 * processed; everything else keeps its own revision.
 */
private static Change toChange(RecentChange rc) {
    // Literal first, so an entry without a type is treated as a regular change.
    final boolean logWithoutRevision = "log".equals(rc.getType()) && rc.getRevId() == 0;
    if (logWithoutRevision) {
        // Deletes should always be processed, so put negative revision
        return new Change(rc.getTitle(), -1L, rc.getTimestamp(), rc.getRcId());
    }
    return new Change(rc.getTitle(), rc.getRevId(), rc.getTimestamp(), rc.getRcId());
}
/**
 * Close this poller. Currently a no-op: nothing is released here, and a
 * started tailing poller is not stopped by this method.
 */
@Override
public void close() {
    // TODO: Do we need to do anything here?
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy