org.wikidata.query.rdf.tool.Updater Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tools Show documentation
Show all versions of tools Show documentation
Tools to sync Wikibase to RDF stores. Also contains overall integration tests that rely on everything else.
The newest version!
package org.wikidata.query.rdf.tool;
import static java.lang.Thread.currentThread;
import java.io.Closeable;
import java.io.IOException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.openrdf.model.Statement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.query.rdf.common.uri.UrisScheme;
import org.wikidata.query.rdf.tool.change.Change;
import org.wikidata.query.rdf.tool.change.Change.DelayedChange;
import org.wikidata.query.rdf.tool.exception.ContainedException;
import org.wikidata.query.rdf.tool.exception.RetryableException;
import org.wikidata.query.rdf.tool.rdf.Munger;
import org.wikidata.query.rdf.tool.rdf.RdfRepository;
import org.wikidata.query.rdf.tool.wikibase.WikibaseRepository;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
/**
 * Update tool.
 *
 * This contains the core logic of the update tool: it pulls batches of changes
 * from a {@link Change.Source}, fetches the RDF for every changed entity from
 * Wikibase, munges it and syncs the result into the RDF repository.
 *
 * @param <B> type of update batch
 */
// TODO fan out complexity
@SuppressWarnings("checkstyle:classfanoutcomplexity")
public class Updater<B extends Change.Batch> implements Runnable, Closeable {
    private static final Logger log = LoggerFactory.getLogger(Updater.class);
    /**
     * For how long (seconds) we should defer a change in case we detect replication lag.
     */
    private static final long DEFERRAL_DELAY = 5;
    /**
     * Meter for the raw number of updates synced.
     */
    private final Meter updatesMeter;
    /**
     * Meter measuring in a batch specific unit. For the RecentChangesPoller its
     * milliseconds, for the IdChangeSource its ids.
     */
    private final Meter batchAdvanced;
    /**
     * Measure how many updates skipped ahead of their change revisions.
     */
    private final Meter skipAheadMeter;
    /**
     * Source of change batches.
     */
    private final Change.Source<B> changeSource;
    /**
     * Wikibase to read rdf from.
     */
    private final WikibaseRepository wikibase;
    /**
     * Repository to which to sync rdf.
     */
    private final RdfRepository rdfRepository;
    /**
     * Munger to munge rdf from wikibase before adding it to the rdf store.
     */
    private final Munger munger;
    /**
     * The executor to use for updates.
     */
    private final ExecutorService executor;
    /**
     * Seconds to wait after we hit an empty batch. Empty batches signify that
     * there aren't any changes left now but the change stream isn't over. In
     * particular this will happen if the RecentChangesPoller finds no changes.
     */
    private final int pollDelay;
    /**
     * Uris for wikibase.
     */
    private final UrisScheme uris;
    /**
     * Queue of delayed changes.
     * Change is delayed if RDF data produces lower revision than change - this means we're
     * affected by replication lag.
     */
    private final DelayQueue<DelayedChange> deferralQueue;
    /**
     * Map entity->values list from repository.
     * Written by {@link #setValuesAndRefs} and read by worker tasks, always
     * under this object's monitor.
     */
    private ImmutableSetMultimap<String, String> repoValues;
    /**
     * Map entity->references list from repository.
     * Written by {@link #setValuesAndRefs} and read by worker tasks, always
     * under this object's monitor.
     */
    private ImmutableSetMultimap<String, String> repoRefs;
    /**
     * Should we verify updates?
     */
    private final boolean verify;

    /**
     * Build the updater.
     *
     * @param changeSource source of change batches
     * @param wikibase wikibase to read rdf from
     * @param rdfRepository repository to which to sync rdf
     * @param munger munger applied to the rdf fetched from wikibase
     * @param executor executor on which individual changes are prepared
     * @param pollDelay seconds to wait after an empty batch
     * @param uris uris for wikibase
     * @param verify should we verify updates?
     * @param metricRegistry registry in which the updater's meters are registered
     */
    Updater(Change.Source<B> changeSource, WikibaseRepository wikibase, RdfRepository rdfRepository,
            Munger munger, ExecutorService executor, int pollDelay, UrisScheme uris, boolean verify,
            MetricRegistry metricRegistry) {
        this.changeSource = changeSource;
        this.wikibase = wikibase;
        this.rdfRepository = rdfRepository;
        this.munger = munger;
        this.executor = executor;
        this.pollDelay = pollDelay;
        this.uris = uris;
        this.verify = verify;
        this.updatesMeter = metricRegistry.meter("updates");
        this.batchAdvanced = metricRegistry.meter("batch-progress");
        this.skipAheadMeter = metricRegistry.meter("updates-skip");
        this.deferralQueue = new DelayQueue<>();
    }

    /**
     * Main loop: fetch the first batch, then keep handling batches until the
     * source reports its last batch or the current thread is interrupted.
     */
    @Override
    public void run() {
        B batch = null;
        do {
            try {
                batch = changeSource.firstBatch();
            } catch (RetryableException e) {
                log.warn("Retryable error fetching first batch. Retrying.", e);
            }
            // Honour interruption here as well, otherwise a source that keeps
            // failing would make this loop impossible to stop.
        } while (batch == null && !currentThread().isInterrupted());
        if (batch == null) {
            // Interrupted before any batch could be fetched.
            return;
        }
        log.debug("{} changes in batch", batch.changes().size());
        Instant oldDate = null;
        while (!currentThread().isInterrupted()) {
            try {
                handleChanges(addDeferredChanges(deferralQueue, batch.changes()));
                Instant leftOffDate = batch.leftOffDate();
                if (leftOffDate != null) {
                    /*
                     * Back one second because the resolution on our poll isn't
                     * super good and because its not big deal to recheck if we
                     * have some updates.
                     */
                    leftOffDate = leftOffDate.minusSeconds(1);
                    // Do not update repo with the same date
                    if (oldDate == null || !oldDate.equals(leftOffDate)) {
                        syncDate(leftOffDate);
                        oldDate = leftOffDate;
                    }
                }
                // TODO wrap all retry-able exceptions in a special exception
                batchAdvanced.mark(batch.advanced());
                log.info("Polled up to {} at {} updates per second and {} {} per second", batch.leftOffHuman(),
                        meterReport(updatesMeter), meterReport(batchAdvanced), batch.advancedUnits());
                if (batch.last()) {
                    return;
                }
                wikibase.batchDone();
                batch = nextBatch(batch);
            } catch (InterruptedException e) {
                // Restore the flag; the loop condition then terminates the run.
                currentThread().interrupt();
            }
        }
    }

    /**
     * If there are any deferred changes, add them to changes list.
     *
     * Only deferrals that {@link DelayQueue#poll()} hands out are considered;
     * a deferral is dropped when the new changes already contain the same entity.
     *
     * @param deferralQueue queue of deferred changes, drained of everything poll() returns
     * @param newChanges freshly fetched changes
     * @return {@code newChanges} itself when the queue is empty, otherwise a new
     *         collection holding the new changes followed by the added deferrals
     */
    @VisibleForTesting
    static Collection<Change> addDeferredChanges(DelayQueue<DelayedChange> deferralQueue,
            Collection<Change> newChanges) {
        if (deferralQueue.isEmpty()) {
            return newChanges;
        }
        List<Change> allChanges = new ArrayList<>(newChanges);
        int deferrals = 0;
        Set<String> changeIds = newChanges.stream().map(Change::entityId).collect(ImmutableSet.toImmutableSet());
        for (DelayedChange deferred = deferralQueue.poll();
                deferred != null;
                deferred = deferralQueue.poll()) {
            if (changeIds.contains(deferred.getChange().entityId())) {
                // This title ID already has newer change, drop the deferral.
                // NOTE: here we assume that incoming stream always has changes in order of increasing revisions,
                // which sounds plausible.
                continue;
            }
            allChanges.add(deferred.getChange());
            deferrals++;
        }
        log.info("Added {} deferred changes, {} still in the queue", deferrals, deferralQueue.size());
        return allChanges;
    }

    /**
     * Record that we reached certain date in permanent storage.
     *
     * @param newDate the date to store as the left-off time
     */
    protected void syncDate(Instant newDate) {
        rdfRepository.updateLeftOffTime(newDate);
    }

    /**
     * Shut down the executor and close the change source.
     *
     * @throws IOException if closing the change source fails
     */
    @Override
    public void close() throws IOException {
        executor.shutdown();
        changeSource.close();
    }

    /**
     * Handle the changes in a batch.
     *
     * The changes are first filtered by {@link #getRevisionUpdates}, then each
     * remaining change is prepared on the executor, and finally all changes
     * that were prepared successfully are synced to the repository in one call.
     *
     * @param changes changes of the batch (including re-added deferrals)
     * @throws InterruptedException if the process is interrupted while waiting
     *             on changes to sync
     */
    protected void handleChanges(Iterable<Change> changes) throws InterruptedException {
        Set<Change> trueChanges = getRevisionUpdates(changes);
        long start = System.currentTimeMillis();
        List<Future<Change>> futureChanges = new ArrayList<>();
        for (Change change : trueChanges) {
            futureChanges.add(executor.submit(() -> {
                // Retry the same change until it succeeds or fails in a non-retryable way.
                while (true) {
                    try {
                        handleChange(change);
                        return change;
                    } catch (RetryableException e) {
                        log.warn("Retryable error syncing. Retrying.", e);
                    } catch (ContainedException e) {
                        log.warn("Contained error syncing. Giving up on {}", change.entityId(), e);
                        throw e;
                    }
                }
            }));
        }
        List<Change> processedChanges = new ArrayList<>();
        for (Future<Change> f : futureChanges) {
            try {
                processedChanges.add(f.get());
            } catch (ExecutionException e) {
                // Contained failures were logged by the task itself; anything
                // else would otherwise vanish silently, so report it here.
                if (!(e.getCause() instanceof ContainedException)) {
                    log.warn("Unexpected error syncing a change. Skipping it.", e);
                }
            }
        }
        log.debug("Preparing update data took {} ms, have {} changes", System.currentTimeMillis() - start, processedChanges.size());
        rdfRepository.syncFromChanges(processedChanges, verify);
        updatesMeter.mark(processedChanges.size());
    }

    /**
     * Filter change by revisions.
     * The revisions that have the same or superior revision in the DB will be removed.
     * As a side effect the values and references currently stored for the
     * retained entities are fetched and cached for {@link #handleChange}.
     *
     * @param changes Collection of incoming changes.
     * @return A set of changes that need to be entered into the repository.
     */
    private Set<Change> getRevisionUpdates(Iterable<Change> changes) {
        // List of changes that indeed need update
        Set<Change> trueChanges = new HashSet<>();
        // List of entity URIs that were changed
        Set<String> changeIds = new HashSet<>();
        // Highest-revision change seen per entity id, still to be checked against the repository
        Map<String, Change> candidateChanges = new HashMap<>();
        for (final Change change : changes) {
            if (change.revision() > Change.NO_REVISION) {
                Change c = candidateChanges.get(change.entityId());
                if (c == null || c.revision() < change.revision()) {
                    candidateChanges.put(change.entityId(), change);
                }
            } else {
                // No revision to compare against, so the change is always applied.
                trueChanges.add(change);
                changeIds.add(uris.entityIdToURI(change.entityId()));
            }
        }
        if (!candidateChanges.isEmpty()) {
            for (String entityId : rdfRepository.hasRevisions(candidateChanges.values())) {
                changeIds.add(entityId);
                // Cut off the entity prefix from the resulting URI
                trueChanges.add(candidateChanges.get(uris.entityURItoId(entityId)));
            }
        }
        log.debug("Filtered batch contains {} changes", trueChanges.size());
        if (!trueChanges.isEmpty()) {
            setValuesAndRefs(
                    rdfRepository.getValues(changeIds),
                    rdfRepository.getRefs(changeIds)
            );
            if (log.isDebugEnabled()) {
                synchronized (this) {
                    log.debug("Fetched {} values", repoValues.size());
                    log.debug("Fetched {} refs", repoRefs.size());
                }
            }
        } else {
            setValuesAndRefs(null, null);
        }
        return trueChanges;
    }

    /**
     * Replace the cached repository values and references in one step.
     *
     * @param values map entity->values, or null to clear
     * @param refs map entity->references, or null to clear
     */
    private synchronized void setValuesAndRefs(
            ImmutableSetMultimap<String, String> values,
            ImmutableSetMultimap<String, String> refs) {
        repoValues = values;
        repoRefs = refs;
    }

    /**
     * Fetch the next batch.
     *
     * Retries on retryable errors, sleeps for the poll delay when the source
     * reports a batch without any changes, and advances past batches whose
     * change list is empty.
     *
     * @param prevBatch the batch that was just processed
     * @return the next batch with a non-empty list of changes
     * @throws InterruptedException if the process was interrupted while waiting
     *             during the pollDelay or waiting on something else
     */
    private B nextBatch(B prevBatch) throws InterruptedException {
        B previous = prevBatch;
        while (true) {
            B batch;
            try {
                batch = changeSource.nextBatch(previous);
            } catch (RetryableException e) {
                log.warn("Retryable error fetching next batch. Retrying.", e);
                continue;
            }
            if (!batch.hasAnyChanges()) {
                log.info("Sleeping for {} secs", pollDelay);
                // Multiply as long so that large delays cannot overflow int.
                Thread.sleep(pollDelay * 1000L);
                continue;
            }
            if (batch.changes().isEmpty()) {
                // Nothing to process in this batch, but continue from it.
                previous = batch;
                continue;
            }
            log.debug("{} changes in batch", batch.changes().size());
            return batch;
        }
    }

    /**
     * Handle a change.
     * <ul>
     * <li>Fetch the RDF from the Wikibase install.
     * <li>Munge it against the values and references cached from the repository.
     * <li>If the fetched revision is older than the change's revision, put the
     * change on the deferral queue; if it is newer, count it in the skip-ahead meter.
     * <li>Attach the statements and the cleanup lists to the change. The actual
     * sync to the triple store is done by the caller.
     * </ul>
     *
     * @param change the change to prepare
     * @throws RetryableException if there is a retryable error updating the rdf
     *             store
     */
    private void handleChange(Change change) throws RetryableException {
        log.debug("Processing data for {}", change);
        Collection<Statement> statements = wikibase.fetchRdfForEntity(change);
        Set<String> values = new HashSet<>();
        Set<String> refs = new HashSet<>();
        ImmutableSetMultimap<String, String> repoValues;
        ImmutableSetMultimap<String, String> repoRefs;
        // Take a consistent snapshot of both maps under the same lock that guards their update.
        synchronized (this) {
            repoValues = this.repoValues;
            repoRefs = this.repoRefs;
        }
        Change loadedChange = munger.mungeWithValues(change.entityId(), statements, repoValues, repoRefs, values, refs, change);
        if (!statements.isEmpty() && loadedChange != change) {
            // If we've got no statements, we have no usable loaded data, so no point in checking
            // Same if we just got back our own change - no point in checking against it
            final long sourceRev = change.revision();
            final long fetchedRev = loadedChange.revision();
            if (sourceRev > 0 && fetchedRev > 0) {
                if (fetchedRev < sourceRev) {
                    // Something weird happened - we've got stale revision!
                    log.warn("Stale revision on {}: change is {}, RDF is {}", change.entityId(), sourceRev, fetchedRev);
                    change.delay(deferralQueue, DEFERRAL_DELAY);
                } else if (sourceRev < fetchedRev) {
                    // We skipped some revisions, let's count it in meter
                    skipAheadMeter.mark();
                }
            }
        }
        change.setRefCleanupList(refs);
        change.setValueCleanupList(values);
        change.setStatements(statements);
    }

    /**
     * Turn a Meter into a load average style report.
     *
     * @param meter the meter to report on
     * @return the one, five and fifteen minute rates formatted as {@code (a, b, c)}
     */
    private String meterReport(Meter meter) {
        return String.format(Locale.ROOT, "(%.1f, %.1f, %.1f)", meter.getOneMinuteRate(), meter.getFiveMinuteRate(),
                meter.getFifteenMinuteRate());
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy