All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikidata.query.rdf.tool.Update Maven / Gradle / Ivy

Go to download

Tools to sync Wikibase to RDF stores. Also contains overall integration tests that rely on everything else.

There is a newer version: 0.3.2
Show newest version
package org.wikidata.query.rdf.tool;

import static java.util.concurrent.TimeUnit.SECONDS;
import static org.wikidata.query.rdf.tool.OptionsUtils.handleOptions;
import static org.wikidata.query.rdf.tool.OptionsUtils.mungerFromOptions;
import static org.wikidata.query.rdf.tool.wikibase.WikibaseRepository.inputDateFormat;
import static org.wikidata.query.rdf.tool.wikibase.WikibaseRepository.outputDateFormat;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.openrdf.model.Statement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.query.rdf.common.uri.WikibaseUris;
import org.wikidata.query.rdf.tool.OptionsUtils.BasicOptions;
import org.wikidata.query.rdf.tool.OptionsUtils.MungerOptions;
import org.wikidata.query.rdf.tool.OptionsUtils.WikibaseOptions;
import org.wikidata.query.rdf.tool.change.Change;
import org.wikidata.query.rdf.tool.change.Change.Batch;
import org.wikidata.query.rdf.tool.change.IdListChangeSource;
import org.wikidata.query.rdf.tool.change.IdRangeChangeSource;
import org.wikidata.query.rdf.tool.change.RecentChangesPoller;
import org.wikidata.query.rdf.tool.exception.ContainedException;
import org.wikidata.query.rdf.tool.exception.RetryableException;
import org.wikidata.query.rdf.tool.rdf.Munger;
import org.wikidata.query.rdf.tool.rdf.RdfRepository;
import org.wikidata.query.rdf.tool.wikibase.WikibaseRepository;

import com.codahale.metrics.JmxReporter;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.google.common.collect.Multimap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.lexicalscope.jewel.cli.Option;

/**
 * Update tool.
 *
 * @param  type of update batch
 */
// TODO fan out complexity
@SuppressWarnings("checkstyle:classfanoutcomplexity")
public class Update implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(Update.class);

    /**
     * CLI options for use with JewelCli.
     */
    @SuppressWarnings("checkstyle:javadocmethod")
    public interface Options extends BasicOptions, MungerOptions, WikibaseOptions {
        @Option(defaultValue = "https", description = "Wikidata url scheme")
        String wikibaseScheme();

        @Option(shortName = "s", defaultToNull = true, description = "Start time in 2015-02-11T17:11:08Z or 20150211170100 format.")
        String start();

        @Option(defaultToNull = true, description = "If specified must be  or -. Ids are iterated instead of recent "
                + "changes. Start and end are inclusive.")
        String ids();

        @Option(shortName = "u", description = "URL to post updates and queries.")
        String sparqlUrl();

        @Option(shortName = "d", defaultValue = "10", description = "Poll delay when no updates found")
        int pollDelay();

        @Option(shortName = "t", defaultValue = "10", description = "Thread count")
        int threadCount();

        @Option(shortName = "b", defaultValue = "100", description = "Number of recent changes fetched at a time.")
        int batchSize();

        @Option(shortName = "V", longName = "verify", description = "Verify updates (may have performance impact)")
        boolean verify();
    }

    /**
     * Run updates configured from the command line.
     */
    public static void main(String[] args) {
        Options options = handleOptions(Options.class, args);
        WikibaseRepository wikibaseRepository = new WikibaseRepository(options.wikibaseScheme(), options.wikibaseHost());
        URI sparqlUri;
        try {
            sparqlUri = new URI(options.sparqlUrl());
        } catch (URISyntaxException e) {
            log.error("Invalid url:  " + options.sparqlUrl() + " caused by " + e.getMessage());
            return;
        }
        WikibaseUris uris = new WikibaseUris(options.wikibaseHost());
        RdfRepository rdfRepository = new RdfRepository(sparqlUri, uris);
        Change.Source changeSource = buildChangeSource(options, rdfRepository,
                wikibaseRepository);
        if (changeSource == null) {
            return;
        }
        int threads = options.threadCount();
        ThreadFactoryBuilder threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("update %s");
        ExecutorService executor = new ThreadPoolExecutor(threads, threads, 0, TimeUnit.SECONDS,
                new LinkedBlockingQueue(), threadFactory.build());

        Munger munger = mungerFromOptions(options);
        new Update<>(changeSource, wikibaseRepository, rdfRepository, munger, executor,
                options.pollDelay(), uris, options.verify()).run();
    }

    /**
     * Build a change source.
     *
     * @return null if non can be built - its ok to just exit - errors have been
     *         logged to the user
     */
    @SuppressWarnings("checkstyle:cyclomaticcomplexity")
    private static Change.Source buildChangeSource(Options options, RdfRepository rdfRepository,
            WikibaseRepository wikibaseRepository) {
        if (options.ids() != null) {
            if (options.ids().contains(",")) {
                // Id list
                return new IdListChangeSource(options.ids().split(","), options.batchSize());
            }
            String[] ids = options.ids().split("-");
            long start;
            long end;
            switch (ids.length) {
            case 1:
                if (!Character.isDigit(ids[0].charAt(0))) {
                    // Not a digit - probably just single ID
                    return new IdListChangeSource(ids, options.batchSize());
                }
                start = Long.parseLong(ids[0]);
                end = start;
                break;
            case 2:
                start = Long.parseLong(ids[0]);
                end = Long.parseLong(ids[1]);
                break;
            default:
                log.error("Invalid format for --ids.  Need -.");
                return null;
            }
            return IdRangeChangeSource.forItems(start, end, options.batchSize());
        }
        long startTime;
        if (options.start() != null) {
            try {
                startTime = outputDateFormat().parse(options.start()).getTime();
            } catch (java.text.ParseException e) {
                try {
                    startTime = inputDateFormat().parse(options.start()).getTime();
                } catch (java.text.ParseException e2) {
                    log.error("Invalid date:  {}", options.start());
                    return null;
                }
            }
        } else {
            log.info("Checking where we left off");
            Date leftOff = rdfRepository.fetchLeftOffTime();
            long minStartTime = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(30);
            if (leftOff == null) {
                startTime = minStartTime;
                log.info("Defaulting start time to 30 days ago:  {}", inputDateFormat().format(new Date(startTime)));
            } else {
                if (leftOff.getTime() < minStartTime) {
                    log.error("RDF store reports the last update time is before the minimum safe poll time.  "
                            + "You will have to reload from scratch or you might have missing data.");
                    return null;
                }
                /*
                 * -2 seconds to because our precision is only 1 second and
                 * because it should be cheap to recheck that we have the right
                 * revision.
                 */
                startTime = leftOff.getTime();
                log.info("Found start time in the RDF store: {}", inputDateFormat().format(leftOff));
            }
        }
        return new RecentChangesPoller(wikibaseRepository, new Date(startTime), options.batchSize());
    }

    /**
     * Metric registry.
     */
    private final MetricRegistry metrics = new MetricRegistry();
    /**
     * Meter for the raw number of updates synced.
     */
    private final Meter updateMeter = metrics.meter("updates");
    /**
     * Meter measuring in a batch specific unit. For the RecentChangesPoller its
     * milliseconds, for the IdChangeSource its ids.
     */
    private final Meter batchAdvanced = metrics.meter("batch-progress");
    /**
     * JMX interface for metrics counters.
     */
    private final JmxReporter reporter = JmxReporter.forRegistry(metrics).build();
    /**
     * Source of change batches.
     */
    private final Change.Source changeSource;
    /**
     * Wikibase to read rdf from.
     */
    private final WikibaseRepository wikibase;
    /**
     * Repository to which to sync rdf.
     */
    private final RdfRepository rdfRepository;
    /**
     * Munger to munge rdf from wikibase before adding it to the rdf store.
     */
    private final Munger munger;
    /**
     * The executor to use for updates.
     */
    private final ExecutorService executor;
    /**
     * Seconds to wait after we hit an empty batch. Empty batches signify that
     * there aren't any changes left now but the change stream isn't over. In
     * particular this will happen if the RecentChangesPoller finds no changes.
     */
    private final int pollDelay;
    /**
     * Uris for wikibase.
     */
    private final WikibaseUris uris;
    /**
     * Map entity->values list from repository.
     */
    private Multimap repoValues;
    /**
     * Map entity->references list from repository.
     */
    private Multimap repoRefs;
    /**
     * Should we verify updates?
     */
    private final boolean verify;

    public Update(Change.Source changeSource, WikibaseRepository wikibase, RdfRepository rdfRepository,
            Munger munger, ExecutorService executor, int pollDelay, WikibaseUris uris, boolean verify) {
        this.changeSource = changeSource;
        this.wikibase = wikibase;
        this.rdfRepository = rdfRepository;
        this.munger = munger;
        this.executor = executor;
        this.pollDelay = pollDelay;
        this.uris = uris;
        this.verify = verify;
        reporter.start();
    }

    @Override
    public void run() {
        B batch = null;
        do {
            try {
                batch = changeSource.firstBatch();
            } catch (RetryableException e) {
                log.warn("Retryable error fetching first batch.  Retrying.", e);
            }
        } while (batch == null);
        log.debug("{} changes in batch", batch.changes().size());
        while (true) {
            try {
                handleChanges(batch);
                Date leftOffDate = batch.leftOffDate();
                if (leftOffDate != null) {
                    /*
                     * Back one second because the resolution on our poll isn't
                     * super good and because its not big deal to recheck if we
                     * have some updates.
                     */
                    leftOffDate = new Date(batch.leftOffDate().getTime() - SECONDS.toMillis(1));
                    rdfRepository.updateLeftOffTime(leftOffDate);
                }
                // TODO wrap all retry-able exceptions in a special exception
                batchAdvanced.mark(batch.advanced());
                log.info("Polled up to {} at {} updates per second and {} {} per second", batch.leftOffHuman(),
                        meterReport(updateMeter), meterReport(batchAdvanced), batch.advancedUnits());
                if (batch.last()) {
                    return;
                }
                batch = nextBatch(batch);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (ExecutionException e) {
                log.error("Syncing encountered a fatal exception", e);
                break;
            }
        }
    }

    /**
     * Handle the changes in a batch.
     *
     * @throws InterruptedException if the process is interrupted while waiting
     *             on changes to sync
     * @throws ExecutionException if there is an error syncing any of the
     *             changes
     */
    private void handleChanges(Change.Batch batch) throws InterruptedException, ExecutionException {
        List> tasks = new ArrayList<>();
        Set trueChanges = getRevisionUpdates(batch);
        long start = System.currentTimeMillis();
        for (final Change change : trueChanges) {
            tasks.add(executor.submit(new Runnable() {
                @Override
                public void run() {
                    while (true) {
                        try {
                            handleChange(change);
                            return;
                        } catch (RetryableException e) {
                            log.warn("Retryable error syncing.  Retrying.", e);
                        } catch (ContainedException e) {
                            log.warn("Contained error syncing.  Giving up on " + change.entityId(), e);
                            return;
                        }
                    }
                }
            }));
        }

        for (Future task : tasks) {
            task.get();
        }
        log.debug("Preparing update data took {} ms", System.currentTimeMillis() - start);
        rdfRepository.syncFromChanges(trueChanges, verify);
        updateMeter.mark(trueChanges.size());
    }

    /**
     * Filter change by revisions.
     * The revisions that have the same or superior revision in the DB will be removed.
     * @param batch
     * @return A set of changes that need to be entered into the repository.
     */
    private Set getRevisionUpdates(Change.Batch batch) {
        // List of changes that indeed need update
        Set trueChanges = new HashSet<>();
        // List of entity URIs that were changed
        Set changeIds = new HashSet<>();
        Map candidateChanges = new HashMap<>();
        for (final Change change : batch.changes()) {
            if (change.revision() >= 0) {
                Change c = candidateChanges.get(change.entityId());
                if (c == null || c.revision() < change.revision()) {
                    candidateChanges.put(change.entityId(), change);
                }
            } else {
                trueChanges.add(change);
                changeIds.add(uris.entity() + change.entityId());
            }
        }
        if (candidateChanges.size() > 0) {
            for (String entityId: rdfRepository.hasRevisions(candidateChanges.values())) {
                // Cut off the entity prefix from the resulting URI
                changeIds.add(entityId);
                trueChanges.add(candidateChanges.get(entityId.substring(uris.entity().length())));
            }
        }
        log.debug("Filtered batch contains {} changes", trueChanges.size());

        if (trueChanges.size() > 0) {
            repoValues = rdfRepository.getValues(changeIds);
            log.debug("Fetched {} values", repoValues.size());
            repoRefs = rdfRepository.getRefs(changeIds);
            log.debug("Fetched {} refs", repoRefs.size());
        } else {
            repoValues = null;
            repoRefs = null;
        }

        return trueChanges;
    }

    /**
     * Fetch the next batch.
     *
     * @throws InterruptedException if the process was interrupted while waiting
     *             during the pollDelay or waiting on something else
     */
    private B nextBatch(B batch) throws InterruptedException {
        while (true) {
            try {
                batch = changeSource.nextBatch(batch);
            } catch (RetryableException e) {
                log.warn("Retryable error fetching next batch.  Retrying.", e);
                continue;
            }
            if (batch.changes().isEmpty()) {
                log.debug("Sleeping for {} secs", pollDelay);
                Thread.sleep(pollDelay * 1000);
                continue;
            }
            log.debug("{} changes in batch", batch.changes().size());
            return batch;
        }
    }

    /**
     * Handle a change.
     * 
    *
  • Check if the RDF store has the version of the page. *
  • Fetch the RDF from the Wikibase install. *
  • Add revision information to the statements if it isn't there already. *
  • Sync data to the triple store. *
* * @throws RetryableException if there is a retryable error updating the rdf * store */ private void handleChange(Change change) throws RetryableException { log.debug("Processing data for {}", change); Collection statements = wikibase.fetchRdfForEntity(change.entityId()); Set values = new HashSet<>(repoValues.get(change.entityId())); Set refs = new HashSet<>(repoRefs.get(change.entityId())); munger.munge(change.entityId(), statements, values, refs, change); List cleanupList = new ArrayList<>(); cleanupList.addAll(values); cleanupList.addAll(refs); change.setStatements(statements); change.setCleanupList(cleanupList); } /** * Turn a Meter into a load average style report. */ private String meterReport(Meter meter) { return String.format(Locale.ROOT, "(%.1f, %.1f, %.1f)", meter.getOneMinuteRate(), meter.getFiveMinuteRate(), meter.getFifteenMinuteRate()); } }