
org.wikidata.query.rdf.tool.Munge


Tools to sync Wikibase to RDF stores. Also contains overall integration tests that rely on everything else.

package org.wikidata.query.rdf.tool;

import static java.lang.Boolean.FALSE;
import static org.wikidata.query.rdf.tool.StreamUtils.utf8;
import static org.wikidata.query.rdf.tool.options.OptionsUtils.handleOptions;
import static org.wikidata.query.rdf.tool.options.OptionsUtils.mungerFromOptions;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.RDFWriter;
import org.openrdf.rio.Rio;
import org.openrdf.rio.WriterConfig;
import org.openrdf.rio.helpers.BasicWriterSettings;
import org.openrdf.rio.turtle.TurtleParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.query.rdf.common.uri.OWL;
import org.wikidata.query.rdf.common.uri.Ontology;
import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
import org.wikidata.query.rdf.common.uri.UrisScheme;
import org.wikidata.query.rdf.tool.exception.ContainedException;
import org.wikidata.query.rdf.tool.options.MungeOptions;
import org.wikidata.query.rdf.tool.options.OptionsUtils;
import org.wikidata.query.rdf.tool.rdf.Munger;
import org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler;
import org.wikidata.query.rdf.tool.rdf.PrefixRecordingRdfHandler;

import com.codahale.metrics.Meter;

import de.thetaphi.forbiddenapis.SuppressForbidden;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import fi.iki.elonen.NanoHTTPD;

/**
 * Munges a Wikidata RDF dump so that it can be loaded in a single import.
 */
@SuppressWarnings("checkstyle:classfanoutcomplexity")
public class Munge implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(Munge.class);

    /**
     * Run a bulk munge configured from the command line.
     */
    @SuppressWarnings("checkstyle:illegalcatch")
    public static void main(String[] args) {
        MungeOptions options = handleOptions(MungeOptions.class, args);
        UrisScheme uris = OptionsUtils.WikibaseOptions.wikibaseUris(options);
        Munger munger = mungerFromOptions(options);

        int port = parsePort(options.to());

        OutputPicker<Writer> to;
        Httpd httpd = null;
        try {
            if (options.chunkSize() > 0) {
                if (port > 0) {
                    // We have two slots just in case
                    BlockingQueue<InputStream> queue = new ArrayBlockingQueue<>(2);
                    httpd = new Httpd(port, queue);
                    to = new ChunkedPipedWriterOutputPicker(queue, options.chunkSize());
                } else {
                    to = new ChunkedFileWriterOutputPicker(options.to(), options.chunkSize());
                }
            } else {
                if (port > 0) {
                    PipedInputStream toHttp = new PipedInputStream();
                    Writer writer = utf8(new PipedOutputStream(toHttp));
                    BlockingQueue<InputStream> queue = new ArrayBlockingQueue<>(1);
                    queue.put(toHttp);
                    httpd = new Httpd(port, queue);
                    to = new AlwaysOutputPicker<>(writer);
                } else {
                    to = new AlwaysOutputPicker<>(CliUtils.writer(options.to()));
                }
            }
            if (httpd != null) {
                log.info("Starting embedded http sever on port {}", port);
                log.info("This process will exit when the whole dump has been served");
                httpd.start();
            }
        } catch (IOException e) {
            log.error("Error finding output", e);
            System.exit(1);
            return;
        } catch (InterruptedException e) {
            log.error("Interrupted while waiting on httpd", e);
            System.exit(1);
            return;
        }
        try {
            Munge munge = new Munge(uris, munger, openInput(options.from()), to);
            munge.run();
        } catch (RuntimeException e) {
            log.error("Fatal error munging RDF", e);
            System.exit(1);
        }
        waitForHttpdToShutDownIfNeeded(httpd);
    }
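
    /*
     * For illustration, a hypothetical invocation (the long option names are
     * assumed from the MungeOptions accessors used above and may differ):
     *
     *   java org.wikidata.query.rdf.tool.Munge \
     *       --from latest-all.ttl.gz \
     *       --to "wikidump-%09d.ttl.gz" \
     *       --chunkSize 100000
     *
     * Passing "--to port:8888" instead serves the munged output from the
     * embedded HTTP server, and the process exits once the whole dump has
     * been sent.
     */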

    /**
     * Parse the http port from the "to" parameter if there is one, return 0
     * otherwise.
     */
    private static int parsePort(String to) {
        if (to.startsWith("port:")) {
            return Integer.parseInt(to.substring("port:".length()));
        }
        return 0;
    }

    /**
     * Open the input using the "from" parameter, exiting on failure.
     */
    private static Reader openInput(String from) {
        try {
            return CliUtils.reader(from);
        } catch (IOException e) {
            log.error("Error finding input", e);
            System.exit(1);
            return null;
        }
    }

    /**
     * Wait for the HTTP server to shutdown if it was used.
     */
    private static void waitForHttpdToShutDownIfNeeded(Httpd httpd) {
        if (httpd == null) {
            return;
        }
        log.info("Finished munging and waiting for the http server to finish sending them");
        while (httpd.busy.get()) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                log.info("Interrupted while waiting for http server to finish sending", e);
                System.exit(1);
            }
        }
    }

    /**
     * Uris for this wikibase instance. Used to match the RDF as it's read.
     */
    private final UrisScheme uris;
    /**
     * Munges the rdf.
     */
    private final Munger munger;
    /**
     * Source of the rdf.
     */
    private final Reader from;
    /**
     * Where the munged RDF is synced.
     */
    private final OutputPicker<Writer> to;

    public Munge(UrisScheme uris, Munger munger, Reader from, OutputPicker<Writer> to) {
        this.uris = uris;
        this.munger = munger;
        this.from = from;
        this.to = to;
    }

    @Override
    public void run() {
        try {
            // TODO this is a temporary hack
            // RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
            RDFParser parser = new ForbiddenOk.HackedTurtleParser();
            OutputPicker<RDFHandler> writer = new WriterToRDFWriterChunkPicker(to);
            EntityMungingRdfHandler handler = new EntityMungingRdfHandler(uris, munger, writer);
            parser.setRDFHandler(new NormalizingRdfHandler(handler));
            try {
                parser.parse(from, uris.root());
            } catch (RDFParseException | RDFHandlerException | IOException e) {
                throw new RuntimeException(e);
            }
        } finally {
            try {
                from.close();
            } catch (IOException e) {
                log.error("Error closing input", e);
            }
            try {
                to.output().close();
            } catch (IOException e) {
                log.error("Error closing output", e);
            }
        }
    }

    /**
     * Collects statements about entities until it hits the next entity or the
     * end of the file, munges those statements, and then passes them to the
     * next handler. Note that this relies on the order of the data in the file
     * to be like:
     * <ul>
     * <li>http://www.wikidata.org/wiki/Special:EntityData/EntityId ?p ?o .
     * <li>everything about EntityId
     * <li>http://www.wikidata.org/wiki/Special:EntityData/NextEntityId ?p ?o .
     * <li>etc
     * </ul>
     * This is how the files are built so that is OK.
     */
    @SuppressFBWarnings(value = "URF_UNREAD_FIELD", justification = "the unread lastStatement field is used for debugging")
    private static class EntityMungingRdfHandler implements RDFHandler {
        /**
         * Uris for this instance of wikibase. We match on these.
         */
        private final UrisScheme uris;
        /**
         * Actually munges the entities!
         */
        private final Munger munger;
        /**
         * The place where we sync munged entities.
         */
        private final OutputPicker<RDFHandler> next;
        /**
         * The statements about the current entity.
         */
        private final List<Statement> statements = new ArrayList<>();
        /**
         * Meter measuring the number of entities we munge in grand load
         * average style.
         */
        private final Meter entitiesMeter = new Meter();
        /**
         * Have we hit any non Special:EntityData statements? Used to make
         * sure we properly pick up the first few statements in every entity.
         */
        private boolean haveNonEntityDataStatements;
        /**
         * The current entity being read. When we hit a new entity we send
         * the old statements to the munger and then sync them to next.
         */
        private String entityId;
        /**
         * Last statement.
         */
        private Statement lastStatement;

        EntityMungingRdfHandler(UrisScheme uris, Munger munger, OutputPicker<RDFHandler> next) {
            this.uris = uris;
            this.munger = munger;
            this.next = next;
        }

        @Override
        public void startRDF() throws RDFHandlerException {
            haveNonEntityDataStatements = false;
            next.output().startRDF();
        }

        @Override
        public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
            // Namespaces go through to the next handler.
            next.output().handleNamespace(prefix, uri);
        }

        @Override
        public void handleComment(String comment) throws RDFHandlerException {
            // Comments go right through to the next handler.
            next.output().handleComment(comment);
        }

        @Override
        @SuppressFBWarnings(value = "STT_STRING_PARSING_A_FIELD", justification = "low priority to fix")
        public void handleStatement(Statement statement) throws RDFHandlerException {
            lastStatement = statement;
            String subject = statement.getSubject().stringValue();
            if (subject.startsWith(uris.entityDataHttps()) || subject.startsWith(uris.entityData())) {
                if (haveNonEntityDataStatements) {
                    munge();
                }
                if (statement.getPredicate().stringValue().equals(SchemaDotOrg.ABOUT)) {
                    entityId = statement.getObject().stringValue();
                    entityId = entityId.substring(entityId.lastIndexOf('/') + 1);
                }
                statements.add(statement);
                return;
            }
            if (subject.equals(Ontology.DUMP)) {
                if (statement.getPredicate().stringValue().equals(SchemaDotOrg.SOFTWARE_VERSION)) {
                    munger.setFormatVersion(statement.getObject().stringValue());
                }
                /*
                 * Just pipe dump statements straight through.
                 */
                next.output().handleStatement(statement);
                return;
            }
            if (statement.getPredicate().stringValue().equals(OWL.SAME_AS)) {
                // Temporary fix for T100463
                if (haveNonEntityDataStatements) {
                    munge();
                }
                entityId = subject.substring(subject.lastIndexOf('/') + 1);
                statements.add(statement);
                haveNonEntityDataStatements = true;
                return;
            }
            haveNonEntityDataStatements = true;
            statements.add(statement);
        }

        @Override
        public void endRDF() throws RDFHandlerException {
            munge();
            next.output().endRDF();
        }

        /**
         * Munge an entity's worth of RDF and then sync it to the output.
         *
         * @throws RDFHandlerException if there is an error syncing it
         */
        private void munge() throws RDFHandlerException {
            try {
                log.debug("Munging {}", entityId);
                munger.munge(entityId, statements);
                for (Statement statement : statements) {
                    next.output().handleStatement(statement);
                }
                entitiesMeter.mark();
                if (entitiesMeter.getCount() % 10000 == 0) {
                    log.info("Processed {} entities at ({}, {}, {})", entitiesMeter.getCount(),
                            (long) entitiesMeter.getOneMinuteRate(),
                            (long) entitiesMeter.getFiveMinuteRate(),
                            (long) entitiesMeter.getFifteenMinuteRate());
                }
                next.entitiesMunged((int) entitiesMeter.getCount());
            } catch (ContainedException e) {
                log.warn("Error munging {}", entityId, e);
            }
            statements.clear();
            haveNonEntityDataStatements = false;
        }
    }

    /**
     * Very simple HTTP server that only knows how to spit out results from a
     * queue.
     */
    public static class Httpd extends NanoHTTPD {
        /**
         * Flag that the server is still busy. We try to make sure to set this
         * to false if we're not busy so the process can exit.
         */
        private final AtomicBoolean busy = new AtomicBoolean(false);
        /**
         * Queue from which Turtle formatted RDF is read.
         */
        private final BlockingQueue<InputStream> results;

        public Httpd(int port, BlockingQueue<InputStream> results) {
            super(port);
            this.results = results;
        }

        @Override
        public Response serve(IHTTPSession session) {
            try {
                busy.set(true);
                Response response = new Response(Response.Status.OK, "application/x-turtle", results.take()) {
                    @Override
                    protected void send(OutputStream outputStream) {
                        super.send(outputStream);
                        busy.set(false);
                    }
                };
                response.setChunkedTransfer(true);
                return response;
            } catch (InterruptedException e) {
                log.error("Interrupted while waiting for a result", e);
                Thread.currentThread().interrupt();
                busy.set(false);
                return new Response(Response.Status.INTERNAL_ERROR, "text/plain", "internal server error");
            }
        }
    }

    /**
     * Picks the right RDFHandler for writing.
     */
    public interface OutputPicker<T> {
        /**
         * Get the handler to write to.
         */
        T output();

        /**
         * Update the number of entities already handled.
         */
        void entitiesMunged(int entitiesMunged);
    }

    /**
     * An output picker that always returns one output.
     */
    public static class AlwaysOutputPicker<T> implements OutputPicker<T> {
        /**
         * The output to return.
         */
        private final T next;

        public AlwaysOutputPicker(T next) {
            this.next = next;
        }

        @Override
        public T output() {
            return next;
        }

        @Override
        public void entitiesMunged(int entitiesMunged) {
            // Intentionally do nothing
        }
    }

    /**
     * Output picker that starts new chunks after processing so many entities.
     */
    private abstract static class ChunkedWriterOutputPicker implements OutputPicker<Writer> {
        /**
         * The number of entities per writer.
         */
        private final int chunkSize;
        /**
         * Writer returned by output(). Initialized on first call to output().
         */
        private Writer writer;
        /**
         * The chunk number that writer was built for.
         */
        private int lastChunk = 1;

        ChunkedWriterOutputPicker(int chunkSize) {
            this.chunkSize = chunkSize;
        }

        @Override
        public Writer output() {
            if (writer == null) {
                writer = buildWriter(lastChunk);
            }
            return writer;
        }

        @Override
        public void entitiesMunged(int entitiesMunged) {
            int currentChunk = entitiesMunged / chunkSize + 1;
            if (lastChunk != currentChunk) {
                lastChunk = currentChunk;
                writer = buildWriter(lastChunk);
            }
        }

        /**
         * Build the next writer.
         */
        protected abstract Writer buildWriter(long chunk);
    }

    /**
     * OutputPicker that writes to files.
     */
    public static class ChunkedFileWriterOutputPicker extends ChunkedWriterOutputPicker {
        /**
         * Pattern for file names.
         */
        private final String pattern;

        public ChunkedFileWriterOutputPicker(String pattern, int chunkSize) {
            super(chunkSize);
            this.pattern = pattern;
        }

        @Override
        @SuppressFBWarnings(
                value = "EXS_EXCEPTION_SOFTENING_NO_CHECKED",
                justification = "Hiding IOException is suspicious, but seems to be the usual pattern in this project")
        protected Writer buildWriter(long chunk) {
            String file = String.format(Locale.ROOT, pattern, chunk);
            log.info("Switching to {}", file);
            try {
                return CliUtils.writer(file);
            } catch (IOException e) {
                throw new RuntimeException("Error switching chunks", e);
            }
        }
    }

    /**
     * OutputPicker that writes to PipedOutputStreams and puts the
     * corresponding PipedInputStreams on a BlockingQueue.
     */
    public static class ChunkedPipedWriterOutputPicker extends ChunkedWriterOutputPicker {
        /**
         * Queue to hold readable result streams.
         */
        private final BlockingQueue<InputStream> queue;

        public ChunkedPipedWriterOutputPicker(BlockingQueue<InputStream> queue, int chunkSize) {
            super(chunkSize);
            this.queue = queue;
        }

        @Override
        @SuppressFBWarnings(
                value = "EXS_EXCEPTION_SOFTENING_NO_CHECKED",
                justification = "Hiding IOException is suspicious, but seems to be the usual pattern in this project")
        protected Writer buildWriter(long chunk) {
            PipedInputStream toQueue = new PipedInputStream();
            try {
                queue.put(toQueue);
                return utf8(new PipedOutputStream(toQueue));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException("Error switching chunks", e);
            } catch (IOException e) {
                throw new RuntimeException("Error switching chunks", e);
            }
        }
    }

    /**
     * Adapts an OutputPicker for writers to one for RDFHandlers, taking care
     * to always add all the prefixes.
     */
    private static class WriterToRDFWriterChunkPicker implements OutputPicker<RDFHandler> {
        /**
         * Map containing prefixes that have been written to any RDFHandler
         * that we then write to all the next handlers.
         */
        private final Map<String, String> prefixes = new LinkedHashMap<>();
        /**
         * The output picker for the writers.
         */
        private final OutputPicker<Writer> next;
        /**
         * The lastWriter used to build the RDFHandler. If it changes we build
         * a new RDFHandler.
         */
        private Writer lastWriter;
        /**
         * The current RDFHandler to write to.
         */
        private RDFHandler handler;

        WriterToRDFWriterChunkPicker(OutputPicker<Writer> next) {
            this.next = next;
            lastWriter = next.output();
            try {
                setHandlerFromLastWriter();
            } catch (RDFHandlerException e) {
                throw new RuntimeException("Error setting up first rdf writer", e);
            }
        }

        @Override
        public RDFHandler output() {
            Writer nextWriter = next.output();
            if (nextWriter == lastWriter) {
                return handler;
            }
            try {
                /*
                 * When we hit a new chunk we have to terminate rdf and start
                 * it on the next chunk.
                 */
                handler.endRDF();
                lastWriter.close();
                lastWriter = nextWriter;
                setHandlerFromLastWriter();
                handler.startRDF();
            } catch (RDFHandlerException | IOException e) {
                throw new RuntimeException("Error switching chunks", e);
            }
            return handler;
        }

        @Override
        public void entitiesMunged(int entitiesMunged) {
            next.entitiesMunged(entitiesMunged);
        }

        /**
         * Set the next handler from the lastWriter field.
         *
         * @throws RDFHandlerException if the handler throws it while
         *             initializing
         */
        private void setHandlerFromLastWriter() throws RDFHandlerException {
            final RDFWriter writer = Rio.createWriter(RDFFormat.TURTLE, lastWriter);
            final WriterConfig config = writer.getWriterConfig();
            config.set(BasicWriterSettings.PRETTY_PRINT, FALSE);
            handler = new PrefixRecordingRdfHandler(writer, prefixes);
            for (Map.Entry<String, String> prefix : prefixes.entrySet()) {
                handler.handleNamespace(prefix.getKey(), prefix.getValue());
            }
        }
    }

    /**
     * We need access to getMessage from exceptions. This is brittle but
     * (hopefully) temporary.
     */
    @SuppressForbidden
    private static class ForbiddenOk {
        /**
         * TurtleParser that tries to recover from errors we see in wikibase.
         */
        private static class HackedTurtleParser extends TurtleParser {
            @Override
            protected URI parseURI() throws IOException, RDFParseException {
                try {
                    return super.parseURI();
                } catch (RDFParseException e) {
                    if (e.getMessage().startsWith("IRI includes string escapes: ")
                            || e.getMessage().startsWith("IRI included an unencoded space: '32'")) {
                        log.warn("Attempting to recover from", e);
                        if (!e.getMessage().startsWith("IRI includes string escapes: '\\62'")) {
                            while (readCodePoint() != '>') {
                                /*
                                 * Dump until the end of the uri.
                                 */
                            }
                        }
                        return super.resolveURI("http://example.com/error");
                    }
                    throw e;
                }
            }

            @Override
            protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException {
                try {
                    super.parseStatement();
                } catch (RDFParseException e) {
                    if (e.getMessage().startsWith("Namespace prefix 'Warning' used but not defined")) {
                        log.warn("Attempting to recover from", e);
                        while (readCodePoint() != '\n') {
                            /*
                             * Just dump the rest of the line. Hopefully
                             * that'll be enough to recover.
                             */
                        }
                    } else {
                        throw e;
                    }
                }
            }
        }
    }
}
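
The class above is normally driven through its own main(), but the public nested types make it straightforward to wire up directly. Below is a minimal sketch of such a driver, assuming it lives in the same org.wikidata.query.rdf.tool package so it can reuse CliUtils and the nested OutputPicker types; the option handling mirrors main(), so no new flags are introduced.

package org.wikidata.query.rdf.tool;

import java.io.Reader;
import java.io.Writer;

import org.wikidata.query.rdf.common.uri.UrisScheme;
import org.wikidata.query.rdf.tool.options.MungeOptions;
import org.wikidata.query.rdf.tool.options.OptionsUtils;
import org.wikidata.query.rdf.tool.rdf.Munger;

/**
 * Minimal sketch of driving Munge without going through Munge.main(),
 * writing the whole munged dump to a single output instead of chunks.
 */
public final class MungeExample {
    private MungeExample() {
        // Utility class, no instances.
    }

    public static void main(String[] args) throws Exception {
        // Reuse the same option handling that Munge.main() uses.
        MungeOptions options = OptionsUtils.handleOptions(MungeOptions.class, args);
        UrisScheme uris = OptionsUtils.WikibaseOptions.wikibaseUris(options);
        Munger munger = OptionsUtils.mungerFromOptions(options);
        // Read the dump named by the "from" option and write everything,
        // unchunked, to the "to" option.
        Reader from = CliUtils.reader(options.from());
        Munge.OutputPicker<Writer> to =
                new Munge.AlwaysOutputPicker<>(CliUtils.writer(options.to()));
        new Munge(uris, munger, from, to).run();
    }
}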



