package org.wikidata.wdtk.examples;

/*
 * #%L
 * Wikidata Toolkit Examples
 * %%
 * Copyright (C) 2014 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipParameters;
import org.openrdf.rio.RDFFormat;
import org.wikidata.wdtk.datamodel.interfaces.Sites;
import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
import org.wikidata.wdtk.dumpfiles.MwRevision;
import org.wikidata.wdtk.dumpfiles.MwRevisionProcessor;
import org.wikidata.wdtk.dumpfiles.StatisticsMwRevisionProcessor;
import org.wikidata.wdtk.rdf.RdfSerializer;

/**
 * This class shows how to convert data from wikidata.org to RDF in
 * N-Triples format. The compressed output will be written into several
 * files that will be placed in the example directory.
 * <p>
 * In the future, this will probably become a stand-alone tool that can be
 * called directly.
 * 
 * @author Michael Günther
 * @author Markus Kroetzsch
 */
public class RdfSerializationExample {

	final static String COMPRESS_BZ2 = ".bz2";
	final static String COMPRESS_GZIP = ".gz";
	final static String COMPRESS_NONE = "";

	private static Sites sites;
	private static DumpProcessingController dumpProcessingController;
	private static List<RdfSerializer> serializers = new ArrayList<RdfSerializer>();
	private static List<String> serializerNames = new ArrayList<String>();

	public static void main(String[] args) throws IOException {
		// Define where log messages go
		ExampleHelpers.configureLogging();

		// Print information about this program
		printDocumentation();

		// Controller object for processing dumps:
		dumpProcessingController = new DumpProcessingController("wikidatawiki");
		// dumpProcessingController.setOfflineMode(true);

		// Initialize sites; needed to link to Wikipedia pages in RDF
		sites = dumpProcessingController.getSitesInformation();

		// Create serializers for several data parts and encodings:
		createRdfSerializer("wikidata-properties.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_PROPERTIES
						| RdfSerializer.TASK_ALL_EXACT_DATA);
		createRdfSerializer("wikidata-terms.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_TERMS);
		createRdfSerializer("wikidata-statements.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_STATEMENTS);
		createRdfSerializer("wikidata-simple-statements.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS
						| RdfSerializer.TASK_SIMPLE_STATEMENTS);
		createRdfSerializer("wikidata-taxonomy.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_TAXONOMY);
		createRdfSerializer("wikidata-instances.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_INSTANCE_OF);
		createRdfSerializer("wikidata-sitelinks.nt", COMPRESS_GZIP,
				RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_SITELINKS);

		// General statistics and time keeping:
		MwRevisionProcessor rpRevisionStats = new StatisticsMwRevisionProcessor(
				"revision processing statistics", 10000);
		// Subscribe to all current revisions (null = no model filter):
		dumpProcessingController.registerMwRevisionProcessor(rpRevisionStats,
				null, true);

		// Set up the serializers and write headers
		startSerializers();

		// Start processing (may trigger downloads where needed)
		dumpProcessingController.processMostRecentMainDump();

		// Finish the serialization
		closeSerializers();
	}
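
	/*
	 * Illustration (not part of the original program): the second argument of
	 * registerMwRevisionProcessor() above restricts a processor to revisions
	 * of one content model, while null subscribes it to all revisions. For
	 * example, to gather statistics for item pages only, one could register
	 * the processor as follows:
	 *
	 * dumpProcessingController.registerMwRevisionProcessor(rpRevisionStats,
	 * 		MwRevision.MODEL_WIKIBASE_ITEM, true);
	 */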

	/**
	 * Creates a new RDF serializer. Output is written to a file whose name
	 * is the given name plus the given compression extension. The tasks
	 * define what the serializer will be writing into this file. The new
	 * serializer is also registered in an internal list, so it can be
	 * started and closed more conveniently.
	 * 
	 * @param outputFileName
	 *            filename to write output to
	 * @param compressionExtension
	 *            the extension of the chosen compression format, or the
	 *            empty string for no compression
	 * @param tasks
	 *            an integer that is a bitwise OR of flags like
	 *            {@link RdfSerializer#TASK_LABELS}
	 * @return the newly created serializer
	 * @throws FileNotFoundException
	 *             if the given file cannot be opened for writing for some
	 *             reason
	 * @throws IOException
	 *             if it was not possible to write the compression header to
	 *             the file
	 */
	@SuppressWarnings("resource")
	private static RdfSerializer createRdfSerializer(String outputFileName,
			String compressionExtension, int tasks)
			throws FileNotFoundException, IOException {

		OutputStream bufferedFileOutputStream = new BufferedOutputStream(
				new FileOutputStream(outputFileName + compressionExtension),
				1024 * 1024 * 5); // 5 MiB write buffer

		OutputStream compressorOutputStream;
		switch (compressionExtension) {
		case COMPRESS_BZ2:
			compressorOutputStream = new BZip2CompressorOutputStream(
					bufferedFileOutputStream);
			break;
		case COMPRESS_GZIP:
			GzipParameters gzipParameters = new GzipParameters();
			gzipParameters.setCompressionLevel(7);
			compressorOutputStream = new GzipCompressorOutputStream(
					bufferedFileOutputStream, gzipParameters);
			break;
		case COMPRESS_NONE:
			compressorOutputStream = bufferedFileOutputStream;
			break;
		default:
			throw new IllegalArgumentException(
					"Unsupported compression format: " + compressionExtension);
		}

		OutputStream exportOutputStream = asynchronousOutputStream(compressorOutputStream);
		// // Alternative code: if not using separate threads, increase the
		// // pre-compression buffer instead:
		// OutputStream exportOutputStream = new BufferedOutputStream(
		// 		compressorOutputStream, 1024 * 1024 * 50);

		RdfSerializer serializer = new RdfSerializer(RDFFormat.NTRIPLES,
				exportOutputStream, sites);
		serializer.setTasks(tasks);
		dumpProcessingController.registerEntityDocumentProcessor(serializer,
				MwRevision.MODEL_WIKIBASE_ITEM, true);
		dumpProcessingController.registerEntityDocumentProcessor(serializer,
				MwRevision.MODEL_WIKIBASE_PROPERTY, true);

		serializers.add(serializer);
		serializerNames.add(outputFileName);

		return serializer;
	}
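
	/*
	 * Illustration (not part of the original program): the task flags are
	 * combined with bitwise OR. For example, a serializer that writes only
	 * the labels of items into a bzip2-compressed file could be created
	 * like this (the file name is made up for this sketch):
	 *
	 * createRdfSerializer("wikidata-labels.nt", COMPRESS_BZ2,
	 * 		RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_LABELS);
	 */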

	/**
	 * Starts the serializers. This includes the writing of headers, if any
	 * (N-Triples has no header, but other formats have one).
	 */
	private static void startSerializers() {
		for (RdfSerializer serializer : serializers) {
			serializer.start();
		}
	}

	/**
	 * Closes the serializers (and their output streams), and prints a short
	 * summary of the number of triples serialized by each.
	 */
	private static void closeSerializers() {
		Iterator<String> nameIterator = serializerNames.iterator();
		for (RdfSerializer serializer : serializers) {
			serializer.close();
			System.out.println("*** Finished serialization of "
					+ serializer.getTripleCount() + " RDF triples in file "
					+ nameIterator.next());
		}
	}

	/**
	 * Prints some basic documentation about this program.
	 */
	private static void printDocumentation() {
		System.out
				.println("********************************************************************");
		System.out.println("*** Wikidata Toolkit: Serialization Example");
		System.out.println("*** ");
		System.out
				.println("*** This program will download dumps from Wikidata and serialize the data in an RDF format.");
		System.out
				.println("*** Downloading may take some time initially. After that, files");
		System.out
				.println("*** are stored on disk and are used until newer dumps are available.");
		System.out
				.println("*** You can delete files manually when no longer needed (see");
		System.out
				.println("*** message below for the directory where dump files are found).");
		System.out
				.println("*** The output will be stored in the directory of the example.");
		System.out
				.println("********************************************************************");
	}

	/**
	 * Creates a separate thread for writing into the given output stream and
	 * returns a pipe output stream that can be used to pass data to this
	 * thread.
	 * <p>
	 * This code is inspired by
	 * http://stackoverflow.com/questions/12532073/gzipoutputstream
	 * -that-does-its-compression-in-a-separate-thread
	 * 
	 * @param outputStream
	 *            the stream to write to in the thread
	 * @return a new stream that data should be written to
	 * @throws IOException
	 *             if the pipes could not be created for some reason
	 */
	public static OutputStream asynchronousOutputStream(
			final OutputStream outputStream) throws IOException {
		final int SIZE = 1024 * 1024 * 10;
		final PipedOutputStream pos = new PipedOutputStream();
		final PipedInputStream pis = new PipedInputStream(pos, SIZE);

		new Thread(new Runnable() {
			@Override
			public void run() {
				try {
					byte[] bytes = new byte[SIZE];
					for (int len; (len = pis.read(bytes)) > 0;) {
						outputStream.write(bytes, 0, len);
					}
				} catch (IOException ioException) {
					ioException.printStackTrace();
				} finally {
					close(pis);
					close(outputStream);
				}
			}
		}, "async-output-stream").start();

		return pos;
	}

	/**
	 * Closes a Closeable and swallows any exceptions that might occur in the
	 * process.
	 * 
	 * @param closeable
	 *            the object to close; may be null
	 */
	static void close(Closeable closeable) {
		if (closeable != null) {
			try {
				closeable.close();
			} catch (IOException ignored) {
				// closing is best-effort here; errors are deliberately ignored
			}
		}
	}
}
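
/*
 * Usage sketch for asynchronousOutputStream() (an illustration, not part of
 * the original program; "example.nt.gz" is a made-up file name). Everything
 * written to the returned stream is handed to the background thread, which
 * forwards it to the compressor; closing the stream ends the pipe, and the
 * thread then closes the underlying stream:
 *
 * OutputStream out = RdfSerializationExample.asynchronousOutputStream(
 * 		new GzipCompressorOutputStream(new FileOutputStream("example.nt.gz")));
 * out.write("<a> <b> <c> .\n".getBytes("UTF-8"));
 * out.close();
 */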