org.wikidata.wdtk.dumpfiles.MwRevisionDumpFileProcessor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of wdtk-dumpfiles Show documentation
WDTK support for processing Wikibase dump files
There is a newer version: 0.16.0
package org.wikidata.wdtk.dumpfiles;

/*
 * #%L
 * Wikidata Toolkit Dump File Handling
 * %%
 * Copyright (C) 2014 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class processes MediaWiki dump files that contain lists of page
 * revisions in the specific XML format used by MediaWiki for exporting pages.
 * It reads the XML as a stream, extracts all revisions in document order and
 * forwards each of them to the revision processor that was passed to the
 * constructor.
 * <p>
 * Site information (site name, base URL and namespaces) is taken from the
 * {@code <siteinfo>} block of the dump and handed to the revision processor
 * as soon as that block has been read. A single {@link MwRevisionImpl} object
 * is reused for all revisions, so the object passed to the revision processor
 * is overwritten when the next revision is read.
 * <p>
 * Pages that do not match the expected format are logged and skipped; the
 * remaining pages of the dump are still processed.
 * 
 * @author Markus Kroetzsch
 * 
 */
public class MwRevisionDumpFileProcessor implements MwDumpFileProcessor {

	static final String E_MEDIAWIKI = "mediawiki";
	static final String E_SITEINFO = "siteinfo";
	static final String E_SITENAME = "sitename";
	static final String E_BASEURL = "base";
	static final String E_NAMESPACE = "namespace";
	static final String A_NSKEY = "key";

	static final String E_PAGE = "page";
	static final String E_PAGE_TITLE = "title";
	static final String E_PAGE_ID = "id";
	static final String E_PAGE_NAMESPACE = "ns";
	static final String E_PAGE_REVISION = "revision";
	static final String E_PAGE_REDIRECT = "redirect";

	static final String E_REV_ID = "id";
	static final String E_REV_PARENT_ID = "parentid";
	static final String E_REV_TIMESTAMP = "timestamp";
	static final String E_REV_COMMENT = "comment";
	static final String E_REV_MODEL = "model";
	static final String E_REV_TEXT = "text";
	static final String E_REV_CONTRIBUTOR = "contributor";
	static final String E_REV_FORMAT = "format";
	static final String E_REV_SHA1 = "sha1";
	static final String E_REV_MINOR = "minor";

	static final String E_CONTRIBUTOR_NAME = "username";
	static final String E_CONTRIBUTOR_ID = "id";
	static final String E_CONTRIBUTOR_IP = "ip";

	static final Logger logger = LoggerFactory
			.getLogger(MwRevisionDumpFileProcessor.class);

	/**
	 * Factory used to create one XML stream reader per processed dump file.
	 */
	final XMLInputFactory xmlFactory;
	/**
	 * Reader for the dump file that is currently being processed, or null
	 * before the first reader has been created.
	 */
	XMLStreamReader xmlReader;

	/**
	 * Map from integer namespace ids to namespace prefixes. Namespace strings
	 * do not include the final ":" used in MediaWiki to separate namespace
	 * prefixes from article titles. Moreover, the prefixes use spaces, not
	 * underscores as in MediaWiki URLs.
	 */
	final Map<Integer, String> namespaces;
	/**
	 * Name of the site as set in the dump file.
	 */
	String sitename = "";
	/**
	 * Base URL of the site as set in the dump file.
	 */
	String baseUrl = "";

	/**
	 * Object used to store data about the current revision. The same object
	 * is reused for every revision of every page.
	 */
	final MwRevisionImpl mwRevision;
	/**
	 * Object used to report all revisions to.
	 */
	final MwRevisionProcessor mwRevisionProcessor;

	/**
	 * Constructor.
	 * 
	 * @param mwRevisionProcessor
	 *            the revision processor to which all revisions will be reported
	 */
	public MwRevisionDumpFileProcessor(MwRevisionProcessor mwRevisionProcessor) {
		this.xmlFactory = XMLInputFactory.newInstance();
		// MediaWiki dumps need neither DTDs nor external entities. Switching
		// both off prevents XXE attacks when a dump comes from an untrusted
		// source.
		this.xmlFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
		this.xmlFactory.setProperty(
				XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

		this.namespaces = new HashMap<>();
		this.mwRevision = new MwRevisionImpl();
		this.mwRevisionProcessor = mwRevisionProcessor;
		reset();
	}

	/**
	 * Resets the internal state of the object by forgetting all namespaces
	 * that have been read so far. Note that
	 * {@link #processDumpFileContents(InputStream, MwDumpFile)} clears the
	 * namespaces, the site name and the base URL itself whenever it is called,
	 * so calling this method between dump files is not required.
	 * 
	 * This will not unregister the revision processor.
	 */
	public void reset() {
		this.namespaces.clear();
	}

	/**
	 * Processes the XML contents of one revision dump file. Site information
	 * gathered from a previously processed dump is discarded first. Problems
	 * with the XML or with the dump format are logged and end the processing
	 * of this file; they are not propagated to the caller. In either case, the
	 * revision processor is notified that processing has finished.
	 * 
	 * @param inputStream
	 *            stream from which the XML of the dump is read; it is not
	 *            closed by this method
	 * @param dumpFile
	 *            the dump file that is processed; used for log messages only
	 * @throws RuntimeException
	 *             if the XML reader could not be closed
	 */
	@Override
	public void processDumpFileContents(InputStream inputStream,
			MwDumpFile dumpFile) {

		logger.info("Processing revision dump file {}", dumpFile);

		this.namespaces.clear();
		this.sitename = "";
		this.baseUrl = "";

		this.xmlReader = null;

		try {
			this.xmlReader = this.xmlFactory.createXMLStreamReader(inputStream);
			processXmlMediawiki();
		} catch (XMLStreamException | MwDumpFormatException e) {
			// Hand the exception to the logger so the stack trace is kept.
			logger.error("Error processing revision dump file " + dumpFile
					+ ": " + e, e);
		} finally { // unfortunately, xmlReader does not implement AutoClosable
			if (this.xmlReader != null) {
				try {
					this.xmlReader.close();
				} catch (XMLStreamException e) {
					// Throwing from finally replaces any unchecked exception
					// that may currently be propagating.
					throw new RuntimeException(
							"Problem closing XML Reader. This hides an earlier exception.",
							e);
				}
			}
		}

		this.mwRevisionProcessor.finishRevisionProcessing();
	}

	/**
	 * Processes the current XML from the position of {@link #xmlReader} until
	 * no further events are available. {@code <siteinfo>} and {@code <page>}
	 * blocks are delegated to {@link #processXmlSiteinfo()} and
	 * {@link #tryProcessXmlPage()}; once the site information has been read,
	 * it is reported to the revision processor. Other start elements are
	 * ignored.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the XML is
	 *             malformed
	 * @throws MwDumpFormatException
	 *             if an end element other than {@code </mediawiki>} is found
	 *             at this level
	 */
	void processXmlMediawiki() throws XMLStreamException, MwDumpFormatException {

		while (this.xmlReader.hasNext()) {
			switch (this.xmlReader.getEventType()) {

			case XMLStreamConstants.START_ELEMENT:
				switch (this.xmlReader.getLocalName()) {
				case E_MEDIAWIKI:
					break;
				case E_SITEINFO:
					processXmlSiteinfo();
					this.mwRevisionProcessor.startRevisionProcessing(
							this.sitename, this.baseUrl, this.namespaces);
					break;
				case E_PAGE:
					tryProcessXmlPage();
					break;
				}
				// The sub-processors return while the reader is still on the
				// closing tag of their block; the next() below skips it.
				break;

			case XMLStreamConstants.END_ELEMENT:
				if (!E_MEDIAWIKI.equals(this.xmlReader.getLocalName())) {
					throw new MwDumpFormatException("Unexpected end element </"
							+ this.xmlReader.getLocalName() + ">.");
				}
				break;

			}

			this.xmlReader.next();
		}
	}

	/**
	 * Processes current XML starting from a {@code <siteinfo>} start tag up to
	 * the corresponding end tag. The site name, the base URL and the
	 * namespaces are stored in the according member fields. A namespace whose
	 * key attribute is missing or not an integer is logged and skipped. When
	 * the method returns normally, {@link #xmlReader} is positioned on the
	 * closing {@code </siteinfo>} tag, unless the XML ended before.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the XML is
	 *             malformed
	 */
	void processXmlSiteinfo() throws XMLStreamException {

		this.xmlReader.next(); // skip current start tag
		while (this.xmlReader.hasNext()) {
			switch (this.xmlReader.getEventType()) {

			case XMLStreamConstants.START_ELEMENT:
				switch (this.xmlReader.getLocalName()) {
				case E_SITENAME:
					this.sitename = this.xmlReader.getElementText();
					break;
				case E_NAMESPACE:
					// Attributes must be read before getElementText() moves
					// the reader to the end tag.
					String keyText = this.xmlReader.getAttributeValue(null,
							A_NSKEY);
					try {
						// Integer.valueOf also rejects a missing (null) key
						// with a NumberFormatException.
						Integer namespaceKey = Integer.valueOf(keyText);
						this.namespaces.put(namespaceKey,
								this.xmlReader.getElementText());
					} catch (NumberFormatException e) {
						logger.warn("Ignoring namespace with invalid key \""
								+ keyText + "\" in site information.");
					}
					break;
				case E_BASEURL:
					this.baseUrl = this.xmlReader.getElementText();
					break;
				}
				break;

			case XMLStreamConstants.END_ELEMENT:
				if (E_SITEINFO.equals(this.xmlReader.getLocalName())) {
					return;
				}
				break;

			}

			this.xmlReader.next();
		}
	}

	/**
	 * Tries to processes current XML starting from a {@code <page>} start tag
	 * up to the corresponding end tag using {@link #processXmlPage()}. If the
	 * page does not match the expected format, the error is logged and the
	 * reader is advanced to the next closing {@code </page>} tag so that the
	 * remaining page blocks can still be read.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML
	 */
	void tryProcessXmlPage() throws XMLStreamException {
		try {
			processXmlPage();
		} catch (MwDumpFormatException e) {
			logger.error(
					"Error when trying to process revision block for page \""
							+ this.mwRevision.getPrefixedTitle()
							+ "\" (namespace "
							+ this.mwRevision.getNamespace()
							+ ", id "
							+ this.mwRevision.getPageId()
							+ "): "
							+ e.toString(), e);

			logger.info("Trying to recover ...");
			while (this.xmlReader.hasNext()) {
				this.xmlReader.next();
				if (this.xmlReader.getEventType() == XMLStreamConstants.END_ELEMENT
						&& E_PAGE.equals(this.xmlReader.getLocalName())) {
					logger.info("... recovery successful. Continuing processing.");
					return;
				}
			}
			logger.error("Recovery failed. Could not process remaining XML.");
		}
	}

	/**
	 * Processes current XML starting from a {@code <page>} start tag up to the
	 * corresponding end tag. Page data is stored in {@link #mwRevision}, and
	 * each contained revision is handled by {@link #processXmlRevision()}.
	 * When the method returns normally, {@link #xmlReader} is positioned on
	 * the closing {@code </page>} tag, unless the XML ended before.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the XML is
	 *             malformed
	 * @throws MwDumpFormatException
	 *             if the contents of the XML file did not match our
	 *             expectations of a MediaWiki XML dump, e.g. because of an
	 *             unknown element or a non-numeric id
	 */
	void processXmlPage() throws XMLStreamException, MwDumpFormatException {

		this.mwRevision.resetCurrentPageData();

		this.xmlReader.next(); // skip current start tag
		while (this.xmlReader.hasNext()) {
			switch (this.xmlReader.getEventType()) {

			case XMLStreamConstants.START_ELEMENT:
				switch (this.xmlReader.getLocalName()) {
				case E_PAGE_TITLE:
					this.mwRevision.prefixedTitle = this.xmlReader
							.getElementText();
					break;
				case E_PAGE_NAMESPACE:
					this.mwRevision.namespace = readElementAsInt();
					break;
				case E_PAGE_ID:
					this.mwRevision.pageId = readElementAsInt();
					break;
				case E_PAGE_REVISION:
					processXmlRevision();
					break;
				case E_PAGE_REDIRECT:
					// redirects are recognised but not evaluated
					break;
				default:
					throw new MwDumpFormatException("Unexpected element \""
							+ this.xmlReader.getLocalName() + "\" in page.");
				}
				break;

			case XMLStreamConstants.END_ELEMENT:
				if (E_PAGE.equals(this.xmlReader.getLocalName())) {
					return;
				}
				break;
			}

			this.xmlReader.next();
		}
	}

	/**
	 * Processes current XML starting from a {@code <revision>} start tag up to
	 * the corresponding end tag. Revision data is stored in
	 * {@link #mwRevision}, which is passed to the revision processor once the
	 * closing tag is reached. When the method returns normally,
	 * {@link #xmlReader} is positioned on the closing {@code </revision>} tag,
	 * unless the XML ended before.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the XML is
	 *             malformed
	 * @throws MwDumpFormatException
	 *             if the contents of the XML file did not match our
	 *             expectations of a MediaWiki XML dump, e.g. because of an
	 *             unknown element or a non-numeric id
	 */
	void processXmlRevision() throws XMLStreamException, MwDumpFormatException {

		this.mwRevision.resetCurrentRevisionData();

		this.xmlReader.next(); // skip current start tag
		while (this.xmlReader.hasNext()) {
			switch (this.xmlReader.getEventType()) {

			case XMLStreamConstants.START_ELEMENT:
				switch (this.xmlReader.getLocalName()) {
				case E_REV_COMMENT:
					this.mwRevision.comment = this.xmlReader.getElementText();
					break;
				case E_REV_TEXT:
					this.mwRevision.text = this.xmlReader.getElementText();
					break;
				case E_REV_TIMESTAMP:
					this.mwRevision.timeStamp = this.xmlReader.getElementText();
					break;
				case E_REV_FORMAT:
					this.mwRevision.format = this.xmlReader.getElementText();
					break;
				case E_REV_MODEL:
					this.mwRevision.model = this.xmlReader.getElementText();
					break;
				case E_REV_CONTRIBUTOR:
					processXmlContributor();
					break;
				case E_REV_ID:
					this.mwRevision.revisionId = readElementAsLong();
					break;
				case E_REV_PARENT_ID:
					this.mwRevision.parentRevisionId = readElementAsLong();
					break;
				case E_REV_SHA1:
				case E_REV_MINOR:
					// known elements whose content is not used
					break;
				default:
					throw new MwDumpFormatException("Unexpected element \""
							+ this.xmlReader.getLocalName() + "\" in revision.");
				}

				break;

			case XMLStreamConstants.END_ELEMENT:
				if (E_PAGE_REVISION.equals(this.xmlReader.getLocalName())) {
					this.mwRevisionProcessor.processRevision(this.mwRevision);
					return;
				}
				break;
			}

			this.xmlReader.next();
		}
	}

	/**
	 * Processes current XML starting from a {@code <contributor>} start tag up
	 * to the corresponding end tag. The contributor's name and id are stored
	 * in {@link #mwRevision}; for contributors given by IP address, the
	 * address is used as the name and the id is set to -1. When the method
	 * returns normally, {@link #xmlReader} is positioned on the closing
	 * {@code </contributor>} tag, unless the XML ended before.
	 * 
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the XML is
	 *             malformed
	 * @throws MwDumpFormatException
	 *             if the contents of the XML file did not match our
	 *             expectations of a MediaWiki XML dump, e.g. because of an
	 *             unknown element or a non-numeric id
	 */
	void processXmlContributor() throws XMLStreamException,
			MwDumpFormatException {

		this.xmlReader.next(); // skip current start tag
		while (this.xmlReader.hasNext()) {
			switch (this.xmlReader.getEventType()) {

			case XMLStreamConstants.START_ELEMENT:
				switch (this.xmlReader.getLocalName()) {
				case E_CONTRIBUTOR_NAME:
					this.mwRevision.contributor = this.xmlReader
							.getElementText();
					break;
				case E_CONTRIBUTOR_ID:
					this.mwRevision.contributorId = readElementAsInt();
					break;
				case E_CONTRIBUTOR_IP:
					this.mwRevision.contributor = this.xmlReader
							.getElementText();
					this.mwRevision.contributorId = -1;
					break;
				default:
					throw new MwDumpFormatException("Unexpected element \""
							+ this.xmlReader.getLocalName()
							+ "\" in contributor.");
				}

				break;

			case XMLStreamConstants.END_ELEMENT:
				if (E_REV_CONTRIBUTOR.equals(this.xmlReader.getLocalName())) {
					return;
				}
				break;
			}

			this.xmlReader.next();
		}
	}

	/**
	 * Reads the text of the element that {@link #xmlReader} is currently
	 * positioned on and parses it as an int. Afterwards, the reader is
	 * positioned on the end tag of that element.
	 * 
	 * @return the parsed value
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the element
	 *             does not contain text only
	 * @throws MwDumpFormatException
	 *             if the text is not a valid int
	 */
	private int readElementAsInt() throws XMLStreamException,
			MwDumpFormatException {
		String elementName = this.xmlReader.getLocalName();
		String text = this.xmlReader.getElementText();
		try {
			return Integer.parseInt(text);
		} catch (NumberFormatException e) {
			// Report as a format problem so that page recovery can kick in.
			throw new MwDumpFormatException("Invalid integer value \"" + text
					+ "\" in element \"" + elementName + "\".");
		}
	}

	/**
	 * Reads the text of the element that {@link #xmlReader} is currently
	 * positioned on and parses it as a long. Afterwards, the reader is
	 * positioned on the end tag of that element.
	 * 
	 * @return the parsed value
	 * @throws XMLStreamException
	 *             if there was a problem reading the XML or if the element
	 *             does not contain text only
	 * @throws MwDumpFormatException
	 *             if the text is not a valid long
	 */
	private long readElementAsLong() throws XMLStreamException,
			MwDumpFormatException {
		String elementName = this.xmlReader.getLocalName();
		String text = this.xmlReader.getElementText();
		try {
			return Long.parseLong(text);
		} catch (NumberFormatException e) {
			// Report as a format problem so that page recovery can kick in.
			throw new MwDumpFormatException("Invalid integer value \"" + text
					+ "\" in element \"" + elementName + "\".");
		}
	}

}