All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.tudarmstadt.ukp.wikipedia.mwdumper.importer.XmlDumpReader Maven / Gradle / Ivy

/*
 * MediaWiki import/export processing tools
 * Copyright 2005 by Brion Vibber
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: XmlDumpReader.java 59325 2009-11-22 01:21:03Z rainman $
 */

package de.tudarmstadt.ukp.wikipedia.mwdumper.importer;

import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Map;
import java.util.TimeZone;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class XmlDumpReader  extends DefaultHandler {
	InputStream input;
	DumpWriter writer;
	
	private char[] buffer;
	private int len;
	private boolean hasContent = false;
	private boolean deleted = false;
	
	Siteinfo siteinfo;
	Page page;
	boolean pageSent;
	Contributor contrib;
	Revision rev;
	int nskey;
	
	boolean abortFlag;
	
	/**
	 * Initialize a processor for a MediaWiki XML dump stream.
	 * Events are sent to a single DumpWriter output sink, but you
	 * can chain multiple output processors with a MultiWriter.
	 * @param inputStream Stream to read XML from.
	 * @param writer Output sink to send processed events to.
	 */
	public XmlDumpReader(InputStream inputStream, DumpWriter writer) {
		input = inputStream;
		this.writer = writer;
		buffer = new char[4096];
		len = 0;
		hasContent = false;
	}
	
	/**
	 * Reads through the entire XML dump on the input stream, sending
	 * events to the DumpWriter as it goes. May throw exceptions on
	 * invalid input or due to problems with the output.
	 * @throws IOException
	 */
	public void readDump() throws IOException {
		try {
			SAXParserFactory factory = SAXParserFactory.newInstance();
			SAXParser parser = factory.newSAXParser();
	
			parser.parse(input, this);
		} catch (ParserConfigurationException e) {
			throw (IOException)new IOException(e.getMessage()).initCause(e);
		} catch (SAXException e) {
			throw (IOException)new IOException(e.getMessage()).initCause(e);
		}
		writer.close();
	}
	
	/**
	 * Request that the dump processing be aborted.
	 * At the next element, an exception will be thrown to stop the XML parser.
	 * @fixme Is setting a bool thread-safe? It should be atomic...
	 */
	public void abort() {
		abortFlag = true;
	}
	
	// --------------------------
	// SAX handler interface methods:
	
	private static final Map startElements = new HashMap(64);
	private static final Map endElements = new HashMap(64);
	static {
		startElements.put("revision","revision");
		startElements.put("contributor","contributor");
		startElements.put("page","page");
		startElements.put("mediawiki", "mediawiki");
		startElements.put("siteinfo","siteinfo");
		startElements.put("namespaces","namespaces");
		startElements.put("namespace","namespace");

		endElements.put("ThreadSubject","ThreadSubject");
		endElements.put("ThreadParent","ThreadParent");
		endElements.put("ThreadAncestor","ThreadAncestor");
		endElements.put("ThreadPage","ThreadPage");
		endElements.put("ThreadID","ThreadID");
		endElements.put("ThreadSummaryPage","ThreadSummaryPage");
		endElements.put("ThreadAuthor","ThreadAuthor");
		endElements.put("ThreadEditStatus","ThreadEditStatus");
		endElements.put("ThreadType","ThreadType");
		endElements.put("base","base");
		endElements.put("case","case");
		endElements.put("comment","comment");
		endElements.put("contributor","contributor");
		endElements.put("generator","generator");
		endElements.put("id","id");
		endElements.put("ip","ip");
		endElements.put("mediawiki", "mediawiki");
		endElements.put("minor","minor");
		endElements.put("namespaces","namespaces");
		endElements.put("namespace","namespace");
		endElements.put("page","page");
		endElements.put("restrictions","restrictions");
		endElements.put("revision","revision");
		endElements.put("siteinfo","siteinfo");
		endElements.put("sitename","sitename");
		endElements.put("text","text");
		endElements.put("timestamp","timestamp");
		endElements.put("title","title");
		endElements.put("username","username");
	}
	
	public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException {
		// Clear the buffer for character data; we'll initialize it
		// if and when character data arrives -- at that point we
		// have a length.
		len = 0;
		hasContent = false;
		
		if (abortFlag)
			throw new SAXException("XmlDumpReader set abort flag.");

		// check for deleted="deleted", and set deleted flag for the current element. 
		String d = attributes.getValue("deleted");
		deleted = (d!=null && d.equals("deleted")); 
		
		try {
			qName = (String)startElements.get(qName);
			if (qName == null)
				return;
			// frequent tags:
			if (qName == "revision") openRevision();
			else if (qName == "contributor") openContributor();
			else if (qName == "page") openPage();
			// rare tags:
			else if (qName == "mediawiki") openMediaWiki();
			else if (qName == "siteinfo") openSiteinfo();
			else if (qName == "namespaces") openNamespaces();
			else if (qName == "namespace") openNamespace(attributes);
		} catch (IOException e) {
			throw new SAXException(e);
		}
	}
	
	public void characters(char[] ch, int start, int length) {
		if (buffer.length < len + length) {
			int maxlen = buffer.length * 2;
			if (maxlen < len + length)
				maxlen = len + length;
			char[] tmp = new char[maxlen];
			System.arraycopy(buffer, 0, tmp, 0, len);
			buffer = tmp;
		}
		System.arraycopy(ch, start, buffer, len, length);
		len += length;
		hasContent = true;
	}
	
	public void endElement(String uri, String localname, String qName) throws SAXException {
		try {
			qName = (String)endElements.get(qName);
			if (qName == null)
				return;
			// frequent tags:
			if (qName == "id") readId();
			else if (qName == "revision") closeRevision();
			else if (qName == "timestamp") readTimestamp();
			else if (qName == "text") readText();
			else if (qName == "contributor") closeContributor();
			else if (qName == "username") readUsername();
			else if (qName == "ip") readIp();
			else if (qName == "comment") readComment();
			else if (qName == "minor") readMinor();
			else if (qName == "page") closePage();
			else if (qName == "title") readTitle();
			else if (qName == "restrictions") readRestrictions();
			// rare tags:
			else if (qName.startsWith("Thread")) threadAttribute(qName);
			else if (qName == "mediawiki") closeMediaWiki();
			else if (qName == "siteinfo") closeSiteinfo();
			else if (qName == "sitename") readSitename();
			else if (qName == "base") readBase();
			else if (qName == "generator") readGenerator();
			else if (qName == "case") readCase();
			else if (qName == "namespaces") closeNamespaces();
			else if (qName == "namespace") closeNamespace();
//			else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")");
		} catch (IOException e) {
			throw (SAXException)new SAXException(e.getMessage()).initCause(e);
		}
	}

	// ----------
	
	void threadAttribute(String attrib) throws IOException {
		if(attrib.equals("ThreadPage")) // parse title
			page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces));
		else
			page.DiscussionThreadingInfo.put(attrib, bufferContents());
	}
	
	void openMediaWiki() throws IOException {
		siteinfo = null;
		writer.writeStartWiki();
	}
	
	void closeMediaWiki() throws IOException {
		writer.writeEndWiki();
		siteinfo = null;
	}
	
	// ------------------
		
	void openSiteinfo() {
		siteinfo = new Siteinfo();
	}
	
	void closeSiteinfo() throws IOException {
		writer.writeSiteinfo(siteinfo);
	}

	private String bufferContentsOrNull() {
		if (!hasContent) return null;
		else return bufferContents();
	}
	
	private String bufferContents() {
		return len == 0 ? "" : new String(buffer, 0, len);
	}
	
	void readSitename() {
		siteinfo.Sitename = bufferContents();
	}
	
	void readBase() {
		siteinfo.Base = bufferContents();
	}
	
	void readGenerator() {
		siteinfo.Generator = bufferContents();
	}
	
	void readCase() {
		siteinfo.Case = bufferContents();
	}
	
	void openNamespaces() {
		siteinfo.Namespaces = new NamespaceSet();
	}
	
	void openNamespace(Attributes attribs) {
		nskey = Integer.parseInt(attribs.getValue("key"));
	}
	
	void closeNamespace() {
		siteinfo.Namespaces.add(nskey, bufferContents());
	}

	void closeNamespaces() {
		// NOP
	}
	
	// -----------
	
	void openPage() {
		page = new Page();
		pageSent = false;
	}
	
	void closePage() throws IOException {
		if (pageSent)
			writer.writeEndPage();
		page = null;
	}
	
	void readTitle() {
		page.Title = new Title(bufferContents(), siteinfo.Namespaces);
	}
	
	void readId() {
		int id = Integer.parseInt(bufferContents());
		if (contrib != null) 
			contrib.Id = id;
		else if (rev != null)
			rev.Id = id;
		else if (page != null)
			page.Id = id;
		else
			throw new IllegalArgumentException("Unexpected  outside a , , or ");
	}
	
	void readRestrictions() {
		page.Restrictions = bufferContents();
	}
	
	// ------
	
	void openRevision() throws IOException {
		if (!pageSent) {
			writer.writeStartPage(page);
			pageSent = true;
		}
		
		rev = new Revision();
	}
	
	void closeRevision() throws IOException {
		writer.writeRevision(rev);
		rev = null;
	}

	void readTimestamp() {
		rev.Timestamp = parseUTCTimestamp(bufferContents());
	}

	void readComment() {
		rev.Comment = bufferContentsOrNull();
		if (rev.Comment==null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed
	}

	void readMinor() {
		rev.Minor = true;
	}

	void readText() {
		rev.Text = bufferContentsOrNull();
		if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
	}
	
	// -----------
	void openContributor() {
		//XXX: record deleted flag?! as it is, any empty  tag counts as "deleted"
		contrib =  new Contributor();
	}
	
	void closeContributor() {
		//NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object
		rev.Contributor = contrib;
		contrib = null;
	}


	void readUsername() {
		contrib.Username = bufferContentsOrNull();
	}
	
	void readIp() {
		contrib.Username = bufferContents();
		contrib.isIP = true;
	}
	
	private static final TimeZone utc = TimeZone.getTimeZone("UTC");
	private static Calendar parseUTCTimestamp(String text) {
		// 2003-10-26T04:50:47Z
		// We're doing this manually for now, though DateFormatter might work...
		String trimmed = text.trim();
		GregorianCalendar ts = new GregorianCalendar(utc);
		ts.set(
			Integer.parseInt(trimmed.substring(0,0+4)),     // year
			Integer.parseInt(trimmed.substring(5,5+2)) - 1, // month is 0-based!
			Integer.parseInt(trimmed.substring(8,8+2)),     // day
			Integer.parseInt(trimmed.substring(11,11+2)),   // hour
			Integer.parseInt(trimmed.substring(14,14+2)),   // minute
			Integer.parseInt(trimmed.substring(17,17+2)));  // second
		return ts;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy