All downloads are free. Search and download functionality uses the official Maven repository.

org.dkpro.jwpl.mwdumper.importer.XmlDumpReader Maven / Gradle / Ivy

The newest version!
/*
 * MediaWiki import/export processing tools
 * Copyright 2005 by Brion Vibber
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: XmlDumpReader.java 59325 2009-11-22 01:21:03Z rainman $
 */

package org.dkpro.jwpl.mwdumper.importer;

import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Map;
import java.util.TimeZone;

import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that reads a MediaWiki XML dump from an input stream and forwards the parsed
 * site info, pages and revisions to a {@link DumpWriter}.
 *
 * <p>
 * Character data is accumulated in an internal buffer that is reset at every start tag and
 * consumed by the matching end-tag handler. An instance keeps per-parse state in its fields and
 * is therefore meant to be driven by a single parsing thread; only {@link #abort()} is intended
 * to be called from another thread.
 */
public class XmlDumpReader
    extends DefaultHandler
{
    InputStream input;
    DumpWriter writer;

    /** Character data collected since the most recent start tag. */
    private char[] buffer;
    /** Number of valid chars in {@link #buffer}. */
    private int len;
    /** Whether any {@code characters()} event arrived since the most recent start tag. */
    private boolean hasContent;
    /** Whether the most recent start tag carried {@code deleted="deleted"}. */
    private boolean deleted = false;

    Siteinfo siteinfo;
    Page page;
    /** Whether {@code writeStartPage} has already been emitted for the current page. */
    boolean pageSent;
    Contributor contrib;
    Revision rev;
    /** Key attribute of the namespace element currently being read. */
    int nskey;

    /** Set by {@link #abort()}; volatile so the parsing thread sees the write. */
    volatile boolean abortFlag;

    /**
     * Initialize a processor for a MediaWiki XML dump stream. Events are sent to a single
     * DumpWriter output sink, but you can chain multiple output processors with a MultiWriter.
     *
     * @param inputStream
     *            Stream to read XML from.
     * @param writer
     *            Output sink to send processed events to.
     */
    public XmlDumpReader(InputStream inputStream, DumpWriter writer)
    {
        input = inputStream;
        this.writer = writer;
        buffer = new char[4096];
        len = 0;
        hasContent = false;
    }

    /**
     * Reads through the entire XML dump on the input stream, sending events to the DumpWriter as it
     * goes. The writer is closed only after the whole stream has been parsed successfully; if
     * parsing fails, the exception propagates and the writer is left open.
     *
     * @throws IOException
     *             if reading or writing fails, or if the parser cannot be configured or reports
     *             invalid input (the parser exception is attached as the cause).
     */
    public void readDump() throws IOException
    {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            // NOTE(review): secure processing is explicitly switched off here -- presumably to
            // lift the JAXP processing limits for very large dumps; confirm. With it disabled the
            // input should come from a trusted source.
            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, false);
            SAXParser parser = factory.newSAXParser();

            parser.parse(input, this);
        }
        catch (ParserConfigurationException | SAXException e) {
            throw new IOException(e.getMessage(), e);
        }
        writer.close();
    }

    /**
     * Request that the dump processing be aborted. At the next start tag, an exception will be
     * thrown to stop the XML parser. May be called from a thread other than the parsing thread:
     * the flag is volatile, so the request becomes visible to the parser.
     */
    public void abort()
    {
        abortFlag = true;
    }

    // --------------------------
    // SAX handler interface methods:

    /**
     * Handles a start tag: resets the character buffer, honours a pending abort request, records
     * the {@code deleted="deleted"} marker and dispatches the structural elements. Elements that
     * are not listed below are ignored here.
     *
     * @throws SAXException
     *             if an abort was requested or the writer reported an I/O error.
     */
    @Override
    public void startElement(String uri, String localname, String qName, Attributes attributes)
        throws SAXException
    {
        // Clear the buffer for character data; content (if any) is appended
        // by characters() as it arrives.
        len = 0;
        hasContent = false;

        if (abortFlag) {
            throw new SAXException("XmlDumpReader set abort flag.");
        }

        // check for deleted="deleted", and set deleted flag for the current element.
        deleted = "deleted".equals(attributes.getValue("deleted"));

        if (qName == null) {
            return;
        }

        try {
            switch (qName) {
            // frequent tags:
            case "revision":
                openRevision();
                break;
            case "contributor":
                openContributor();
                break;
            case "page":
                openPage();
                break;
            // rare tags:
            case "mediawiki":
                openMediaWiki();
                break;
            case "siteinfo":
                openSiteinfo();
                break;
            case "namespaces":
                openNamespaces();
                break;
            case "namespace":
                openNamespace(attributes);
                break;
            default:
                // not an element we act on at its start tag
                break;
            }
        }
        catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Appends a chunk of character data to the buffer, growing it when necessary. The buffer at
     * least doubles on growth so repeated small chunks do not cause repeated reallocation.
     */
    @Override
    public void characters(char[] ch, int start, int length)
    {
        int required = len + length;
        if (buffer.length < required) {
            char[] grown = new char[Math.max(buffer.length * 2, required)];
            System.arraycopy(buffer, 0, grown, 0, len);
            buffer = grown;
        }
        System.arraycopy(ch, start, buffer, len, length);
        len = required;
        hasContent = true;
    }

    /**
     * Handles an end tag by dispatching to the reader or closer for that element. Unknown
     * elements are ignored.
     *
     * @throws SAXException
     *             if the writer reported an I/O error (attached as the cause).
     */
    @Override
    public void endElement(String uri, String localname, String qName) throws SAXException
    {
        if (qName == null) {
            return;
        }

        try {
            switch (qName) {
            // frequent tags:
            case "id":
                readId();
                break;
            case "revision":
                closeRevision();
                break;
            case "timestamp":
                readTimestamp();
                break;
            case "text":
                readText();
                break;
            case "contributor":
                closeContributor();
                break;
            case "username":
                readUsername();
                break;
            case "ip":
                readIp();
                break;
            case "comment":
                readComment();
                break;
            case "minor":
                readMinor();
                break;
            case "page":
                closePage();
                break;
            case "title":
                readTitle();
                break;
            case "restrictions":
                readRestrictions();
                break;
            // rare tags:
            case "ThreadSubject":
            case "ThreadParent":
            case "ThreadAncestor":
            case "ThreadPage":
            case "ThreadID":
            case "ThreadSummaryPage":
            case "ThreadAuthor":
            case "ThreadEditStatus":
            case "ThreadType":
                threadAttribute(qName);
                break;
            case "mediawiki":
                closeMediaWiki();
                break;
            case "siteinfo":
                closeSiteinfo();
                break;
            case "sitename":
                readSitename();
                break;
            case "base":
                readBase();
                break;
            case "generator":
                readGenerator();
                break;
            case "case":
                readCase();
                break;
            case "namespaces":
                closeNamespaces();
                break;
            case "namespace":
                closeNamespace();
                break;
            default:
                // not an element we act on at its end tag
                break;
            }
        }
        catch (IOException e) {
            // Hand the cause to the constructor so it is exposed via getException()/getCause().
            throw new SAXException(e.getMessage(), e);
        }
    }

    // ----------

    /**
     * Stores the buffered content of a {@code Thread*} element on the current page, keyed by the
     * element name. The content of {@code ThreadPage} is parsed into a {@link Title}; all other
     * values are stored as plain strings.
     */
    void threadAttribute(String attrib) throws IOException
    {
        String content = bufferContents();
        if (attrib.equals("ThreadPage")) {
            // parse title
            page.DiscussionThreadingInfo.put(attrib, new Title(content, siteinfo.Namespaces));
        }
        else {
            page.DiscussionThreadingInfo.put(attrib, content);
        }
    }

    void openMediaWiki() throws IOException
    {
        siteinfo = null;
        writer.writeStartWiki();
    }

    void closeMediaWiki() throws IOException
    {
        writer.writeEndWiki();
        siteinfo = null;
    }

    // ------------------

    void openSiteinfo()
    {
        siteinfo = new Siteinfo();
    }

    void closeSiteinfo() throws IOException
    {
        writer.writeSiteinfo(siteinfo);
    }

    /**
     * Returns the buffered character data, or {@code null} if no {@code characters()} event
     * arrived since the most recent start tag.
     */
    private String bufferContentsOrNull()
    {
        return hasContent ? bufferContents() : null;
    }

    /** Returns the buffered character data; the empty string if nothing was buffered. */
    private String bufferContents()
    {
        return len == 0 ? "" : new String(buffer, 0, len);
    }

    void readSitename()
    {
        siteinfo.Sitename = bufferContents();
    }

    void readBase()
    {
        siteinfo.Base = bufferContents();
    }

    void readGenerator()
    {
        siteinfo.Generator = bufferContents();
    }

    void readCase()
    {
        siteinfo.Case = bufferContents();
    }

    void openNamespaces()
    {
        siteinfo.Namespaces = new NamespaceSet();
    }

    /** Remembers the numeric {@code key} attribute until the namespace's name has been read. */
    void openNamespace(Attributes attribs)
    {
        nskey = Integer.parseInt(attribs.getValue("key"));
    }

    void closeNamespace()
    {
        siteinfo.Namespaces.add(nskey, bufferContents());
    }

    void closeNamespaces()
    {
        // NOP
    }

    // -----------

    void openPage()
    {
        page = new Page();
        pageSent = false;
    }

    /** Ends the page on the writer, but only if its start was emitted (i.e. it had a revision). */
    void closePage() throws IOException
    {
        if (pageSent) {
            writer.writeEndPage();
        }
        page = null;
    }

    void readTitle()
    {
        page.Title = new Title(bufferContents(), siteinfo.Namespaces);
    }

    /**
     * Assigns the buffered id to the innermost open object: the contributor if one is open,
     * otherwise the revision, otherwise the page.
     *
     * @throws IllegalArgumentException
     *             if the content is not an integer, or no contributor, revision or page is open.
     */
    void readId()
    {
        int id = Integer.parseInt(bufferContents());
        if (contrib != null) {
            contrib.Id = id;
        }
        else if (rev != null) {
            rev.Id = id;
        }
        else if (page != null) {
            page.Id = id;
        }
        else {
            throw new IllegalArgumentException(
                    "Unexpected <id> outside a <page>, <revision>, or <contributor>");
        }
    }

    void readRestrictions()
    {
        page.Restrictions = bufferContents();
    }

    // ------

    /**
     * Starts a new revision. The enclosing page is announced to the writer lazily, just before
     * its first revision.
     */
    void openRevision() throws IOException
    {
        if (!pageSent) {
            writer.writeStartPage(page);
            pageSent = true;
        }

        rev = new Revision();
    }

    void closeRevision() throws IOException
    {
        writer.writeRevision(rev);
        rev = null;
    }

    void readTimestamp()
    {
        rev.Timestamp = parseUTCTimestamp(bufferContents());
    }

    void readComment()
    {
        String comment = bufferContentsOrNull();
        if (comment == null && !deleted) {
            comment = ""; // NOTE: null means deleted/suppressed
        }
        rev.Comment = comment;
    }

    void readMinor()
    {
        rev.Minor = true;
    }

    void readText()
    {
        String text = bufferContentsOrNull();
        if (text == null && !deleted) {
            text = ""; // NOTE: null means deleted/suppressed
        }
        rev.Text = text;
    }

    // -----------
    void openContributor()
    {
        // XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted"
        contrib = new Contributor();
    }

    void closeContributor()
    {
        // NOTE: if the contributor was suppressed, neither username nor id have been set in the
        // Contributor object
        rev.Contributor = contrib;
        contrib = null;
    }

    void readUsername()
    {
        contrib.Username = bufferContentsOrNull();
    }

    void readIp()
    {
        contrib.Username = bufferContents();
        contrib.isIP = true;
    }

    private static final TimeZone utc = TimeZone.getTimeZone("UTC");

    /**
     * Parses a timestamp of the form {@code 2003-10-26T04:50:47Z} into a UTC calendar. Fields are
     * read from fixed character positions after trimming surrounding whitespace.
     *
     * @param text
     *            the timestamp text.
     * @return a calendar in UTC set to the given second, with the millisecond field at zero.
     */
    private static Calendar parseUTCTimestamp(String text)
    {
        // 2003-10-26T04:50:47Z
        // We're doing this manually for now, though DateFormatter might work...
        String trimmed = text.trim();
        int year = Integer.parseInt(trimmed.substring(0, 0 + 4));
        int month = Integer.parseInt(trimmed.substring(5, 5 + 2)) - 1; // month is 0-based!
        int day = Integer.parseInt(trimmed.substring(8, 8 + 2));
        int hour = Integer.parseInt(trimmed.substring(11, 11 + 2));
        int minute = Integer.parseInt(trimmed.substring(14, 14 + 2));
        int second = Integer.parseInt(trimmed.substring(17, 17 + 2));

        GregorianCalendar ts = new GregorianCalendar(utc);
        // A new calendar starts at the current instant and set(...) below leaves the other
        // fields untouched; clear first so the milliseconds do not leak into the result.
        ts.clear();
        ts.set(year, month, day, hour, minute, second);
        return ts;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy