// com.marklogic.mapreduce.examples.WikiLoader -- source listing taken from an artifact page (Maven / Gradle / Ivy).

/*
 * Copyright 2003-2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce.examples;

import info.bliki.wiki.model.WikiModel;

import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.GenericOptionsParser;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;

import com.marklogic.cpox.SimpleLogger;
import com.marklogic.cpox.Utilities;
import com.marklogic.mapreduce.ContentOutputFormat;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.xcc.Session;

/**
 * Load wiki documents from HDFS into MarkLogic Server.
 * Used with the configuration file conf/marklogic-wiki.xml.
 */

public class WikiLoader {
    public static class ArticleMapper 
    extends Mapper {
        
        private DocumentURI uri = new DocumentURI();
        
        public void map(Text path, Text page, Context context) 
        throws IOException, InterruptedException {
            uri.setUri(path.toString());
            context.write(uri, page);
        }
    }
    
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length < 2) {
            System.err.println("Usage: WikiLoader configFile inputDir");
            System.exit(2);
        }
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       
        Job job = Job.getInstance(conf, "wiki loader");
        job.setJarByClass(WikiLoader.class);
        job.setInputFormatClass(WikiInputFormat.class);
        job.setMapperClass(ArticleMapper.class);
        job.setMapOutputKeyClass(DocumentURI.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(ContentOutputFormat.class);
        
        ContentInputFormat.setInputPaths(job, new Path(otherArgs[1]));

        conf = job.getConfiguration();
        conf.addResource(otherArgs[0]);
         
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * File input format that hands every split to a {@link WikiReader}, which
 * produces {@link Text} keys and {@link Text} values.
 */
class WikiInputFormat extends FileInputFormat<Text, Text> {

    /**
     * Always allows input files to be split; the reader created by
     * {@link #createRecordReader} is responsible for finding page boundaries
     * inside whatever byte range it is given.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return true;
    }
    
    /**
     * Creates a fresh, uninitialized {@link WikiReader}; neither argument is
     * inspected here.
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new WikiReader();
    }
    
}

class Article {
    String title;
    StringBuilder pageContent;
    
    public Article(String title, StringBuilder pageContent) {
        this.title = title;
        this.pageContent = pageContent;
    }
}

/**
 * Record reader that turns one byte-range split of a wiki XML export into
 * (document URI, page XML) records.
 * <p>
 * All the work happens in {@code initialize}: the pages that start inside
 * the split are read into memory, wrapped in a synthetic document header and
 * footer, and parsed by {@link WikiModelProcessor}. {@link #nextKeyValue()}
 * then simply walks the resulting list.
 * <p>
 * NOTE(review): the markup inside the string constants of this class
 * (page/document tags, synthetic header, validation wrapper) was empty in the
 * listing this file was taken from and has been reconstructed from the
 * element names the parser code looks for -- confirm against the upstream
 * source.
 */
class WikiReader extends RecordReader<Text, Text> {

    /** Spare room allocated past the split for read-ahead; also the minimum buffer growth step. */
    static final int BUFFER_SIZE = 65536;
    /** Number of bytes requested per read once the end of the split has been passed. */
    static final int READ_AHEAD_SIZE = 2048;
    /** Start tag that marks the beginning of a page. */
    static final String BEGIN_PAGE_TAG = "<page>";
    /** End tag that closes a page. */
    static final String END_PAGE_TAG = "</page>";
    /** End tag of the whole export document. */
    static final String END_DOC_TAG = "</mediawiki>";
    private final Text key = new Text();
    private final Text value = new Text();
    /** Pages parsed from the split; null until initialize has run. */
    private List<Article> articles;
    /** Number of articles already handed out by nextKeyValue. */
    private int recordCount = 0;
    
    public WikiReader() {
    }

    /** Nothing to release: the input stream is closed inside initialize. */
    @Override
    public void close() throws IOException {
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * @return fraction of the parsed articles already returned, or 0 when
     *         there are none
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (articles == null || articles.isEmpty()) {
            return 0;
        }
        return recordCount / (float)articles.size();
    }

    /**
     * Reads and parses every page whose start tag begins inside the split.
     *
     * @param inSplit must be a {@link FileSplit}
     * @param context supplies the configuration used to open the file
     * @throws IOException if the file cannot be read or the split is too
     *         large to hold in memory
     */
    @Override
    public void initialize(InputSplit inSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit)inSplit;
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        String pages;
        FSDataInputStream fileIn = fs.open(file);
        try {
            fileIn.seek(split.getStart());
            pages = readPages(fileIn, split.getLength());
        } finally {
            // close the stream even when reading fails
            fileIn.close();
        }
        recordCount = 0;
        if (pages.isEmpty()) {
            articles = new ArrayList<Article>();
        } else {
            articles = WikiModelProcessor.process(new StringBuilder(pages));
        }
    }

    /**
     * Collects the text of all pages owned by the split that starts at the
     * current position of {@code in}. A page is owned by the split in which
     * the first byte of its start tag lies; the stream is read past the end
     * of the split as far as needed to complete the last such page.
     * <p>
     * The search is done on raw bytes and the result is decoded as UTF-8 in
     * one go, so multi-byte characters and tags that straddle a read
     * boundary are handled correctly.
     *
     * @param in          stream positioned at the first byte of the split
     * @param splitLength length of the split in bytes
     * @return the pages from the first start tag to the last end tag, or an
     *         empty string when no complete page is owned by the split
     */
    private static String readPages(FSDataInputStream in, long splitLength)
            throws IOException {
        if (splitLength < 0
                || splitLength > Integer.MAX_VALUE - 2L * BUFFER_SIZE) {
            throw new IOException("cannot buffer a split of " + splitLength
                    + " bytes");
        }
        byte[] beginTag = BEGIN_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
        byte[] endTag = END_PAGE_TAG.getBytes(StandardCharsets.UTF_8);

        byte[] data = new byte[(int)splitLength + BUFFER_SIZE];
        int splitEnd = readUpTo(in, data, 0, (int)splitLength);
        if (splitEnd < splitLength) {
            System.out.println("Unexpected EOF: bytesTotal=" + splitLength
                    + " bytesRead=" + splitEnd);
        }
        // Peek just far enough past the split to recognise a start tag that
        // begins inside the split but ends outside of it.
        int size = splitEnd + readUpTo(in, data, splitEnd, beginTag.length - 1);

        int firstBegin = indexOf(data, size, beginTag, 0);
        if (firstBegin < 0 || firstBegin >= splitEnd) {
            // no page starts in this split
            return "";
        }
        int lastBegin = lastIndexOf(data, size, beginTag, splitEnd - 1);

        // Find the end tag of the last owned page, reading ahead as needed.
        int searchFrom = lastBegin + beginTag.length;
        int end = indexOf(data, size, endTag, searchFrom);
        while (end < 0) {
            // Resume a little before the unsearched tail so that an end tag
            // cut in two by the previous read is still found.
            searchFrom = Math.max(searchFrom, size - endTag.length + 1);
            if (size == data.length) {
                data = grow(data);
            }
            int read = readUpTo(in, data, size,
                    Math.min(READ_AHEAD_SIZE, data.length - size));
            if (read == 0) {
                // The file ended inside the last page: keep only the pages
                // that are complete.
                System.out.println("Unexpected EOF: bytesTotal=" + splitLength
                        + " bytesRead=" + size);
                end = lastIndexOf(data, size, endTag,
                        lastBegin - endTag.length);
                if (end < firstBegin) {
                    return "";
                }
                break;
            }
            size += read;
            end = indexOf(data, size, endTag, searchFrom);
        }
        return new String(data, firstBegin, end + endTag.length - firstBegin,
                StandardCharsets.UTF_8);
    }

    /**
     * Reads until {@code count} bytes have been stored or the stream ends.
     *
     * @return the number of bytes actually read, possibly 0
     */
    private static int readUpTo(FSDataInputStream in, byte[] dest, int offset,
            int count) throws IOException {
        int total = 0;
        while (total < count) {
            int read = in.read(dest, offset + total, count - total);
            if (read == -1) {
                break;
            }
            total += read;
        }
        return total;
    }

    /** Returns a larger copy of {@code data}: a quarter bigger, at least BUFFER_SIZE. */
    private static byte[] grow(byte[] data) throws IOException {
        long wanted = (long)data.length
                + Math.max(data.length >> 2, BUFFER_SIZE);
        if (wanted > Integer.MAX_VALUE - 8) {
            throw new IOException(
                    "page runs too far past the end of the split to buffer");
        }
        return Arrays.copyOf(data, (int)wanted);
    }

    /** First index at or after {@code from} where {@code tag} occurs in data[0, size), or -1. */
    private static int indexOf(byte[] data, int size, byte[] tag, int from) {
        for (int i = Math.max(from, 0); i <= size - tag.length; i++) {
            if (matchesAt(data, i, tag)) {
                return i;
            }
        }
        return -1;
    }

    /** Last index at or before {@code from} where {@code tag} occurs in data[0, size), or -1. */
    private static int lastIndexOf(byte[] data, int size, byte[] tag,
            int from) {
        for (int i = Math.min(from, size - tag.length); i >= 0; i--) {
            if (matchesAt(data, i, tag)) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether {@code tag} occurs in {@code data} starting at {@code pos}. */
    private static boolean matchesAt(byte[] data, int pos, byte[] tag) {
        for (int j = 0; j < tag.length; j++) {
            if (data[pos + j] != tag[j]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Advances to the next parsed article, copying its title into the key
     * and its content into the value.
     *
     * @return false once every article has been returned
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (articles != null && articles.size() > recordCount) {
            Article article = articles.get(recordCount);
            key.set(article.title);
            value.set(article.pageContent.toString());
            recordCount++;
            return true;
        }
        return false;
    }

    /**
     * Pull-parses a wiki export document and rebuilds each page element as a
     * standalone XML string, rendering the wiki markup found in text and
     * comment elements.
     * <p>
     * All parsing state lives in static fields, so {@link #process} must not
     * be called from several threads at once.
     */
    static class  WikiModelProcessor {
        /** Element whose text becomes the page title. */
        private static final String TITLE = "title";

        /** Element that delimits one article. */
        private static final String PAGE = "page";

        /** Document element; carries the namespace and language copied onto every page. */
        private static final String ROOT = "mediawiki";

        private static final String NS_XML = "http://www.w3.org/XML/1998/namespace";

        /** When true the title part of the document URI is URI-escaped. */
        private static final boolean ENCODE_TITLE = false;
        
        // NOTE(review): the element markup of this header was missing from
        // the listing and has been reconstructed; the export schema version
        // and the namespace keys in particular need confirming.
        private static final String HEADER = 
            "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.4/\""
            + " version=\"0.4\" xml:lang=\"en\"> \n" +
            "  <siteinfo> \n" +
            "    <sitename>Wikipedia</sitename> \n" +
            "    <base>http://en.wikipedia.org/wiki/Main_Page</base> \n" +
            "    <generator>MediaWiki 1.16alpha-wmf</generator> \n" +
            "    <case>first-letter</case> \n" +
            "    <namespaces> \n" +
            "      <namespace key=\"-2\">Media</namespace> \n" +
            "      <namespace key=\"-1\">Special</namespace> \n" +
            "      <namespace key=\"0\" /> \n" +
            "      <namespace key=\"1\">Talk</namespace> \n" +
            "      <namespace key=\"2\">User</namespace> \n" +
            "      <namespace key=\"3\">User talk</namespace> \n" +
            "      <namespace key=\"4\">Wikipedia</namespace> \n" +
            "      <namespace key=\"5\">Wikipedia talk</namespace> \n" +
            "      <namespace key=\"6\">File</namespace> \n" +
            "      <namespace key=\"7\">File talk</namespace> \n" +
            "      <namespace key=\"8\">MediaWiki</namespace> \n" +
            "      <namespace key=\"9\">MediaWiki talk</namespace> \n" +
            "      <namespace key=\"10\">Template</namespace> \n" +
            "      <namespace key=\"11\">Template talk</namespace> \n" +
            "      <namespace key=\"12\">Help</namespace> \n" +
            "      <namespace key=\"13\">Help talk</namespace> \n" +
            "      <namespace key=\"14\">Category</namespace> \n" +
            "      <namespace key=\"15\">Category talk</namespace> \n" +
            "      <namespace key=\"100\">Portal</namespace> \n" +
            "      <namespace key=\"101\">Portal talk</namespace> \n" +
            "    </namespaces> \n" +
            "  </siteinfo> \n";
        
        private static final String FOOTER = "\n" + END_DOC_TAG;

        /** Names of the elements currently open, outermost first. */
        private static LinkedList<String> path;

        /** Markup of the page being rebuilt; null while outside a page. */
        private static StringBuilder article;

        /** Most recently seen title. */
        private static String title;

        private static XmlPullParser xpp;

        static SimpleLogger logger = SimpleLogger.getSimpleLogger();

        private static int errors = 0;

        private static int pages = 0;

        private static String namespace;

        private static String language;

        private static XmlPullParserFactory factory;
        
        /** Articles completed during the current run. */
        private static List<Article> articles;

        /**
         * Wraps the given pages in the synthetic document header and footer
         * (modifying {@code input} in place) and parses the result.
         * <p>
         * Parse failures are logged rather than thrown; the articles
         * completed before the failure are still returned.
         *
         * @param input concatenated page elements
         * @return the articles found, never null
         */
        public static List<Article> process(StringBuilder input) {
            // Start from a clean slate: the state is static and would
            // otherwise leak articles from an earlier split into this one.
            articles = new ArrayList<Article>();
            pages = 0;
            errors = 0;
            namespace = null;
            language = null;

            input.insert(0, HEADER);
            input.append(FOOTER);
            try {
                factory = XmlPullParserFactory.newInstance(null, null);
                factory.setNamespaceAware(true);
                xpp = factory.newPullParser();
                xpp.setInput(new StringReader(input.toString()));
    
                // TODO feature isn't supported by xpp3 - look at xpp5?
                // xpp.setFeature(XmlPullParser.FEATURE_DETECT_ENCODING, true);
                // TODO feature isn't supported by xpp3 - look at xpp5?
                // xpp.setFeature(XmlPullParser.FEATURE_PROCESS_DOCDECL, true);
                xpp.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
    
                logger.configureLogger(new Properties());
    
                process();
            } catch (Exception ex) {
                // deliberate best effort: keep whatever was parsed so far
                logger.logException(ex);
            }
            logger.info("finished " + pages + " pages with " + errors
                    + " errors");
            return articles;
        }

        /**
         * Main event loop: dispatches parser events until the end of the
         * document.
         *
         * @throws IOException on an unexpected event or unbalanced document
         * @throws XmlPullParserException if the input is not well-formed
         */
        private static void process() throws XmlPullParserException,
        IOException {
            path = new LinkedList<String>();
            article = null;
            title = null;

            logger.info("starting loop");

            while (true) {
                int event = xpp.next();
                switch (event) {
                case XmlPullParser.END_DOCUMENT:
                    processEndDocument();
                    // exit the loop
                    return;
                case XmlPullParser.END_TAG:
                    processEndElement(xpp.getName());
                    break;
                case XmlPullParser.START_TAG:
                    processStartElement(xpp.getName());
                    break;
                case XmlPullParser.TEXT:
                    if (null != article) {
                        String name = path.getLast();
                        if ("comment".equals(name) || "text".equals(name)) {
                            // comment and text elements hold wiki markup
                            article.append(parse(xpp.getText()));
                        } else {
                            article.append(Utilities.escapeXml(xpp.getText()));
                        }
                    }
                    break;
                default:
                    throw new IOException("unexpected event: " + event
                            + " at " + xpp.getPositionDescription());
                }
            }
        }

        /**
         * Renders wiki markup and checks that the result is well-formed XML
         * made of acceptable characters.
         *
         * @param text wiki markup, may be null
         * @return the rendered markup; the input itself when it is blank; an
         *         empty string when there is nothing to output; or the
         *         XML-escaped input when the rendered form fails the check.
         *         Never null, because the caller appends the result to a
         *         StringBuilder, where null would show up as the text "null".
         * @throws IOException on an unexpected parser event
         */
        private static String parse(String text) throws IOException {
            if (null == text) {
                return "";
            }
            if ("".equals(text.trim())) {
                return text;
            }
            // parse wiki markup to xml
            // TODO: this is slow with bliki - might need concurrency
            // use a new object every time, to prevent leaks
            // no doubt this makes it slower...
            String xml = new WikiModel("${image}", "${title}").render(text);

            if (null == xml) {
                return "";
            }
            if ("".equals(xml.trim())) {
                return xml;
            }

            // verify xml is well-formed
            try {
                // A separate parser checks the renderer output. The output is
                // a fragment, so it is wrapped in a throw-away element.
                // NOTE(review): the wrapper element name was missing from the
                // listing; any name serves the purpose.
                XmlPullParser parser = factory.newPullParser();
                parser.setInput(new StringReader("<root>" + xml + "</root>"));
                parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES,
                        true);
                while (true) {
                    int event;
                    // with some Japanese text, next() throws
                    // ArrayIndexOutOfBoundsException
                    try {
                        event = parser.next();
                    } catch (ArrayIndexOutOfBoundsException e) {
                        throw new XmlPullParserException(e.getMessage(),
                                parser, null);
                    }
                    switch (event) {
                    case XmlPullParser.END_DOCUMENT:
                        // exit the loop
                        return xml;
                    case XmlPullParser.END_TAG:
                    case XmlPullParser.START_TAG:
                        // reaching the tag without an exception is the check
                        break;
                    case XmlPullParser.TEXT:
                        checkCodepoints(parser);
                        break;
                    default:
                        throw new IOException("unexpected event: " + event
                                + " at " + parser.getPositionDescription());
                    }
                }
            } catch (XmlPullParserException e) {
                logger.warning(title + ": " + e.getMessage());
                errors++;
                return Utilities.escapeXml(text);
            }
        }

        /**
         * Rejects control characters other than tab, line feed and carriage
         * return in the current text event.
         *
         * @throws XmlPullParserException on the first offending character
         */
        private static void checkCodepoints(XmlPullParser parser)
        throws XmlPullParserException {
            String temp = parser.getText();
            if (null == temp) {
                return;
            }
            // xpp3 doesn't check codepoint values
            // check them to avoid XDMP errors
            for (int i = 0; i < temp.length(); i++) {
                int c = temp.charAt(i);
                // #x9 | #xA | #xD
                // | [#x20-#xD7FF]
                // | [#xE000-#xFFFD]
                // | [#x10000-#x10FFFF]
                // this implementation is abbreviated
                if (9 == c || 10 == c || 13 == c || c > 31) {
                    continue;
                }
                throw new XmlPullParserException(
                        "bad codepoint value: " + c, parser,
                        null);
            }
        }

        /**
         * Handles an end tag: pops the element stack, copies the tag into
         * the current article and, at the end of a page, stores the finished
         * article.
         *
         * @param name name of the element being closed
         * @throws IOException if the tag does not match the open element
         */
        private static void processEndElement(String name) throws IOException {
            if (!path.getLast().equals(name)) {
                throw new IOException("found " + name + " expected "
                        + path.getLast() + "; " + title + "; " + article);
            }
            path.removeLast();

            if (null == article) {
                return;
            }

            // NOTE(review): appending getText() for a tag event presumably
            // relies on the xpp3 parser returning the raw tag markup here --
            // confirm before switching parser implementations.
            article.append(xpp.getText());

            // look for end of article
            if (!PAGE.equals(name)) {
                return;
            }

            // add article to list
            // include the language in the title        
            String docUri = (null == language ? "" : language) + "wiki/"
            + (ENCODE_TITLE ? encodeTitle(title) : title);
            articles.add(new Article(docUri, article));
           
            // ready for the next page
            article = null;
        }

        /**
         * URI-escapes a page title.
         *
         * @throws IOException if the title cannot be expressed as a URI
         */
        private static String encodeTitle(String rawTitle) throws IOException {
            try {
                // this form of URI() does escaping nicely
                return new URI(null, rawTitle, null).toString();
            } catch (URISyntaxException e) {
                try {
                    // URI(schema, ssp, fragment) constructor cannot handle
                    // ssp = 2008-11-07T12:23:47.617766-08:00/1
                    // (despite what the javadoc says)...
                    // in this situation, treat the path as the fragment.
                    return new URI(null, null, rawTitle).toString();
                } catch (URISyntaxException e1) {
                    throw new IOException(e);
                }
            }
        }

        /**
         * Handles a start tag: pushes the element, records namespace and
         * language at the document element, opens a new article at a page
         * element and otherwise copies the tag into the current article. A
         * title element is consumed completely, including its end tag.
         *
         * @param name name of the element being opened
         * @throws IOException if a page starts inside another page
         * @throws XmlPullParserException if the title content cannot be read
         */
        private static void processStartElement(String name)
        throws IOException, XmlPullParserException {
            path.add(name);
            // look for start of article
            if (ROOT.equals(name)) {
                namespace = xpp.getNamespace();
                language = xpp.getAttributeValue(NS_XML, "lang");
                return;
            }

            if (PAGE.equals(name)) {
                if (null != article) {
                    throw new IOException("article not null at start of page");
                }
                // The page start tag is rebuilt by hand so that each page
                // carries the namespace and language of the document.
                StringBuilder startTag = new StringBuilder("<").append(PAGE);
                if (null != namespace) {
                    startTag.append(" xmlns=\"").append(namespace).append("\"");
                }
                if (null != language) {
                    startTag.append(" xml:lang=\"").append(language)
                            .append("\"");
                }
                article = startTag.append(">");
                pages++;
                return;
            }

            if (null != article && !xpp.isEmptyElementTag()) {
                // write empty elements via end-element, only.
                // note that attributes are still ok in this case
                article.append(xpp.getText());
            }

            if (!TITLE.equals(name)) {
                return;
            }

            title = xpp.nextText().trim();
            if (null != article) {
                article.append(Utilities.escapeXml(title));
            }
            // this puts us at the end element for title
            processEndElement(name);
        }

        /**
         * Verifies that the document ended with every element closed and no
         * article left open.
         *
         * @throws IOException if either check fails
         */
        private static void processEndDocument() throws IOException {
            if (0 != path.size()) {
                throw new IOException("document end before end tag ("
                        + path.size() + ") " + path.getLast() + " "
                        + xpp.getPositionDescription());
            }
            if (null != article) {
                throw new IOException("article not null at end of document: "
                        + title + "; " + article.toString() + "; "
                        + xpp.getPositionDescription());
            }
        }
    }
}