All downloads are free. Search and download functionality uses the official Maven repository.

com.marklogic.mapreduce.examples.WikiLoader Maven / Gradle / Ivy

/*
 * Copyright 2003-2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce.examples;

import info.bliki.wiki.model.WikiModel;

import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.GenericOptionsParser;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;

import com.marklogic.cpox.SimpleLogger;
import com.marklogic.cpox.Utilities;
import com.marklogic.mapreduce.ContentOutputFormat;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.xcc.Session;

/**
 * Load wiki documents from HDFS into MarkLogic Server.
 * Used with the configuration file conf/marklogic-wiki.xml.
 */

public class WikiLoader {
    public static class ArticleMapper 
    extends Mapper {
        
        private DocumentURI uri = new DocumentURI();
        
        public void map(Text path, Text page, Context context) 
        throws IOException, InterruptedException {
            uri.setUri(path.toString());
            context.write(uri, page);
        }
    }
    
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length < 2) {
            System.err.println("Usage: WikiLoader configFile inputDir");
            System.exit(2);
        }
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       
        Job job = Job.getInstance(conf, "wiki loader");
        job.setJarByClass(WikiLoader.class);
        job.setInputFormatClass(WikiInputFormat.class);
        job.setMapperClass(ArticleMapper.class);
        job.setMapOutputKeyClass(DocumentURI.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(ContentOutputFormat.class);
        
        ContentInputFormat.setInputPaths(job, new Path(otherArgs[1]));

        conf = job.getConfiguration();
        conf.addResource(otherArgs[0]);
         
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * File input format whose records are produced by {@link WikiReader}:
 * both the key and the value of each record are {@link Text}.
 */
class WikiInputFormat extends FileInputFormat<Text, Text> {

    /**
     * Always allows input files to be split.
     *
     * @param context job context (not consulted)
     * @param filename file being considered (not consulted)
     * @return {@code true} for every file
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return true;
    }
    
    /**
     * Creates a new, uninitialized {@link WikiReader}.
     *
     * @param split the split to be read (not used here)
     * @param context task context (not used here)
     * @return a fresh reader instance
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new WikiReader();
    }
    
}

class Article {
    String title;
    StringBuilder pageContent;
    
    public Article(String title, StringBuilder pageContent) {
        this.title = title;
        this.pageContent = pageContent;
    }
}

class WikiReader extends RecordReader {

    static final int BUFFER_SIZE = 65536;
    static final int READ_AHEAD_SIZE = 2048;
    static final String BEGIN_PAGE_TAG = "";
    static final String END_PAGE_TAG = "";
    static final String END_DOC_TAG = "";
    private Text key = new Text();
    private Text value = new Text();
    private List
articles; private int recordCount = 0; public WikiReader() { } @Override public void close() throws IOException { } @Override public Text getCurrentKey() throws IOException, InterruptedException { return key; } @Override public Text getCurrentValue() throws IOException, InterruptedException { return value; } @Override public float getProgress() throws IOException, InterruptedException { if (articles == null || articles.isEmpty()) { return 0; } return recordCount / (float)articles.size(); } @Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { Path file = ((FileSplit)inSplit).getPath(); FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(file); byte[] buf = new byte[BUFFER_SIZE]; long bytesTotal = inSplit.getLength(); long start = ((FileSplit)inSplit).getStart(); fileIn.seek(start); long bytesRead = 0; StringBuilder pages = new StringBuilder(); int sindex = -1; while (true) { int length = (int)Math.min(bytesTotal - bytesRead, buf.length); int read = fileIn.read(buf, 0, length); if (read == -1) { System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); break; } bytesRead += read; String temp = new String(new String(buf, 0, read)); if (sindex == -1) { // haven't found the start yet sindex = temp.indexOf(BEGIN_PAGE_TAG); if (sindex > -1) { pages.append(temp.substring(sindex)); } } else if (bytesRead < bytesTotal) { // haven't completed the split pages.append(temp); } else { // reached the end of this split // look for end int eindex = 0; if (temp.contains(END_DOC_TAG) || // reached the end of doc temp.endsWith(END_PAGE_TAG)) { eindex = temp.lastIndexOf(END_PAGE_TAG); pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); System.out.println("Found end of doc."); } else { // need to read ahead to look for end of page while (true) { read = fileIn.read(buf, 0, READ_AHEAD_SIZE); if (read == -1) { // no more to read 
System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); System.out.println(temp); break; } bytesRead += read; // look for end temp = new String(buf, 0, read); eindex = temp.indexOf(END_PAGE_TAG); if (eindex > -1) { pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); break; } else { pages.append(temp); } } } break; } } fileIn.close(); articles = WikiModelProcessor.process(pages); } @Override public boolean nextKeyValue() throws IOException, InterruptedException { if (articles != null && articles.size() > recordCount) { Article article = articles.get(recordCount); key.set(article.title); value.set(article.pageContent.toString()); recordCount++; return true; } return false; } static class WikiModelProcessor { /** * */ private static final String TITLE = "title"; /** * */ private static final String PAGE = "page"; private static final String ROOT = "mediawiki"; private static final String NS_XML = "http://www.w3.org/XML/1998/namespace"; private static final String HEADER = " \n" + " \n" + " Wikipedia \n" + " http://en.wikipedia.org/wiki/Main_Page \n" + " MediaWiki 1.16alpha-wmf \n" + " first-letter \n" + " \n" + " Media \n" + " Special \n" + " \n" + " Talk \n" + " User \n" + " User talk \n" + " Wikipedia \n" + " Wikipedia talk \n" + " File \n" + " File talk \n" + " MediaWiki \n" + " MediaWiki talk \n" + " Template \n" + " Template talk \n" + " Help \n" + " Help talk \n" + " Category \n" + " Category talk \n" + " Portal \n" + " Portal talk \n" + " \n" + " \n"; private static final String FOOTER = "\n"; private static LinkedList path; private static StringBuilder article; private static String title; private static XmlPullParser xpp; static SimpleLogger logger = SimpleLogger.getSimpleLogger(); private static int errors = 0; private static int pages = 0; private static String namespace; private static String language; private static XmlPullParserFactory factory; private static XmlPullParser parser; private static Session 
session; private static List
articles; /** * @param args * @throws Exception */ public static List
process(StringBuilder input) { input.insert(0, HEADER); input.append(FOOTER); Properties properties = new Properties(); try { factory = XmlPullParserFactory.newInstance(properties .getProperty(XmlPullParserFactory.PROPERTY_NAME), null); factory.setNamespaceAware(true); xpp = factory.newPullParser(); xpp.setInput(new StringReader(input.toString())); // TODO feature isn't supported by xpp3 - look at xpp5? // xpp.setFeature(XmlPullParser.FEATURE_DETECT_ENCODING, true); // TODO feature isn't supported by xpp3 - look at xpp5? // xpp.setFeature(XmlPullParser.FEATURE_PROCESS_DOCDECL, true); xpp.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true); logger.configureLogger(new Properties()); process(); } catch (Exception ex) { logger.logException(ex); } logger.info("finished " + pages + " pages with " + errors + " errors"); return articles; } /** * @throws IOException * @throws XmlPullParserException */ private static void process() throws XmlPullParserException, IOException { // transform to final output int event; path = new LinkedList(); article = null; title = null; logger.info("starting loop"); while (true) { event = xpp.next(); switch (event) { case XmlPullParser.END_DOCUMENT: processEndDocument(); // exit the loop return; case XmlPullParser.END_TAG: processEndElement(xpp.getName()); break; case XmlPullParser.START_TAG: processStartElement(xpp.getName()); break; case XmlPullParser.TEXT: if (null != article) { String name = path.getLast(); if ("comment".equals(name) || "text".equals(name)) { // parse comment elements // parse text elements article.append(parse(xpp.getText())); } else { article .append(Utilities .escapeXml(xpp.getText())); } } break; default: throw new IOException("unexpected event: " + event + " at " + xpp.getPositionDescription()); } } } /** * @param text * @return * @throws IOException */ private static String parse(String text) throws IOException { if (null == text || "".equals(text.trim())) { return null; } // parse wiki markup to xml // TODO: 
this is slow with bliki - might need concurrency // use a new object every time, to prevent leaks // no doubt this makes it slower... String xml = new WikiModel("${image}", "${title}").render(text); if (null == xml || "".equals(xml.trim())) { return xml; } // verify xml is well-formed try { // use this xpp object to check output from the wikimedia parser parser = factory.newPullParser(); parser .setInput(new StringReader("" + xml + "")); parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true); int event; String temp; char[] chars; int c; while (true) { // with some Japanese text, next() throws // ArrayIndexOutOfBoundsException try { event = parser.next(); } catch (ArrayIndexOutOfBoundsException e) { throw new XmlPullParserException(e.getMessage(), parser, null); } switch (event) { case XmlPullParser.END_DOCUMENT: // exit the loop return xml; case XmlPullParser.END_TAG: parser.getName(); parser.getNamespace(); parser.getText(); break; case XmlPullParser.START_TAG: parser.getName(); parser.getNamespace(); parser.getText(); break; case XmlPullParser.TEXT: temp = parser.getText(); if (null != temp) { chars = temp.toCharArray(); // xpp3 doesn't check codepoint values // check them to avoid XDMP errors for (int i = 0; i < chars.length; i++) { c = chars[i]; // #x9 | #xA | #xD // | [#x20-#xD7FF] // | [#xE000-#xFFFD] // | [#x10000-#x10FFFF] // this implementation is abbreviated if (9 == c || 10 == c || 13 == c || c > 31) { continue; } throw new XmlPullParserException( "bad codepoint value: " + c, parser, null); } } break; default: throw new IOException("unexpected event: " + event + " at " + parser.getPositionDescription()); } } } catch (XmlPullParserException e) { logger.warning(title + ": " + e.getMessage()); errors++; return Utilities.escapeXml(text); } } /** * @param name * @throws IOException */ private static void processEndElement(String name) throws IOException { // logger.info(name); if (!path.getLast().equals(name)) { throw new IOException("found " + 
name + " expected " + path.getLast() + "; " + title + "; " + article); } path.removeLast(); if (null == article) { return; } article.append(xpp.getText()); // look for end of article if (!PAGE.equals(name)) { return; } boolean encodeTitle = false; URI uri = null; if (encodeTitle) { // try encoding the entry name try { // this form of URI() does escaping nicely uri = new URI(null, title, null); } catch (URISyntaxException e) { try { // URI(schema, ssp, fragment) constructor cannot handle // ssp = 2008-11-07T12:23:47.617766-08:00/1 // (despite what the javadoc says)... // in this situation, treat the path as the fragment. uri = new URI(null, null, title); } catch (URISyntaxException e1) { throw new IOException(e); } } } // add article to list // include the language in the title String path = language + "wiki/" + (encodeTitle ? uri.toString() : title); if (articles == null) { articles = new ArrayList
(); } articles.add(new Article(path, article)); // ready for the next page article = null; } /** * @param name * @throws IOException * @throws XmlPullParserException */ private static void processStartElement(String name) throws IOException, XmlPullParserException { // logger.info(name); path.add(name); // look for start of article if (ROOT.equals(name)) { namespace = xpp.getNamespace(); language = xpp.getAttributeValue(NS_XML, "lang"); return; } if (PAGE.equals(name)) { if (null != article) { throw new IOException("article not null at start of page"); } // this is clumsy, but should work ok article = new StringBuilder("<" + PAGE // propagate the XML namespace + (null == namespace ? "" : (" xmlns=\"" + namespace + "\"")) // propagate the xml:lang attribute + (null == language ? "" : (" xml:lang=\"" + language + "\"")) // end of the start tag + ">"); pages++; return; } if (null != article && !xpp.isEmptyElementTag()) { // write empty elements via end-element, only. // note that attributes are still ok in this case article.append(xpp.getText()); } if (!TITLE.equals(name)) { return; } // create zip entry when we see the title element title = xpp.nextText().trim(); article.append(Utilities.escapeXml(title)); // this puts us at the end element for title processEndElement(name); } /** * @throws IOException */ private static void processEndDocument() throws IOException { if (0 != path.size()) { throw new IOException("document end before end tag (" + path.size() + ") " + path.getLast() + " " + xpp.getPositionDescription()); } if (null != article) { throw new IOException("article not null at end of document: " + title + "; " + article.toString() + "; " + xpp.getPositionDescription()); } if (session != null) { session.close(); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy