org.apache.mahout.text.wikipedia.WikipediaXmlSplitter (mahout-integration)
Optional components of Mahout which generally support interaction with third party systems,
formats, APIs, etc.
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.text.wikipedia;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The Bayes example package provides some helper classes for training the Naive Bayes classifier
* on the Twenty Newsgroups data. See {@code PrepareTwentyNewsgroups}
* for details on running the trainer and
* formatting the Twenty Newsgroups data properly for the training.
*
* The easiest way to prepare the data is to use the ant task in core/build.xml:
*
* {@code ant extract-20news-18828}
*
* This runs the arg line:
*
* {@code -p $\{working.dir\}/20news-18828/ -o $\{working.dir\}/20news-18828-collapse -a $\{analyzer\} -c UTF-8}
*
* To run the Wikipedia examples (assumes you've built the Mahout Job jar):
*
* - Download the Wikipedia Dataset. Use the Ant target: {@code ant enwiki-files}
* - Chunk the data using the WikipediaXmlSplitter (from the Hadoop home):
* {@code bin/hadoop jar $MAHOUT_HOME/target/mahout-examples-0.x
* org.apache.mahout.text.wikipedia.WikipediaXmlSplitter
* -d $MAHOUT_HOME/examples/temp/enwiki-latest-pages-articles.xml
* -o $MAHOUT_HOME/examples/work/wikipedia/chunks/ -c 64}
*
*/
public final class WikipediaXmlSplitter {
private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);
private WikipediaXmlSplitter() { }
public static void main(String[] args) throws IOException {
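// Build the command-line options (commons-cli2): required dump file, output directory and
// chunk size, plus optional S3 credentials and an optional cap on the number of chunks.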
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).withDescription(
"The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create();
Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).withDescription(
"The output directory to place the splits in:\n"
+ "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
+ "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
+ "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
+ "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
.withShortName("o").create();
Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false).withArgument(
abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()).withDescription("Amazon S3 ID key")
.withShortName("i").create();
Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false).withArgument(
abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()).withDescription(
"Amazon S3 secret key").withShortName("s").create();
Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
"The Size of the chunk, in megabytes").withShortName("c").create();
Option numChunksOpt = obuilder
.withLongName("numChunks")
.withRequired(false)
.withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
.withDescription(
"The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
.withShortName("n").create();
Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(
chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine;
try {
cmdLine = parser.parse(args);
} catch (OptionException e) {
log.error("Error while parsing options", e);
CommandLineUtil.printHelp(group);
return;
}
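// Hadoop configuration used to resolve and access the output file system.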
Configuration conf = new Configuration();
String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
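// If S3 credentials were supplied, propagate them to both the s3:// (block) and
// s3n:// (native) Hadoop file systems so the chunks can be written to either scheme.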
if (cmdLine.hasOption(s3IdOpt)) {
String id = (String) cmdLine.getValue(s3IdOpt);
conf.set("fs.s3n.awsAccessKeyId", id);
conf.set("fs.s3.awsAccessKeyId", id);
}
if (cmdLine.hasOption(s3SecretOpt)) {
String secret = (String) cmdLine.getValue(s3SecretOpt);
conf.set("fs.s3n.awsSecretAccessKey", secret);
conf.set("fs.s3.awsSecretAccessKey", secret);
}
// do not compute crc file when using local FS
conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);
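// The chunk size is given on the command line in megabytes; convert it to bytes.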
int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
int numChunks = Integer.MAX_VALUE;
if (cmdLine.hasOption(numChunksOpt)) {
numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
}
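// Every chunk is emitted as a self-contained MediaWiki export document: it begins with the
// siteinfo header below and is closed with </mediawiki> just before being written out.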
String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
+ "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+ "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
+ "http://www.mediawiki.org/xml/export-0.3.xsd\" version=\"0.3\" xml:lang=\"en\">\n"
+ "  <siteinfo>\n"
+ "    <sitename>Wikipedia</sitename>\n"
+ "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
+ "    <generator>MediaWiki 1.13alpha</generator>\n"
+ "    <case>first-letter</case>\n"
+ "    <namespaces>\n"
+ "      <namespace key=\"-2\">Media</namespace>\n"
+ "      <namespace key=\"-1\">Special</namespace>\n"
+ "      <namespace key=\"0\" />\n"
+ "      <namespace key=\"1\">Talk</namespace>\n"
+ "      <namespace key=\"2\">User</namespace>\n"
+ "      <namespace key=\"3\">User talk</namespace>\n"
+ "      <namespace key=\"4\">Wikipedia</namespace>\n"
+ "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
+ "      <namespace key=\"6\">Image</namespace>\n"
+ "      <namespace key=\"7\">Image talk</namespace>\n"
+ "      <namespace key=\"8\">MediaWiki</namespace>\n"
+ "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
+ "      <namespace key=\"10\">Template</namespace>\n"
+ "      <namespace key=\"11\">Template talk</namespace>\n"
+ "      <namespace key=\"12\">Help</namespace>\n"
+ "      <namespace key=\"13\">Help talk</namespace>\n"
+ "      <namespace key=\"14\">Category</namespace>\n"
+ "      <namespace key=\"15\">Category talk</namespace>\n"
+ "      <namespace key=\"100\">Portal</namespace>\n"
+ "      <namespace key=\"101\">Portal talk</namespace>\n"
+ "    </namespaces>\n"
+ "  </siteinfo>\n";
StringBuilder content = new StringBuilder();
content.append(header);
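// Chunk files are numbered with four digits: chunk-0001.xml, chunk-0002.xml, ...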
NumberFormat decimalFormatter = new DecimalFormat("0000");
File dumpFile = new File(dumpFilePath);
// If the specified path for the input file is incorrect, return immediately
if (!dumpFile.exists()) {
log.error("Input file path {} doesn't exist", dumpFilePath);
return;
}
FileLineIterator it;
if (dumpFilePath.endsWith(".bz2")) {
// default compression format from http://download.wikimedia.org
CompressionCodec codec = new BZip2Codec();
it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
} else {
// assume the user has previously de-compressed the dump file
it = new FileLineIterator(dumpFile);
}
int fileNumber = 0;
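// Copy complete <page>...</page> blocks into the buffer; whenever the buffer exceeds the
// requested chunk size (or the input ends), close the document and flush it as a new chunk.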
while (it.hasNext()) {
String thisLine = it.next();
if (thisLine.trim().startsWith("")) {
boolean end = false;
while (!thisLine.trim().startsWith(" ")) {
content.append(thisLine).append('\n');
if (it.hasNext()) {
thisLine = it.next();
} else {
end = true;
break;
}
}
content.append(thisLine).append('\n');
if (content.length() > chunkSize || end) {
content.append(" ");
fileNumber++;
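// Write the finished chunk through the Hadoop FileSystem API, so the output may target
// local disk, HDFS, or S3 depending on the output URI.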
String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
try (BufferedWriter chunkWriter =
new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"))) {
chunkWriter.write(content.toString(), 0, content.length());
}
if (fileNumber >= numChunks) {
break;
}
content = new StringBuilder();
content.append(header);
}
}
}
}
}
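For reference, here is a minimal, hypothetical sketch of driving the splitter programmatically rather than through bin/hadoop. The driver class name, dump path and output URI below are illustrative assumptions, not values taken from this file; the -d/-o/-c options and the 64 MB chunk size mirror the javadoc example above.

import org.apache.mahout.text.wikipedia.WikipediaXmlSplitter;

// Hypothetical driver class; not part of mahout-integration.
final class WikipediaXmlSplitterDriver {
  public static void main(String[] args) throws Exception {
    WikipediaXmlSplitter.main(new String[] {
        "-d", "/tmp/enwiki-latest-pages-articles.xml.bz2", // assumed location of the bzip2 dump
        "-o", "file:///tmp/wikipedia-xml-chunks",          // write chunks to the local file system
        "-c", "64"                                         // 64 MB chunks, as in the javadoc example
    });
  }
}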