eu.fbk.twm.wiki.ExtractAllDumps
package eu.fbk.twm.wiki;
import eu.fbk.twm.utils.*;
import org.apache.commons.cli.*;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Author: alessio
* Date: 29/01/14
*
* Downloads, decompresses, and extracts models from all Wikipedia dumps.
*
* To speed up the download, use a mirror site, e.g.:
* http://dumps.wikimedia.your.org/enwiki/20150112/enwiki-20150112-pages-articles.xml.bz2
*/
public class ExtractAllDumps extends BaseFolder {
static Logger logger = Logger.getLogger(ExtractAllDumps.class.getName());
static final Integer ARTICLE_LIMIT = 1500000;
static final Pattern wikiDumpBZippedPattern = Pattern.compile("^(\\w{2})wiki-(\\d+)-pages-articles\\.xml\\.bz2$");
static final Pattern wikiDumpPattern = Pattern.compile("^(\\w{2})wiki-(\\d+)-pages-articles\\.xml$");
static final Pattern wikiDataDumpPattern = Pattern.compile("^wikidatawiki-(\\d+)-pages-articles\\.xml$");
// String pattern = "--category-similarity --base-dir -t 12 -r --templates --file --person-info --abstract --vectors --example --incoming-outgoing --one-example-per-sense";
String pattern = "--category-similarity --base-dir -t 8 -r --templates --file --person-info --abstract --incoming-outgoing";
boolean decompress;
boolean download;
boolean extract;
boolean cleanDumps;
boolean cleanModels;
String[] givenLanguages;
boolean fake;
String server = "dumps.wikimedia.org";
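/**
* Downloads the page at the given URL and returns its content as a string (one line per row),
* or null if the URL is malformed or an I/O error occurs.
*/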
private static String downloadPage(String urlPath) {
URL url;
InputStream is = null;
BufferedReader br;
String line;
try {
url = new URL(urlPath);
is = url.openStream(); // throws an IOException
br = new BufferedReader(new InputStreamReader(is));
StringBuffer b = new StringBuffer();
while ((line = br.readLine()) != null) {
b.append(line).append(System.getProperty("line.separator"));
}
return b.toString();
} catch (MalformedURLException e) {
logger.warn(e.getMessage());
} catch (IOException e) {
logger.warn(e.getMessage());
} finally {
try {
if (is != null) {
is.close();
}
} catch (IOException e) {
logger.warn(e.getMessage());
}
}
return null;
}
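/**
* Finds the newest dump version for the given language on the dump server, skips it when an
* equal or newer version is already present locally, checks the md5sums file to make sure the
* pages-articles dump has been published, and downloads the bzipped dump into dumpFolder
* (unless fake mode is on).
*/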
private static void downloadLastWikipediaVersion(String server, String lang, TreeSet<Integer> alreadyPresent, String dumpFolder, boolean fake) {
// String urlPath = String.format("http://dumps.wikimedia.org/%1$swiki/", lang);
// String urlPath = String.format("http://dumps.wikimedia.your.org/%1$swiki/", lang);
// String urlPath = String.format("http://wikipedia.c3sl.ufpr.br/%1$swiki/", lang);
String urlPath = String.format("http://" + server + "/%1$swiki/", lang);
logger.debug("Path: " + urlPath);
logger.info(String.format("Downloading %s version of Wikipedia", lang));
TreeSet<Integer> versions = new TreeSet<Integer>(Collections.reverseOrder());
logger.info("Getting information on dumps for language " + lang);
try {
Document doc = Jsoup.connect(urlPath).get();
Elements newsHeadlines = doc.select("a");
for (Object newsHeadline : newsHeadlines) {
Element e = (Element) newsHeadline;
String text = e.html();
Matcher m = wikiVersionPattern.matcher(text);
if (m.matches()) {
versions.add(Integer.parseInt(m.group(1)));
}
}
} catch (Exception e) {
logger.warn(e.getMessage());
return;
}
if (alreadyPresent.size() > 0 &&
versions.size() > 0 &&
alreadyPresent.first() >= versions.first()) {
logger.info("No new versions for " + lang);
return;
}
if (versions.size() == 0) {
logger.warn("No versions for " + lang);
return;
}
logger.info("There is a new version for " + lang + ": " + versions.first());
if (!dumpFolder.endsWith(File.separator)) {
dumpFolder += File.separator;
}
String url = String.format("http://" + server + "/%1$swiki/%2$s/%1$swiki-%2$s-pages-articles.xml.bz2", lang, versions.first().toString());
String md5file = String.format("http://" + server + "/%1$swiki/%2$s/%1$swiki-%2$s-md5sums.txt", lang, versions.first().toString());
String dest = dumpFolder + String.format("%1$swiki-%2$s-pages-articles.xml.bz2", lang, versions.first().toString());
String filename = String.format("%1$swiki-%2$s-pages-articles.xml.bz2", lang, versions.first().toString());
String md5content = downloadPage(md5file);
if (md5content == null) {
logger.warn("Unable to open MD5 file");
return;
}
String[] lines = md5content.split(System.getProperty("line.separator"));
boolean download = false;
for (String line : lines) {
line = line.trim();
String[] parts = line.split("\\s+");
if (parts.length < 2) {
continue;
}
if (parts[1].equals(filename)) {
logger.debug(String.format("Found %s", parts[1]));
download = true;
}
}
if (!download) {
logger.info("Version of file is not ready yet.");
return;
}
logger.info("Downloading file: " + url);
logger.info("Destination file: " + dest);
if (fake) {
logger.info("Not downloading, it's fake!");
return;
}
try {
Downloader.Download(url, dest);
} catch (Exception e) {
logger.error(e.getMessage());
}
}
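/**
* Queries the MediaWiki API (action=query&meta=siteinfo&siprop=statistics) for the given language
* and returns the parsed statistics map (e.g. the "articles" count), or null on failure.
*/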
private static Map<String, Integer> getWikipediaInfo(String lang) {
String urlPath = String.format("http://%s.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=statistics&maxlag=5&format=json", lang);
String b = downloadPage(urlPath);
// Parse JSON
ObjectMapper mapper = new ObjectMapper();
try {
Map pageData = mapper.readValue(b, Map.class);
Map query = (Map) pageData.get("query");
Map statistics = (Map) query.get("statistics");
return statistics;
} catch (Exception e) {
logger.warn(e.getMessage());
}
return null;
}
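/**
* Lists the decompressed pages-articles dumps found in the given folder, optionally restricted
* to the languages in okLangs.
*/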
private static File[] getListOfDumps(File folder, final String[] okLangs) {
File[] dumps = folder.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
Matcher matcher = wikiDumpPattern.matcher(name);
if (matcher.find()) {
String lang = matcher.group(1);
if (okLangs == null || Arrays.asList(okLangs).contains(lang)) {
return true;
}
}
return false;
}
});
return dumps;
}
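/**
* Lists the bzipped pages-articles dumps found in the given folder, optionally restricted to
* the languages in okLangs.
*/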
private static File[] getListOfZippedDumps(File folder, final String[] okLangs) {
File[] zippedDumps = folder.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
Matcher matcher = wikiDumpBZippedPattern.matcher(name);
if (matcher.find()) {
String lang = matcher.group(1);
if (okLangs == null || Arrays.asList(okLangs).contains(lang)) {
return true;
}
}
return false;
}
});
return zippedDumps;
}
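/**
* Reads the command-line options into the corresponding fields; --all turns on download,
* decompression, extraction and both clean actions at once.
*/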
protected void init(CommandLine commandLine) {
super.init(commandLine, null);
decompress = commandLine.hasOption("decompress");
download = commandLine.hasOption("download");
extract = commandLine.hasOption("extract");
if (commandLine.hasOption("server")) {
server = commandLine.getOptionValue("server");
}
cleanDumps = commandLine.hasOption("clean-dumps");
cleanModels = commandLine.hasOption("clean-models");
if (commandLine.hasOption("all")) {
decompress = download = extract = cleanDumps = cleanModels = true;
}
givenLanguages = commandLine.getOptionValues("languages");
if (givenLanguages != null) {
logger.info("Language filter: " + Arrays.toString(givenLanguages));
}
fake = commandLine.hasOption("fake");
}
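/**
* Main workflow: collects the dump versions already present in the dump folder, then, depending
* on the selected actions, downloads the latest dumps, decompresses them, and runs ModelExtractor
* on each decompressed dump; the clean-up steps are still marked as todo.
*/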
public void start() {
addMandatoryFolder("dump");
addMandatoryFolder("base");
exitOnMissingStuff(true);
// Check versions
logger.info("Retrieving already present information");
Map<String, TreeSet<Integer>> alreadyPresentVersions = getPresentVersions(givenLanguages);
Map<String, TreeSet<Integer>> alreadyExtractedVersions = new HashMap<String, TreeSet<Integer>>(alreadyPresentVersions);
File dumpFolderFile = new File(folders.get("dump"));
File[] listOfVersions = dumpFolderFile.listFiles();
if (listOfVersions != null) {
for (File f : listOfVersions) {
Pattern[] patterns = {wikiDumpBZippedPattern, wikiDumpPattern};
for (Pattern p : patterns) {
Matcher m = p.matcher(f.getName());
if (m.find()) {
String lang = m.group(1);
String version = m.group(2);
if (alreadyPresentVersions.containsKey(lang)) {
alreadyPresentVersions.get(lang).add(Integer.parseInt(version));
}
}
}
}
}
// Download part
if (download) {
for (String lang : alreadyPresentVersions.keySet()) {
downloadLastWikipediaVersion(server, lang, alreadyPresentVersions.get(lang), folders.get("dump"), fake);
}
}
if (decompress) {
File[] zippedDumps = getListOfZippedDumps(dumpFolderFile, givenLanguages);
for (File f : zippedDumps) {
String destFile = f.getAbsolutePath().substring(0, f.getAbsolutePath().length() - 4);
if ((new File(destFile)).exists()) {
logger.warn("File " + destFile + " exists, skipping");
continue;
}
if (fake) {
logger.info("Not unzipping, it's fake!");
}
else {
try {
UnixBunzip2Wrapper.bunzip2(f.getAbsolutePath());
} catch (Exception e) {
logger.error(e.getMessage());
}
}
}
}
if (extract) {
addMandatoryFolder("lsa");
addMandatoryFolder("res");
addMandatoryFolder("cl");
addMandatoryFolder("airpedia");
addMandatoryFolder("namnom");
addMandatoryFolder("topic");
addMandatoryFile("ontology");
exitOnMissingStuff(true);
File[] dumps = getListOfDumps(dumpFolderFile, givenLanguages);
for (File f : dumps) {
ExtractorParameters extractorParameters = new ExtractorParameters(f.getAbsolutePath(), folders.get("base"), true);
if (givenLanguages != null &&
!Arrays.asList(givenLanguages).contains(extractorParameters.getLang())) {
continue;
}
String folder = extractorParameters.getExtractionOutputDirName();
File outFolder = new File(folder);
if (outFolder.exists()) {
logger.info("Folder " + folder + " exists, skipping");
continue;
}
logger.info("Getting Wikipedia information");
Map<String, Integer> info = getWikipediaInfo(extractorParameters.getLang());
if (info != null && info.get("articles") > ARTICLE_LIMIT && givenLanguages == null) {
logger.info(String.format("Wikipedia in [%s] is too big [%d articles], skipping", extractorParameters.getLang(), info.get("articles")));
continue;
}
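// Build the ModelExtractor command line from the extraction pattern and the resource folders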
StringBuffer pars = new StringBuffer();
pars.append(pattern);
pars.append(CharacterTable.SPACE);
pars.append("-o").append(CharacterTable.SPACE).append(folders.get("base"));
pars.append(CharacterTable.SPACE);
pars.append("-d").append(CharacterTable.SPACE).append(f.getAbsolutePath());
pars.append(CharacterTable.SPACE);
pars.append("--lsm-dir").append(CharacterTable.SPACE).append(folders.get("lsa"));
// DBpedia mappings
String mappingFilename = folders.get("res") + "topic-type-mapping/datasets/dbpedia-mappings.tsv";
exitOnMissingFile(mappingFilename);
pars.append(CharacterTable.SPACE);
pars.append("--dbpedia-pars").append(CharacterTable.SPACE)
.append(mappingFilename).append(CharacterTable.SPACE).append(files.get("ontology"));
// Cross-languages links
String wikidataCurrentLangsFolder = folders.get("cl") + "current" + File.separator;
wikidataCurrentLangsFolder += "langs" + File.separator;
exitOnMissingFolder(wikidataCurrentLangsFolder);
pars.append(CharacterTable.SPACE);
pars.append("--cross-language-dir").append(CharacterTable.SPACE).append(wikidataCurrentLangsFolder);
// Airpedia links
String airpediaCurrentFolder = folders.get("airpedia") + "current" + File.separator;
exitOnMissingFolder(airpediaCurrentFolder);
pars.append(CharacterTable.SPACE);
pars.append("--airpedia2-dir").append(CharacterTable.SPACE).append(airpediaCurrentFolder);
// NamNom links
String namnomCurrentFolder = folders.get("namnom") + "current" + File.separator;
exitOnMissingFolder(namnomCurrentFolder);
pars.append(CharacterTable.SPACE);
pars.append("--namnom-dir").append(CharacterTable.SPACE).append(namnomCurrentFolder);
// Topic links
String topicCurrentFolder = folders.get("topic") + "current" + File.separator;
exitOnMissingFolder(topicCurrentFolder);
pars.append(CharacterTable.SPACE);
pars.append("--topic-dir").append(CharacterTable.SPACE).append(topicCurrentFolder);
String[] parameters = pars.toString().trim().split("\\s+");
logger.info("Starting model extraction: " + Arrays.toString(parameters));
if (fake) {
logger.info("Not extracting, it's fake!");
}
else {
try {
ModelExtractor.main(parameters);
} catch (Exception e) {
logger.error(e.getMessage());
e.printStackTrace();
}
}
}
}
//todo: clean part
if (cleanDumps) {
//todo: remember fake
System.out.println(alreadyExtractedVersions);
}
if (cleanModels) {
//todo: remember fake
}
}
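/**
* Entry point: defines the command-line options (language filter, data folder, mirror server,
* and the download/decompress/extract/clean/all/fake actions), configures logging, and runs start().
*/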
public static void main(String[] args) {
ExtractAllDumps extractAllDumps = new ExtractAllDumps();
CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();
extractAllDumps.extendCommandLine(commandLineWithLogger);
commandLineWithLogger.addOption(OptionBuilder.isRequired().withDescription("Languages filter").hasArg().withArgName("iso-codes").withLongOpt("languages").create("l"));
commandLineWithLogger.addOption(OptionBuilder.isRequired().withDescription("Data folder").hasArg().withArgName("folder").withLongOpt("data-folder").create("d"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Server").hasArg().withArgName("address").withLongOpt("server").create());
// Actions
commandLineWithLogger.addOption(OptionBuilder.withDescription("Decompress bzip files").withLongOpt("decompress").create("z"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Download Wikipedia dumps").withLongOpt("download").create("w"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Extract models from Wikipedia dumps").withLongOpt("extract").create("e"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Clean dumps folder").withLongOpt("clean-dumps").create("r"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Clean models folder").withLongOpt("clean-models").create("m"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Do everything").withLongOpt("all").create());
commandLineWithLogger.addOption(OptionBuilder.withDescription("Fake (do not do anything)").withLongOpt("fake").create());
CommandLine commandLine = null;
try {
commandLine = commandLineWithLogger.getCommandLine(args);
PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
} catch (Exception e) {
System.exit(1);
}
extractAllDumps.init(commandLine);
extractAllDumps.start();
}
}
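A minimal invocation sketch: only the option names come from main() above; the data folder path, language choice, and jar/classpath setup are hypothetical assumptions, and the mirror address is the one suggested in the class Javadoc.
// Hypothetical driver class, assumed to live in (or import) package eu.fbk.twm.wiki.
public class ExtractAllDumpsExample {
public static void main(String[] args) {
ExtractAllDumps.main(new String[]{
"-l", "en",                              // language filter (ISO code)
"-d", "/data/twm",                       // data folder (hypothetical path)
"--server", "dumps.wikimedia.your.org",  // optional mirror, as suggested in the class Javadoc
"--download", "--decompress", "--extract"
});
}
}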