// org.wikibrain.download.RequestedLinkGetter (Maven / Gradle / Ivy artifact page title; not part of the source)
package org.wikibrain.download;
import com.google.common.collect.Multimap;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Element;
import org.wikibrain.core.lang.LanguageSet;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* Get URLs of the dump file links with specified language, type of dump file and the date before which the dumps
* are pulled.
*
* @author Ari Weiland
* @author Yulun Li
*
*/
public class RequestedLinkGetter {
private static final Logger LOG = LoggerFactory.getLogger(RequestedLinkGetter.class);
private static final String DATE_FORMAT = "yyyyMMdd";
private final Language lang;
private final List matchers;
private final Date requestDate; // This is the date requested by the user.
public RequestedLinkGetter(Language lang, List matchers, Date requestDate) {
this.lang = lang;
this.matchers = matchers;
this.requestDate = requestDate;
}
/**
* Return all dates on the dump index page of a particular language.
* @return list of Date objects.
* @throws IOException
* @throws ParseException
*/
protected List getAllDates() throws IOException, ParseException {
List availableDate = new ArrayList();
URL langWikiPageUrl = new URL(DumpLinkGetter.BASEURL_STRING + "/" + lang.getLangCode().replace("-", "_") + "wiki/");
Document doc = Jsoup.parse(IOUtils.toString(langWikiPageUrl.openStream()));
Elements availableDates = doc.select("body").select("pre").select("a[href]");
for (Element element : availableDates) {
Matcher dateMatcher = Pattern.compile("(\\d{8})/").matcher(element.attr("href"));
while (dateMatcher.find()) {
availableDate.add(stringToDate(dateMatcher.group(1)));
}
}
return availableDate;
}
/**
* Return a sorted list of dump dates before the date requested.
* @param dateList list of Date object
* @return list of dates as String.
*/
protected List availableDumpDatesSorted(List dateList) throws WikiBrainException {
List dateListSorted = new ArrayList();
Collections.sort(dateList, new Comparator() {
public int compare(Date date1, Date date2) {
return date1.compareTo(date2);
}
});
for (Date date : dateList) {
if (!date.after(requestDate)) {
dateListSorted.add(new SimpleDateFormat(DATE_FORMAT).format(date));
}
}
if (dateListSorted.isEmpty()) {
throw new WikiBrainException("No dumps for " + lang.getLangCode() + " found before " + new SimpleDateFormat(DATE_FORMAT).format(requestDate));
}
return dateListSorted;
}
/**
* Convert a String to a Date object.
* @param dateString Date formatted in 'yyyyMMdd' as a string.
* @return Date as java.util.Date object.
* @throws java.text.ParseException
*/
private static Date stringToDate(String dateString) throws java.text.ParseException {
SimpleDateFormat dateFormatter = new SimpleDateFormat(DATE_FORMAT);
dateFormatter.setLenient(false);
return dateFormatter.parse(dateString);
}
/**
* Get dump file links of the most recent available before the requestDate.
* @return
* @throws ParseException
* @throws IOException
* @throws WikiBrainException
*/
protected Map> getDumps() throws ParseException, IOException, WikiBrainException {
List availableDates = availableDumpDatesSorted(getAllDates());
Map> map = new HashMap>();
List unfoundMatchers = new ArrayList(matchers);
for (int i = availableDates.size() - 1; i > -1; i--) {
DumpLinkGetter dumpLinkGetter = new DumpLinkGetter(lang, unfoundMatchers, availableDates.get(i));
Multimap batchDumps = dumpLinkGetter.getDumpFiles(dumpLinkGetter.getFileLinks());
map.put(availableDates.get(i), batchDumps);
for (int j = 0; j < unfoundMatchers.size(); j++) {
FileMatcher linkMatcher = unfoundMatchers.get(j);
if (batchDumps.keySet().contains(linkMatcher)) {
unfoundMatchers.remove(linkMatcher);
j--;
}
}
if (unfoundMatchers.isEmpty()) {
return map;
}
if (i == 0) {
LOG.warn("Some matchers not found: " + unfoundMatchers);
}
}
return map;
}
public List getLangLinks() throws WikiBrainException, IOException, ParseException {
List result = new ArrayList();
Map> dumpLinks = this.getDumps();
for (String dumpDate : dumpLinks.keySet()) {
for (FileMatcher linkMatcher : dumpLinks.get(dumpDate).keySet()) {
for (DumpLinkInfo linkInfo : dumpLinks.get(dumpDate).get(linkMatcher)) {
result.add(linkInfo.getLanguage().getLangCode() + "\t" +
linkInfo.getDate() + "\t" +
linkInfo.getLinkMatcher().getName() + "\t" +
linkInfo.getCounter() + "\t" +
linkInfo.getUrl() + "\t" +
linkInfo.getMd5());
}
}
}
return result;
}
/**
* Parse command line and generate .tsv file containing language code, date of dump, name of file type and link url.
* @param args command line prompt
* @throws IOException
* @throws WikiBrainException
* @throws ParseException
*/
public static void main(String[] args) throws IOException, WikiBrainException, ParseException, ConfigurationException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.hasArgs()
.withValueSeparator(',')
.withLongOpt("names")
.withDescription("Names of file types, separated by comma (e.g. 'articles,abstracts'). \nDefault is " + new Configuration().get().getStringList("download.matcher"))
.create("f"));
options.addOption(
new DefaultOptionBuilder()
.hasArg()
.withLongOpt("output")
.withDescription("Path to output file.")
.create("o"));
options.addOption(
new DefaultOptionBuilder()
.hasArg()
.withLongOpt("date")
.withDescription("Dumps are pulled from on or before this date. Default is today")
.create("y"));
EnvBuilder.addStandardOptions(options);
// You MUST specify a language set when downloading files
Option o = options.getOption("l");
o.setRequired(true);
options.addOption(o);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (org.apache.commons.cli.ParseException e) {
System.err.println("Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("RequestedLinkGetter", options);
System.exit(1);
return;
}
Env env = new EnvBuilder(cmd).build();
Configurator conf = env.getConfigurator();
List linkMatchers;
if (cmd.hasOption("n")) {
linkMatchers = new ArrayList();
for (String name : cmd.getOptionValues("n")) {
FileMatcher matcher = FileMatcher.getByName(name);
if (matcher == null) {
System.err.println("Invalid matcher name: " + name + "\nValid matcher names: \n" + FileMatcher.getAllNames().toString());
System.exit(1);
}
linkMatchers.add(matcher);
}
} else {
linkMatchers = FileMatcher.getListByNames(conf.getConf().get().getStringList("download.matcher"));
}
LanguageSet languages = env.getLanguages();
Date getDumpByDate = new Date();
if (cmd.hasOption("d")) {
try {
getDumpByDate = stringToDate(cmd.getOptionValue("d"));
} catch (java.text.ParseException e) {
System.err.println("Invalid date: " + cmd.getOptionValue("d")
+ "\nValid date format: \n" + "yyyyMMdd");
System.exit(1);
}
}
String filePath = conf.getConf().get().getString("download.listFile");
if (cmd.hasOption('o')) {
filePath = cmd.getOptionValue('o');
}
LOG.info("writing download list to " + filePath);
List result = new ArrayList();
for (Language language : languages) {
RequestedLinkGetter getter = new RequestedLinkGetter(
language, linkMatchers, getDumpByDate);
result.addAll(getter.getLangLinks());
}
if (languages.size() >= 2) {
RequestedLinkGetter getter = new RequestedLinkGetter(
Language.WIKIDATA,
Arrays.asList(FileMatcher.WIKIDATA_ITEMS),
getDumpByDate);
result.addAll(getter.getLangLinks());
}
if (!result.isEmpty()) {
File file = new File(filePath);
FileUtils.writeLines(file, result, "\n");
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (hosting site footer; not part of the source)