All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.download.RequestedLinkGetter Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
package org.wikibrain.download;

import com.google.common.collect.Multimap;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;

import org.jsoup.select.Elements;
import org.jsoup.nodes.Element;
import org.wikibrain.core.lang.LanguageSet;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 *
 * Get URLs of the dump file links with specified language, type of dump file and the date before which the dumps
 * are pulled.
 *
 * @author Ari Weiland
 * @author Yulun Li
 *
 */
public class RequestedLinkGetter {

    private static final Logger LOG = LoggerFactory.getLogger(RequestedLinkGetter.class);
    private static final String DATE_FORMAT = "yyyyMMdd";

    private final Language lang;
    private final List matchers;
    private final Date requestDate;    // This is the date requested by the user.

    public RequestedLinkGetter(Language lang, List matchers, Date requestDate) {
        this.lang = lang;
        this.matchers = matchers;
        this.requestDate = requestDate;
    }

    /**
     * Return all dates on the dump index page of a particular language.
     * @return list of Date objects.
     * @throws IOException
     * @throws ParseException
     */
    protected List getAllDates() throws IOException, ParseException {
        List availableDate = new ArrayList();
        URL langWikiPageUrl = new URL(DumpLinkGetter.BASEURL_STRING + "/" + lang.getLangCode().replace("-", "_") + "wiki/");
        Document doc = Jsoup.parse(IOUtils.toString(langWikiPageUrl.openStream()));
        Elements availableDates = doc.select("body").select("pre").select("a[href]");
        for (Element element : availableDates) {
            Matcher dateMatcher = Pattern.compile("(\\d{8})/").matcher(element.attr("href"));
            while (dateMatcher.find()) {
                availableDate.add(stringToDate(dateMatcher.group(1)));
            }
        }
        return availableDate;
    }

    /**
     * Return a sorted list of dump dates before the date requested.
     * @param dateList list of Date object
     * @return list of dates as String.
     */
    protected List availableDumpDatesSorted(List dateList) throws WikiBrainException {
        List dateListSorted = new ArrayList();
        Collections.sort(dateList, new Comparator() {
            public int compare(Date date1, Date date2) {
                return date1.compareTo(date2);
            }
        });
        for (Date date : dateList) {
            if (!date.after(requestDate)) {
                dateListSorted.add(new SimpleDateFormat(DATE_FORMAT).format(date));
            }
        }
        if (dateListSorted.isEmpty()) {
            throw new WikiBrainException("No dumps for " + lang.getLangCode() + " found before " + new SimpleDateFormat(DATE_FORMAT).format(requestDate));
        }
        return dateListSorted;
    }

    /**
     * Convert a String to a Date object.
     * @param dateString Date formatted in 'yyyyMMdd' as a string.
     * @return Date as java.util.Date object.
     * @throws java.text.ParseException
     */
    private static Date stringToDate(String dateString) throws java.text.ParseException {
        SimpleDateFormat dateFormatter = new SimpleDateFormat(DATE_FORMAT);
        dateFormatter.setLenient(false);
        return dateFormatter.parse(dateString);
    }

    /**
     * Get dump file links of the most recent available before the requestDate.
     * @return
     * @throws ParseException
     * @throws IOException
     * @throws WikiBrainException
     */
    protected Map> getDumps() throws ParseException, IOException, WikiBrainException {
        List availableDates = availableDumpDatesSorted(getAllDates());
        Map> map = new HashMap>();
        List unfoundMatchers = new ArrayList(matchers);
        for (int i = availableDates.size() - 1; i > -1; i--) {
            DumpLinkGetter dumpLinkGetter = new DumpLinkGetter(lang, unfoundMatchers, availableDates.get(i));
            Multimap batchDumps = dumpLinkGetter.getDumpFiles(dumpLinkGetter.getFileLinks());
            map.put(availableDates.get(i), batchDumps);
            for (int j = 0; j < unfoundMatchers.size(); j++) {
                FileMatcher linkMatcher = unfoundMatchers.get(j);
                if (batchDumps.keySet().contains(linkMatcher)) {
                    unfoundMatchers.remove(linkMatcher);
                    j--;
                }
            }
            if (unfoundMatchers.isEmpty()) {
                return map;
            }
            if (i == 0) {
                LOG.warn("Some matchers not found: " + unfoundMatchers);
            }
        }
        return map;
    }

    public List getLangLinks() throws WikiBrainException, IOException, ParseException {
        List result = new ArrayList();
        Map> dumpLinks = this.getDumps();
        for (String dumpDate : dumpLinks.keySet()) {
            for (FileMatcher linkMatcher : dumpLinks.get(dumpDate).keySet()) {
                for (DumpLinkInfo linkInfo : dumpLinks.get(dumpDate).get(linkMatcher)) {
                    result.add(linkInfo.getLanguage().getLangCode() + "\t" +
                            linkInfo.getDate() + "\t" +
                            linkInfo.getLinkMatcher().getName() + "\t" +
                            linkInfo.getCounter() + "\t" +
                            linkInfo.getUrl() + "\t" +
                            linkInfo.getMd5());
                }
            }
        }
        return result;
    }

    /**
     * Parse command line and generate .tsv file containing language code, date of dump, name of file type and link url.
     * @param args command line prompt
     * @throws IOException
     * @throws WikiBrainException
     * @throws ParseException
     */
    public static void main(String[] args) throws IOException, WikiBrainException, ParseException, ConfigurationException {

        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArgs()
                        .withValueSeparator(',')
                        .withLongOpt("names")
                        .withDescription("Names of file types, separated by comma (e.g. 'articles,abstracts'). \nDefault is " + new Configuration().get().getStringList("download.matcher"))
                        .create("f"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("output")
                        .withDescription("Path to output file.")
                        .create("o"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("date")
                        .withDescription("Dumps are pulled from on or before this date. Default is today")
                        .create("y"));

        EnvBuilder.addStandardOptions(options);

        // You MUST specify a language set when downloading files
        Option o = options.getOption("l");
        o.setRequired(true);
        options.addOption(o);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;

        try {
            cmd = parser.parse(options, args);
        } catch (org.apache.commons.cli.ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("RequestedLinkGetter", options);
            System.exit(1);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();

        List linkMatchers;
        if (cmd.hasOption("n")) {
            linkMatchers = new ArrayList();
            for (String name : cmd.getOptionValues("n")) {
                FileMatcher matcher = FileMatcher.getByName(name);
                if (matcher == null) {
                    System.err.println("Invalid matcher name: " + name + "\nValid matcher names: \n" + FileMatcher.getAllNames().toString());
                    System.exit(1);
                }
                linkMatchers.add(matcher);
            }
        } else {
            linkMatchers = FileMatcher.getListByNames(conf.getConf().get().getStringList("download.matcher"));
        }

        LanguageSet languages = env.getLanguages();

        Date getDumpByDate = new Date();
        if (cmd.hasOption("d")) {
            try {
                getDumpByDate = stringToDate(cmd.getOptionValue("d"));
            } catch (java.text.ParseException e) {
                System.err.println("Invalid date: " + cmd.getOptionValue("d")
                        + "\nValid date format: \n" + "yyyyMMdd");
                System.exit(1);
            }
        }

        String filePath = conf.getConf().get().getString("download.listFile");
        if (cmd.hasOption('o')) {
            filePath = cmd.getOptionValue('o');
        }
        LOG.info("writing download list to " + filePath);

        List result = new ArrayList();
        for (Language language : languages) {
            RequestedLinkGetter getter = new RequestedLinkGetter(
                    language, linkMatchers, getDumpByDate);
            result.addAll(getter.getLangLinks());
        }
        if (languages.size() >= 2) {
            RequestedLinkGetter getter = new RequestedLinkGetter(
                    Language.WIKIDATA,
                    Arrays.asList(FileMatcher.WIKIDATA_ITEMS),
                    getDumpByDate);
            result.addAll(getter.getLangLinks());
        }

        if (!result.isEmpty()) {
            File file = new File(filePath);
            FileUtils.writeLines(file, result, "\n");
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy