package org.wikibrain.download;

import java.io.IOException;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import java.net.URL;
import java.util.*;

/**
 *
 * @author Yulun Li
 *
 * Parses a command line script and generates a .tsv file with the links to the dumps
 * of the specified file types and languages.
 *
 */
public class DumpLinkGetter {
    protected static final String BASEURL_STRING = "http://dumps.wikimedia.org";

    private Language lang;
    private List<FileMatcher> matchers;
    private String dumpDate;    // date of the dump, e.g. "20130604"

    public DumpLinkGetter(Language lang, List<FileMatcher> matchers, String dumpDate) {
        this.lang = lang;
        this.matchers = matchers;
        this.dumpDate = dumpDate;
    }

    /**
     * Get the URL of the index wiki page of a specified language.
     * @return the dump index URL for this language
     */
    protected String getLanguageWikiUrl() {
        // langCode with dashes like "roa-tara" should be 'roa_tara' in dump links
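        // e.g. "en" yields "http://dumps.wikimedia.org/enwiki/" and "roa-tara" yields "http://dumps.wikimedia.org/roa_tarawiki/"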
        return BASEURL_STRING + "/" + lang.getLangCode().replace("-", "_") + "wiki/";
    }

    /**
     * Get file links that are marked "done" (plus the MD5 checksum link) on the dump page
     * of the specified language and dumpDate.
     * @return href values of the matching links
     * @throws IOException
     */
    public List<String> getFileLinks() throws IOException {
        List<String> links = new ArrayList<String>();
        URL dumpPageUrl = new URL(getLanguageWikiUrl() + dumpDate + "/");
        Document doc = Jsoup.parse(IOUtils.toString(dumpPageUrl.openStream()));
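        // Finished dump files appear as <a href> links nested under <li> elements carrying the
        // "done" and "file" classes; also pick up the md5sums link from the checksum paragraph.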
        Elements linkElements = doc.select("ul").select("li.done").select("li.file").select("a[href]");
        linkElements.addAll(doc.select("p.checksum").select("a[href]"));
        for (Element linkElement : linkElements) {
            links.add(linkElement.attr("href"));
        }
        return links;
    }

    /**
     * Return all links of a particular language that fit one of the patterns.
     * @param links candidate link hrefs, as returned by getFileLinks()
     * @return multimap from dump type (FileMatcher) to the DumpLinkInfo entries matching it
     * @throws IOException
     */
    public Multimap<FileMatcher, DumpLinkInfo> getDumpFiles(List<String> links) throws IOException {
        Multimap<FileMatcher, DumpLinkInfo> dumpLinks = HashMultimap.create();
        Map<String, String> md5s = getMd5Sums(links);
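        // For each matcher, wrap every matching URL in a DumpLinkInfo keyed by that matcher,
        // attaching the MD5 sum looked up by the file's download name.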
        for (FileMatcher linkMatcher : matchers) {
            List<String> results = linkMatcher.match(links);
            if (!results.isEmpty()) {
                for (String url : results) {
                    URL linkURL = new URL(BASEURL_STRING + url);
                    DumpLinkInfo linkInfo = new DumpLinkInfo(lang, dumpDate, linkMatcher, linkURL, linkMatcher.getNumber(url));
                    linkInfo.setMd5(md5s.get(linkInfo.getDownloadName()));
                    dumpLinks.put(linkMatcher, linkInfo);
                }
            }
        }
        return dumpLinks;
    }

    /**
     * Get the MD5 sums of the dump files of the specified language and dumpDate.
     * @param links candidate link hrefs, as returned by getFileLinks()
     * @return map from download file name to its MD5 sum
     * @throws IOException
     */
    protected Map<String, String> getMd5Sums(List<String> links) throws IOException {
        FileMatcher md5Matcher = FileMatcher.MD5;
        URL md5Url = new URL(BASEURL_STRING + md5Matcher.match(links).get(0));
        List<String> lines = IOUtils.readLines(md5Url.openStream(), "UTF-8");
        HashMap<String, String> md5s = new HashMap<String, String>();
        for (String line : lines) {
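            // Each line of the md5sums file is expected to look like "<md5 hash>  <file name>",
            // with the two fields separated by two non-word characters (typically two spaces).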
            String[] parsedInfo = line.split("\\W{2}");
            String md5 = parsedInfo[0];
            String fileName = parsedInfo[1];
            md5s.put(fileName, md5);
        }
        return md5s;
    }

    public static void main(String[] args) throws IOException {
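        // Example run: list the English "articles" dump links for the 2013-06-04 dump.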
        DumpLinkGetter testGetter = new DumpLinkGetter(Language.getByLangCode("en"), Arrays.asList(FileMatcher.ARTICLES), "20130604");
        // System.out.println(testGetter.getMd5Sums(testGetter.getFileLinks()));
        System.out.println(testGetter.getDumpFiles(testGetter.getFileLinks()));
    }
}