All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.download.DumpFileDownloader Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
package org.wikibrain.download;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Multimap;
import org.apache.commons.cli.*;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import org.wikibrain.utils.WpIOUtils;

/**
 *
 * Downloads dumps from a specified tsv file containing lines of dump links.
 *
 * @author Ari Weiland
 *
 */

public class DumpFileDownloader {

    private static final Logger LOG = LoggerFactory.getLogger(DumpFileDownloader.class);

    private final FileDownloader downloader = new FileDownloader();
    private final File tmpDir;      // staging directory; files land here before being moved
    private final File outputDir;   // root directory that receives the verified files

    /**
     * Creates a downloader that stages files in a freshly created temporary
     * directory and places finished files beneath {@code outputDir}.
     *
     * @param outputDir root directory under which downloaded files are stored
     * @throws RuntimeException if the temporary directory cannot be created
     */
    public DumpFileDownloader(File outputDir) {
        this.outputDir = outputDir;
        try {
            tmpDir = WpIOUtils.createTempDirectory("download");
        } catch (IOException e) {
            throw new RuntimeException(e);  // shouldn't happen.
        }
    }

    /**
     * Downloads the file described by {@code link} into the temporary directory,
     * using the link's URL as the source and the link's file name as the local name.
     *
     * @param link description of the dump file to fetch
     * @return whatever the underlying {@code FileDownloader} returns for the staged file;
     *         callers in this class treat {@code null} as a failed download
     * @throws InterruptedException propagated from the underlying downloader
     * @throws IOException propagated from the underlying downloader
     */
    public File getOneFile(DumpLinkInfo link) throws InterruptedException, IOException {
        return downloader.download(link.getUrl(), new File(tmpDir, link.getFileName()));
    }

    /**
     * Processes a tsv file containing dump link info and initiates the download process
     * on that info. Files are downloaded one language at a time, then one type at a time.
     * Within each language, all of one type is downloaded before moving the files
     * to the destination directory.
     *
     * @param file the tsv file containing the dump link info
     * @throws InterruptedException propagated from the underlying downloader
     * @throws WikiBrainException if a download returns nothing or an MD5 check fails
     * @throws IOException if a file cannot be read or moved
     */
    public void downloadFrom(File file) throws InterruptedException, WikiBrainException, IOException {
        prepareTmpDir();
        DumpLinkCluster linkCluster = DumpLinkInfo.parseFile(file);
        int numTotalFiles = linkCluster.size();
        LOG.info("Starting to download {} files", numTotalFiles);
        int success = 0;

        for (Language language : linkCluster) {
            success = downloadLanguageFiles(linkCluster, numTotalFiles, success, language);
        }
        LOG.info("{} files downloaded out of {} files.", success, numTotalFiles);
        // File.delete() only removes an empty directory; report rather than fail if it is left behind.
        if (!tmpDir.delete()) {
            LOG.warn("Could not delete temporary directory: {}", tmpDir);
        }
    }

    /**
     * Ensures the temporary directory exists and removes any files directly inside it.
     * Cleanup is best-effort: failures are logged and do not abort the run.
     */
    private void prepareTmpDir() {
        if (tmpDir.isDirectory()) {
            // listFiles() returns null on an I/O error, so guard before iterating.
            File[] stale = tmpDir.listFiles();
            if (stale != null) {
                for (File f : stale) {
                    if (!f.delete()) {
                        LOG.warn("Could not delete stale temporary file: {}", f);
                    }
                }
            }
        } else if (!tmpDir.mkdirs()) {
            LOG.warn("Could not create temporary directory: {}", tmpDir);
        }
    }

    /**
     * Downloads and verifies every file of one language, one file type at a time.
     * For each type, all files are fetched (or found already present in the output
     * directory) and MD5-checked first; only then are the staged files moved to
     * their final location.
     *
     * @param linkCluster all parsed dump links
     * @param numTotalFiles total number of files in the cluster, used for progress logging
     * @param success number of files handled successfully before this call
     * @param language the language whose files should be processed
     * @return the updated success count
     * @throws InterruptedException propagated from the underlying downloader
     * @throws IOException if a file cannot be read or moved
     * @throws WikiBrainException if a download returns nothing or an MD5 check fails
     */
    private int downloadLanguageFiles(DumpLinkCluster linkCluster, int numTotalFiles, int success, Language language) throws InterruptedException, IOException, WikiBrainException {
        Multimap<FileMatcher, DumpLinkInfo> map = linkCluster.get(language);
        for (FileMatcher linkMatcher : map.keySet()) {
            // Pass 1: obtain each file and verify its checksum.
            for (DumpLinkInfo link : map.get(linkMatcher)) {
                File download = new File(outputDir, link.getLocalPath()+"/"+link.getFileName());
                if (download.exists()) {
                    success++;
                    LOG.info("File already downloaded: {}", link.getFileName());
                } else {
                    download = getOneFile(link);
                    if (download == null) {
                        throw new WikiBrainException("Download malfunction! Download timed out!");
                    }
                    success++;
                    LOG.info("{}/{} file(s) downloaded", success, numTotalFiles);
                }
                verifyMd5(download, link);
            }
            // Pass 2: move the staged files of this type into the output directory.
            for (DumpLinkInfo link : map.get(linkMatcher)) {
                File download = new File(tmpDir, link.getFileName());
                File target = FileUtils.getFile(outputDir, link.getLocalPath(), download.getName());
                if (!target.exists()) {
                    File parent = target.getParentFile();
                    if (!parent.exists() && !parent.mkdirs()) {
                        LOG.warn("Could not create directory: {}", parent);
                    }
                    FileUtils.moveFile(download, target);   // throws an exception on failure.
                }
            }
        }
        return success;
    }

    /**
     * Compares the MD5 digest of {@code file} with the digest recorded in {@code link},
     * ignoring case.
     *
     * @param file the file whose contents are hashed
     * @param link the link info carrying the expected MD5 string
     * @throws IOException if the file cannot be opened or read
     * @throws WikiBrainException if the digests differ
     */
    private void verifyMd5(File file, DumpLinkInfo link) throws IOException, WikiBrainException {
        FileInputStream fis = FileUtils.openInputStream(file);
        try {
            String md5 = DigestUtils.md5Hex(fis);
            if (!link.getMd5().equalsIgnoreCase(md5)) {
                throw new WikiBrainException("Download malfunction! MD5 strings do not match!");
            }
        } finally {
            IOUtils.closeQuietly(fis);
        }
    }

    /**
     * Command-line entry point. Reads the output path ({@code -o}) and one or more
     * input tsv files ({@code -i}) from the command line, falling back to the
     * {@code download.path} and {@code download.listFile} configuration values,
     * then downloads from each input file in turn.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) throws ConfigurationException, WikiBrainException, IOException, InterruptedException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("output")
                        .withDescription("Path to output file.")
                        .create("o"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("input")
                        .withDescription("Path to input tsv file.")
                        .create("i"));

        EnvBuilder.addStandardOptions(options);
        CommandLineParser parser = new PosixParser();
        CommandLine cmd;

        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("DumpFileDownloader", options);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();

        // Default to the configured list file; explicit -i values replace it.
        List<String> argList = Arrays.asList(conf.getConf().get().getString("download.listFile"));
        String filePath = cmd.getOptionValue('o', conf.getConf().get().getString("download.path"));
        if (cmd.hasOption("i")) {
            argList = Arrays.asList(cmd.getOptionValues("i"));
        }

        DumpFileDownloader downloader = new DumpFileDownloader(new File(filePath));
        for (String path : argList) {
            downloader.downloadFrom(new File(path));
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy