// org.wikibrain.download.DumpFileDownloader (Maven / Gradle / Ivy)
package org.wikibrain.download;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Multimap;
import org.apache.commons.cli.*;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import org.wikibrain.utils.WpIOUtils;
/**
 * Downloads dumps from a specified tsv file containing lines of dump links.
 *
 * <p>Each missing file is fetched into a private temporary directory, checked
 * against the MD5 digest recorded for its link, and only then moved under the
 * output directory. Files that already exist under the output directory are
 * not fetched again, but their MD5 digest is still checked.
 *
 * @author Ari Weiland
 */
public class DumpFileDownloader {

    private static final Logger LOG = LoggerFactory.getLogger(DumpFileDownloader.class);

    private final FileDownloader downloader = new FileDownloader();
    private final File tmpDir;
    private final File outputDir;

    /**
     * Creates a downloader that stores verified dumps under {@code outputDir}.
     *
     * @param outputDir root directory that downloaded files are moved into
     * @throws RuntimeException if the temporary download directory cannot be created
     */
    public DumpFileDownloader(File outputDir) {
        this.outputDir = outputDir;
        try {
            tmpDir = WpIOUtils.createTempDirectory("download");
        } catch (IOException e) {
            throw new RuntimeException("Could not create temporary download directory", e); // shouldn't happen.
        }
    }

    /**
     * Downloads the file behind the given link into the temporary directory.
     *
     * @param link the dump link to fetch
     * @return the file handed back by the underlying {@code FileDownloader};
     *         callers in this class treat a {@code null} result as a failed download
     * @throws InterruptedException propagated from the underlying downloader
     * @throws IOException propagated from the underlying downloader
     */
    public File getOneFile(DumpLinkInfo link) throws InterruptedException, IOException {
        return downloader.download(link.getUrl(), new File(tmpDir, link.getFileName()));
    }

    /**
     * Processes a tsv file containing dump link info and initiates the download process
     * on that info. Files are downloaded one language at a time, then one type at a time.
     * Within each language, all of one type is downloaded before moving the files
     * to the destination directory.
     *
     * <p>The temporary directory is emptied before the run and removed afterwards,
     * whether or not the run succeeds.
     *
     * @param file the tsv file containing the dump link info
     * @throws InterruptedException propagated from the underlying downloader
     * @throws WikiBrainException if a download yields no file or an MD5 digest does not match
     * @throws IOException if the temporary directory cannot be prepared, or a file
     *                     cannot be read or moved
     */
    public void downloadFrom(File file) throws InterruptedException, WikiBrainException, IOException {
        prepareTmpDir();
        try {
            DumpLinkCluster linkCluster = DumpLinkInfo.parseFile(file);
            int numTotalFiles = linkCluster.size();
            LOG.info("Starting to download {} files", numTotalFiles);
            int success = 0;
            for (Language language : linkCluster) {
                success = downloadLanguageFiles(linkCluster, numTotalFiles, success, language);
            }
            LOG.info("{} files downloaded out of {} files.", success, numTotalFiles);
        } finally {
            // File.delete() cannot remove a non-empty directory, so a failed run would
            // otherwise leave partial downloads behind in the temp area.
            FileUtils.deleteQuietly(tmpDir);
        }
    }

    /**
     * Ensures the temporary directory exists and holds no leftovers from an earlier run.
     *
     * @throws IOException if the directory does not exist and cannot be created
     */
    private void prepareTmpDir() throws IOException {
        if (tmpDir.isDirectory()) {
            File[] stale = tmpDir.listFiles(); // null on I/O error
            if (stale != null) {
                for (File f : stale) {
                    if (!f.delete()) {
                        LOG.warn("Could not delete stale temporary file {}", f);
                    }
                }
            }
        } else if (!tmpDir.mkdirs() && !tmpDir.isDirectory()) {
            throw new IOException("Could not create temporary directory " + tmpDir);
        }
    }

    /**
     * Downloads, verifies and installs every file of one language.
     *
     * @param linkCluster all links parsed from the tsv file
     * @param numTotalFiles total number of files in the cluster (used for progress logging)
     * @param success number of files already accounted for before this call
     * @param language the language whose files should be processed
     * @return the updated success count
     */
    private int downloadLanguageFiles(DumpLinkCluster linkCluster, int numTotalFiles, int success, Language language)
            throws InterruptedException, IOException, WikiBrainException {
        Multimap<FileMatcher, DumpLinkInfo> map = linkCluster.get(language);
        for (FileMatcher linkMatcher : map.keySet()) {
            // Pass 1: fetch (unless already present in the output directory) and verify.
            for (DumpLinkInfo link : map.get(linkMatcher)) {
                File download = targetFor(link);
                if (download.exists()) {
                    success++;
                    LOG.info("File already downloaded: " + link.getFileName());
                } else {
                    download = getOneFile(link);
                    if (download == null) {
                        throw new WikiBrainException("Download malfunction! Download timed out!");
                    }
                    success++;
                    LOG.info(success + "/" + numTotalFiles + " file(s) downloaded");
                }
                verifyMd5(download, link);
            }
            // Pass 2: only after every file of this type verified, move them into place.
            for (DumpLinkInfo link : map.get(linkMatcher)) {
                File download = new File(tmpDir, link.getFileName());
                File target = targetFor(link);
                if (!target.exists()) {
                    FileUtils.forceMkdir(target.getParentFile()); // throws if the directory cannot be created
                    FileUtils.moveFile(download, target); // throws an exception on failure.
                }
            }
        }
        return success;
    }

    /** Returns the final location of the given link's file under the output directory. */
    private File targetFor(DumpLinkInfo link) {
        return FileUtils.getFile(outputDir, link.getLocalPath(), link.getFileName());
    }

    /**
     * Checks that the MD5 digest of {@code file} equals the digest recorded for {@code link}.
     *
     * @throws IOException if the file cannot be read
     * @throws WikiBrainException if the digests differ (including when the link has no digest)
     */
    private static void verifyMd5(File file, DumpLinkInfo link) throws IOException, WikiBrainException {
        FileInputStream fis = FileUtils.openInputStream(file);
        try {
            String md5 = DigestUtils.md5Hex(fis);
            // Compare from the computed side so a missing expected digest is a mismatch, not an NPE.
            if (!md5.equalsIgnoreCase(link.getMd5())) {
                LOG.error("MD5 mismatch for {}: expected {}, computed {}",
                        new Object[] { file, link.getMd5(), md5 });
                throw new WikiBrainException("Download malfunction! MD5 strings do not match!");
            }
        } finally {
            IOUtils.closeQuietly(fis);
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -o/--output} overrides the configured {@code download.path};
     * {@code -i/--input} (repeatable) overrides the configured {@code download.listFile}.
     * Every input tsv file is processed in turn with the same output directory.
     */
    public static void main(String[] args) throws ConfigurationException, WikiBrainException, IOException, InterruptedException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("output")
                        .withDescription("Path to output file.")
                        .create("o"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("input")
                        .withDescription("Path to input tsv file.")
                        .create("i"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("DumpFileDownloader", options);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();

        // Only consult the configured list file when no -i option was supplied.
        List<String> argList;
        if (cmd.hasOption("i")) {
            argList = Arrays.asList(cmd.getOptionValues("i"));
        } else {
            argList = Arrays.asList(conf.getConf().get().getString("download.listFile"));
        }
        String filePath = cmd.getOptionValue('o', conf.getConf().get().getString("download.path"));

        DumpFileDownloader downloader = new DumpFileDownloader(new File(filePath));
        for (String path : argList) {
            downloader.downloadFrom(new File(path));
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy