// org.wikibrain.download.DumpLinkInfo (Maven / Gradle / Ivy)
package org.wikibrain.download;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A wrapper class for storing and processing information about a dump link.
 * Contains a static parser method to generate DumpLinkInfo instances from a
 * tab-separated file, and getters/setters for all parameters plus derived
 * information (local path, file name) used when downloading a dump.
 *
 * @author Ari Weiland
 */
public class DumpLinkInfo {

    private static final Logger LOG = LoggerFactory.getLogger(DumpLinkInfo.class);

    // Column positions within one tab-separated line read by parseFile.
    private static final int LANG_INDEX = 0;
    private static final int DATE_INDEX = 1;
    private static final int MATCHER_INDEX = 2;
    private static final int COUNTER_INDEX = 3;
    private static final int URL_INDEX = 4;
    private static final int MD5_INDEX = 5;

    /** Number of mandatory columns; the MD5 column is optional. */
    private static final int REQUIRED_FIELDS = 5;

    private Language language;
    private String date;
    private FileMatcher linkMatcher;
    private URL url;
    private String md5;
    private int counter;

    /**
     * Creates a dump link description from already-resolved values.
     * The MD5 checksum is left unset (null) by this constructor.
     *
     * @param language    language of the dump
     * @param date        date string of the dump
     * @param linkMatcher matcher identifying the kind of dump file
     * @param url         location of the dump file
     * @param counter     counter value associated with this link
     */
    public DumpLinkInfo(Language language, String date, FileMatcher linkMatcher, URL url, int counter) {
        this.language = language;
        this.date = date;
        this.linkMatcher = linkMatcher;
        this.url = url;
        this.counter = counter;
    }

    /**
     * Creates a dump link description from string values, resolving the
     * language via {@code Language.getByLangCode}, the matcher via
     * {@code FileMatcher.getByName} and the URL via {@code new URL(String)}.
     *
     * @param langCode    language code of the dump
     * @param date        date string of the dump
     * @param linkMatcher name of the FileMatcher
     * @param url         location of the dump file
     * @param md5         MD5 checksum of the dump file, may be null
     * @param counter     counter value associated with this link
     * @throws MalformedURLException if {@code url} cannot be parsed as a URL
     */
    public DumpLinkInfo(String langCode, String date, String linkMatcher, String url, String md5, int counter) throws MalformedURLException {
        this.language = Language.getByLangCode(langCode);
        this.date = date;
        this.linkMatcher = FileMatcher.getByName(linkMatcher);
        this.url = new URL(url);
        this.md5 = md5;
        this.counter = counter;
    }

    public Language getLanguage() {
        return language;
    }

    public void setLanguage(Language language) {
        this.language = language;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public FileMatcher getLinkMatcher() {
        return linkMatcher;
    }

    public void setLinkMatcher(FileMatcher linkMatcher) {
        this.linkMatcher = linkMatcher;
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    /**
     * @return the MD5 checksum, or null if none was provided
     */
    public String getMd5() {
        return md5;
    }

    public void setMd5(String md5) {
        this.md5 = md5;
    }

    public int getCounter() {
        return counter;
    }

    public void setCounter(int counter) {
        this.counter = counter;
    }

    /**
     * Returns a string for the local path in which to save this dump file.
     *
     * @return the language code and the date joined by "/"
     */
    public String getLocalPath() {
        return language.getLangCode() + "/" + date;
    }

    /**
     * Returns a string for the file name with which to save this dump file.
     *
     * @return the last name component of the URL's path
     */
    public String getFileName() {
        return FilenameUtils.getName(url.getPath());
    }

    /**
     * Returns everything after the last "/" of the full URL string.
     * Unlike {@link #getFileName()}, this works on the whole URL text
     * rather than only on its path component.
     *
     * @return the trailing segment of the URL string
     */
    public String getDownloadName() {
        String full = url.toString();
        return full.substring(full.lastIndexOf("/") + 1);
    }

    /**
     * Parses a file of info pertaining to dump links into a cluster of DumpLinkInfo.
     * Each DumpLink reference must be on its own line, with tab-separated fields
     * in this order: lang code, date, FileMatcher name, counter, URL and,
     * optionally, MD5 checksum.
     * <p>
     * Blank lines are ignored. Lines with too few fields, a non-numeric counter
     * or a malformed URL are logged as warnings and skipped.
     *
     * @param file the file to read, decoded as UTF-8
     * @return the cluster containing every successfully parsed link
     * @throws RuntimeException wrapping the IOException if the file cannot be read
     */
    public static DumpLinkCluster parseFile(File file) {
        InputStream stream = null;
        try {
            stream = FileUtils.openInputStream(file);
            List<String> lines = IOUtils.readLines(stream, "UTF-8");
            DumpLinkCluster dumpLinks = new DumpLinkCluster();
            for (String line : lines) {
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank lines, e.g. a trailing newline
                }
                DumpLinkInfo info = parseLine(line);
                if (info != null) {
                    dumpLinks.add(info);
                }
            }
            return dumpLinks;
        } catch (IOException e) {
            throw new RuntimeException(e); // Something went horribly wrong!
        } finally {
            IOUtils.closeQuietly(stream); // null-safe
        }
    }

    /**
     * Parses one tab-separated line into a DumpLinkInfo.
     *
     * @param line a non-blank line of the dump link file
     * @return the parsed info, or null if the line is malformed (a warning is logged)
     */
    private static DumpLinkInfo parseLine(String line) {
        String[] fields = line.split("\t");
        if (fields.length < REQUIRED_FIELDS) {
            LOG.warn("Skipping dump link line with too few fields: \"" + line + "\"");
            return null;
        }

        int counter;
        try {
            counter = Integer.parseInt(fields[COUNTER_INDEX]);
        } catch (NumberFormatException e) {
            LOG.warn("Invalid counter \"" + fields[COUNTER_INDEX] + "\" : ", e);
            return null;
        }

        String url = fields[URL_INDEX];
        // The checksum column is optional.
        String md5 = fields.length > MD5_INDEX ? fields[MD5_INDEX] : null;
        try {
            return new DumpLinkInfo(
                    fields[LANG_INDEX],
                    fields[DATE_INDEX],
                    fields[MATCHER_INDEX],
                    url,
                    md5,
                    counter
            );
        } catch (MalformedURLException e) {
            LOG.warn("Malformed URL \"" + url + "\" : ", e);
            return null;
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy