
// eu.fbk.twm.wiki.wikipedia.LoadStatisticsFromWeb Maven / Gradle / Ivy
package eu.fbk.twm.wiki.wikipedia;
import org.apache.commons.cli.*;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Command-line tool that mirrors the "pagecounts" files linked from a web page into a local
 * folder.
 *
 * <p>The page at the given URL is fetched and, for every anchor whose text starts with
 * {@code pagecounts}, the file named by the anchor's {@code href} is downloaded into the output
 * folder. Unless the output folder is cleaned first, {@code md5sums.txt} is read from the same
 * URL and local files whose MD5 matches the listed checksum are skipped.
 *
 * <p>Created on 1/31/13, 2:45 PM.
 *
 * @author aprosio
 * @deprecated this class is annotated as deprecated; no replacement is named in this file.
 */
@Deprecated public class LoadStatisticsFromWeb {

    /** Default pause between two downloads, in milliseconds. */
    static final int sleepTime = 10000;

    /** Separates the checksum from the file name on a line of {@code md5sums.txt}. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Entry point.
     *
     * <p>Options: {@code -u <url>} (required) page listing the files, {@code -o <folder>}
     * (required) output folder, {@code -s <milliseconds>} pause between downloads,
     * {@code -c} clean the output folder first, {@code -h} print usage.
     *
     * <p>The log4j configuration is read from the file named by the {@code log-config} system
     * property, defaulting to {@code configuration/log-config.txt}.
     *
     * <p>Exits with status 0 after printing help on request, and with status 1 when the command
     * line cannot be parsed or the output folder cannot be prepared.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        Logger logger = Logger.getLogger(LoadStatisticsFromWeb.class.getName());
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = buildOptions();
        CommandLine commandLine = null;
        try {
            commandLine = new PosixParser().parse(options, args);
            if (commandLine.hasOption("help")) {
                // An empty message marks "help requested" as opposed to a real parse error.
                throw new ParseException("");
            }
        } catch (ParseException exp) {
            String message = exp.getMessage();
            boolean isError = message != null && message.length() > 0;
            System.out.println();
            if (isError) {
                System.out.println("ERR: " + message);
                System.out.println();
            }
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(400, "java -cp dist/thewikimachine.jar " + LoadStatisticsFromWeb.class.getName(), "\n", options, "\n", true);
            System.out.println();
            // Only an explicit help request is a success.
            System.exit(isError ? 1 : 0);
            return;
        }

        String output = withTrailing(commandLine.getOptionValue('o'), File.separator);
        // URLs always use '/', whatever the platform's file separator is.
        String baseUrl = withTrailing(commandLine.getOptionValue('u'), "/");
        int sleep = parseSleep(commandLine.getOptionValue('s'), logger);

        Map<String, String> md5;
        if (commandLine.hasOption('c')) {
            logger.info("Cleaning the folder");
            if (!cleanFolder(new File(output), logger)) {
                System.exit(1);
                return;
            }
            // Nothing is left to compare against after cleaning, so no checksums are needed.
            md5 = new HashMap<String, String>();
        } else {
            logger.info("Loading MD5 file");
            md5 = loadMd5Sums(baseUrl, logger);
        }

        download(baseUrl, output, md5, sleep, logger);
    }

    /** Builds the command-line options accepted by {@link #main(String[])}. */
    private static Options buildOptions() {
        Options options = new Options();
        options.addOption(OptionBuilder.withDescription("URL of page with statistics").isRequired().hasArg().withArgName("url").create("u"));
        options.addOption(OptionBuilder.withDescription("Output folder").isRequired().hasArg().withArgName("folder").create("o"));
        // The long name makes both "-s" and "--sleep" resolve to this option.
        options.addOption(OptionBuilder.withDescription("Sleep time (default " + sleepTime + ")").withLongOpt("sleep").hasArg().withArgName("milliseconds").create("s"));
        options.addOption("c", "clean", false, "Clean the output folder before writing on it");
        options.addOption("h", "help", false, "Print this message");
        return options;
    }

    /** Returns {@code value}, with {@code suffix} appended unless it already ends with it. */
    private static String withTrailing(String value, String suffix) {
        return value.endsWith(suffix) ? value : value + suffix;
    }

    /**
     * Parses the pause between downloads.
     *
     * @param value the raw option value, or {@code null} when the option was not given
     * @param logger receives a warning when the value is unusable
     * @return the parsed value, or {@link #sleepTime} when it is missing, not an integer, or negative
     */
    private static int parseSleep(String value, Logger logger) {
        if (value == null) {
            return sleepTime;
        }
        try {
            int parsed = Integer.parseInt(value.trim());
            if (parsed >= 0) {
                return parsed;
            }
            logger.warn("Negative sleep time " + parsed + ", using default " + sleepTime);
        } catch (NumberFormatException e) {
            logger.warn("Invalid sleep time '" + value + "', using default " + sleepTime);
        }
        return sleepTime;
    }

    /**
     * Creates {@code dir} if it does not exist, otherwise deletes its direct entries.
     * Entries that cannot be deleted (for example non-empty sub-folders) are reported and left
     * in place.
     *
     * @return {@code false} when the folder could not be created or listed
     */
    private static boolean cleanFolder(File dir, Logger logger) {
        if (!dir.exists()) {
            if (!dir.mkdirs()) {
                logger.error("Unable to create directory " + dir.getPath());
                return false;
            }
            return true;
        }
        // list() returns null when the path is not a directory or an I/O error occurs.
        String[] names = dir.list();
        if (names == null) {
            logger.error("Unable to list directory " + dir.getPath());
            return false;
        }
        for (String name : names) {
            File entry = new File(dir, name);
            if (!entry.delete()) {
                logger.warn("Unable to delete " + entry.getPath());
            }
        }
        return true;
    }

    /**
     * Reads {@code md5sums.txt} from {@code baseUrl}. Each line is split on whitespace; the first
     * token is taken as the checksum and the second as the file name. Lines with fewer than two
     * tokens are ignored.
     *
     * @return file name to checksum; holds whatever was read before a failure, possibly nothing
     */
    private static Map<String, String> loadMd5Sums(String baseUrl, Logger logger) {
        Map<String, String> md5 = new HashMap<String, String>();
        BufferedReader reader = null;
        try {
            URL website = new URL(baseUrl + "md5sums.txt");
            reader = new BufferedReader(new InputStreamReader(website.openStream(), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = WHITESPACE.split(line);
                if (parts.length < 2) {
                    continue;
                }
                md5.put(parts[1], parts[0]);
            }
        } catch (IOException e) {
            // Best effort: files without a known checksum are simply downloaded again.
            logger.error("Unable to load " + baseUrl + "md5sums.txt", e);
        } finally {
            closeQuietly(reader);
        }
        return md5;
    }

    /**
     * Downloads every "pagecounts" file linked from the page at {@code baseUrl} into
     * {@code output}, pausing {@code sleep} milliseconds after each download. A local file is
     * kept when its MD5 equals the checksum recorded for it in {@code md5}; otherwise it is
     * deleted and downloaded again. The first failure stops the whole run.
     */
    private static void download(String baseUrl, String output, Map<String, String> md5, int sleep, Logger logger) {
        try {
            String outputRoot = new File(output).getCanonicalPath();
            if (!outputRoot.endsWith(File.separator)) {
                outputRoot += File.separator;
            }
            Document doc = Jsoup.connect(baseUrl).get();
            Elements links = doc.select("a");
            for (Element link : links) {
                if (!link.html().startsWith("pagecounts")) {
                    continue;
                }
                String fileToDownload = link.attr("href");
                File localFile = new File(output + fileToDownload);
                // The href comes from the remote page: never write outside the output folder.
                if (!localFile.getCanonicalPath().startsWith(outputRoot)) {
                    logger.warn("Skipping link pointing outside the output folder: " + fileToDownload);
                    continue;
                }
                if (localFile.exists()) {
                    if (md5Hex(localFile).equals(md5.get(fileToDownload))) {
                        logger.info("Skipping file " + fileToDownload);
                        continue;
                    }
                    if (!localFile.delete()) {
                        logger.warn("Unable to delete " + localFile.getPath());
                    }
                }
                logger.info("Downloading " + fileToDownload);
                FileUtils.copyURLToFile(new URL(baseUrl + fileToDownload), localFile);
                Thread.sleep(sleep);
            }
        } catch (InterruptedException e) {
            // Restore the flag so the interruption stays visible to the rest of the thread.
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while waiting between downloads, stopping");
        } catch (Exception e) {
            logger.error("Download from " + baseUrl + " failed", e);
        }
    }

    /** Returns the hexadecimal MD5 of {@code file}, closing the file afterwards. */
    private static String md5Hex(File file) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            return DigestUtils.md5Hex(in);
        } finally {
            closeQuietly(in);
        }
    }

    /** Closes {@code resource} if it is not {@code null}, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close of a read-only stream.
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy