attic.java.cc.twittertools.download.AsyncJsonStatusBlockCrawler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of twitter-tools Show documentation
Twitter tools for researchers
package cc.twittertools.download;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Map;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;
import cc.twittertools.corpus.data.Status;
import cc.twittertools.corpus.demo.ReadStatuses;
import com.google.common.base.Preconditions;
import com.ning.http.client.AsyncCompletionHandler;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.Response;
// NOTE: this was originally designed for Twitter API v1.0, which no longer works with API v1.1
@Deprecated
public class AsyncJsonStatusBlockCrawler {
  private static final Logger LOG = Logger.getLogger(AsyncJsonStatusBlockCrawler.class);

  private static final int MAX_RETRY_ATTEMPTS = 3;

  public static final String DEFAULT_URL_PREFIX = "http://twitter.com";

  // Change these values at your own risk.
  private static final int TWEET_BLOCK_SIZE = 500;
  private static final int TWEET_BLOCK_SLEEP = 5000;

  private final File file;
  private final String output;
  private final AsyncHttpClient asyncHttpClient;

  // Tweet id -> raw JSON body. A sorted concurrent map, so completion handlers running on
  // client threads can insert safely and the output is written in ascending id order.
  private final ConcurrentSkipListMap<Long, String> tweets =
      new ConcurrentSkipListMap<Long, String>();
  // Tweet id -> number of retry attempts made so far.
  private final ConcurrentSkipListMap<Long, Integer> retries =
      new ConcurrentSkipListMap<Long, Integer>();

  private final String prefix;

  /**
   * Creates a crawler reading tweet ids from {@code file} and writing gzipped JSON statuses to
   * {@code output}, using {@link #DEFAULT_URL_PREFIX}.
   *
   * @param file tab-separated input file: tweet id, then username
   * @param output path of the gzipped output file
   * @throws IOException if {@code file} does not exist
   */
  public AsyncJsonStatusBlockCrawler(File file, String output) throws IOException {
    this(file, output, DEFAULT_URL_PREFIX);
  }

  /**
   * Creates a crawler reading tweet ids from {@code file} and writing gzipped JSON statuses to
   * {@code output}.
   *
   * @param file tab-separated input file: tweet id, then username
   * @param output path of the gzipped output file
   * @param prefix URL prefix of the statuses/show endpoint
   * @throws IOException if {@code file} does not exist
   */
  public AsyncJsonStatusBlockCrawler(File file, String output, String prefix) throws IOException {
    this.file = Preconditions.checkNotNull(file);
    this.output = Preconditions.checkNotNull(output);
    this.prefix = Preconditions.checkNotNull(prefix);

    if (!file.exists()) {
      throw new IOException(file + " does not exist!");
    }

    this.asyncHttpClient = new AsyncHttpClient();
  }

  /**
   * Builds the statuses/show URL for a tweet. The username does not appear in the URL but is
   * still validated for non-nullity to preserve the original contract.
   *
   * @param prefix URL prefix, e.g. {@link #DEFAULT_URL_PREFIX}
   * @param id tweet id
   * @param username tweet author (must be non-null; otherwise unused)
   * @return the fetch URL for this tweet
   */
  public static String getUrl(String prefix, long id, String username) {
    Preconditions.checkNotNull(username);
    return String.format("%s/statuses/show/%s.json", prefix, id);
  }

  /**
   * Reads every id from the input file, fetches each status asynchronously (throttled in blocks
   * of {@link #TWEET_BLOCK_SIZE} requests), waits for outstanding requests, then writes the
   * collected JSON — one status per line, in ascending id order — to the gzipped output file.
   *
   * @throws IOException if the output file cannot be written
   */
  public void fetch() throws IOException {
    long start = System.currentTimeMillis();
    LOG.info("Processing " + file);

    int cnt = 0;
    BufferedReader data = null;
    try {
      // Read as UTF-8 explicitly rather than relying on the platform default charset.
      data = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
      String line;
      while ((line = data.readLine()) != null) {
        String[] arr = line.split("\t");
        long id = Long.parseLong(arr[0]);
        String username = arr[1];
        String url = getUrl(prefix, id, username);
        asyncHttpClient.prepareGet(url).execute(new TweetFetcherHandler(id, username));

        cnt++;
        // Throttle: pause after every block of requests so we don't flood the server.
        if (cnt % TWEET_BLOCK_SIZE == 0) {
          LOG.info(cnt + " requests submitted");
          try {
            Thread.sleep(TWEET_BLOCK_SLEEP);
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
          }
        }
      }
    } catch (IOException e) {
      LOG.error("Error reading " + file, e);
    } finally {
      if (data != null) { // guard: the reader may never have been opened
        data.close();
      }
    }

    // Wait for the last requests to complete.
    try {
      Thread.sleep(10000);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt(); // preserve interrupt status
    }
    asyncHttpClient.close();

    long end = System.currentTimeMillis();
    long duration = end - start;
    LOG.info("Total request submitted: " + cnt);
    LOG.info(tweets.size() + " tweets fetched in " + duration + "ms");

    LOG.info("Writing tweets...");
    int written = 0;
    OutputStreamWriter out =
        new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(output)), "UTF-8");
    try {
      for (Map.Entry<Long, String> entry : tweets.entrySet()) {
        written++;
        out.write(entry.getValue() + "\n");
      }
    } finally {
      // Always close so the gzip trailer is flushed even if a write fails.
      out.close();
    }
    LOG.info(written + " statuses written.");
    LOG.info("Done!");
  }

  /**
   * Completion handler for a single status fetch. Stores well-formed JSON results in
   * {@code tweets}; retries on server errors (5xx), truncated JSON, or transport failures.
   */
  private class TweetFetcherHandler extends AsyncCompletionHandler<Response> {
    private final long id;
    private final String username;

    public TweetFetcherHandler(long id, String username) {
      this.id = id;
      this.username = username;
    }

    @Override
    public Response onCompleted(Response response) throws Exception {
      if (response.getStatusCode() == 200) {
        String s = response.getResponseBody();
        if ("".equals(s)) {
          LOG.warn("Empty result: " + id);
          return response;
        }

        // Try to decode the JSON. If there's an exception, it means we got an incomplete
        // JSON result. Try again.
        try {
          Status.fromJson(s).getId();
        } catch (Exception e) {
          LOG.warn("Incomplete JSON status: " + id);
          retry(getUrl(prefix, id, username), id, username);
          return response;
        }

        tweets.put(id, s);
      } else if (response.getStatusCode() >= 500) {
        // Server-side error: retry by submitting another request.
        LOG.warn("Error status " + response.getStatusCode() + ": " + id);
        retry(getUrl(prefix, id, username), id, username);
      }
      return response;
    }

    @Override
    public void onThrowable(Throwable t) {
      // Transport-level failure: retry by submitting another request.
      LOG.warn("Error: " + t);
      try {
        retry(getUrl(prefix, id, username), id, username);
      } catch (IOException e) {
        LOG.error("Unable to schedule retry for " + id, e);
      }
    }

    /**
     * Waits briefly, then resubmits the request unless {@code id} has already been retried
     * {@link #MAX_RETRY_ATTEMPTS} times.
     */
    private void retry(String url, long id, String username) throws IOException {
      // Wait before retrying.
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // preserve interrupt status
      }

      int attempts;
      // Synchronize on the shared map: the original synchronized the handler instance, which
      // was ineffective because every retry submits a brand-new handler.
      synchronized (retries) {
        Integer prev = retries.get(id);
        attempts = (prev == null) ? 1 : prev + 1;
        if (attempts > MAX_RETRY_ATTEMPTS) {
          // Fixes the original off-by-one (post-increment check) that allowed one extra retry.
          LOG.warn("Abandoning: " + url + " after max retry attempts");
          return;
        }
        retries.put(id, attempts);
      }

      LOG.warn("Retrying: " + url + " attempt " + attempts);
      asyncHttpClient.prepareGet(url).execute(new TweetFetcherHandler(id, username));
    }
  }

  private static final String URL_PREFIX_OPTION = "url_prefix";
  private static final String DATA_OPTION = "data";
  private static final String OUTPUT_OPTION = "output";

  @SuppressWarnings("static-access")
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("url").hasArg()
        .withDescription("URL prefix (optional argument)").create(URL_PREFIX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("data file with tweet ids").create(DATA_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output file").create(OUTPUT_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!cmdline.hasOption(DATA_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
      HelpFormatter formatter = new HelpFormatter();
      // Fix: print this class's name in the usage message, not ReadStatuses (copy-paste slip).
      formatter.printHelp(AsyncJsonStatusBlockCrawler.class.getName(), options);
      System.exit(-1);
    }

    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    if (!output.endsWith(".gz")) {
      output += ".gz";
      LOG.warn("Output file specified does not contain the .gz suffix. Appending automatically.");
    }

    if (cmdline.hasOption(URL_PREFIX_OPTION)) {
      new AsyncJsonStatusBlockCrawler(new File(cmdline.getOptionValue(DATA_OPTION)),
          output, cmdline.getOptionValue(URL_PREFIX_OPTION)).fetch();
    } else {
      new AsyncJsonStatusBlockCrawler(new File(cmdline.getOptionValue(DATA_OPTION)), output).fetch();
    }
  }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy