All downloads are free. Search and download functionality uses the official Maven repository.

attic.java.cc.twittertools.download.VerifyJsonStatusBlockCrawl Maven / Gradle / Ivy

There is a newer version: 1.3.0
Show newest version
package cc.twittertools.download;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;

import cc.twittertools.corpus.data.JsonStatusBlockReader;
import cc.twittertools.corpus.data.Status;
import cc.twittertools.corpus.data.StatusStream;

import com.google.common.base.Preconditions;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.Response;

/**
 * Verifies a crawl of JSON status blocks against the list of tweet ids it was supposed to cover,
 * and optionally repairs the crawl by re-fetching the statuses that are missing.
 *
 * <p>Two inputs are required: the crawled statuses file (read through
 * {@link JsonStatusBlockReader}) and a tab-separated data file whose first column is the numeric
 * status id. The second column is passed on to {@link AsyncJsonStatusBlockCrawler#getUrl} when a
 * status has to be re-fetched (presumably the screen name; confirm against that helper).
 *
 * <p>Optional outputs are configured with the {@code with*} methods: the data lines whose status
 * was found, the data lines whose status was missing, and a gzipped file holding every status
 * (crawled plus re-fetched), one JSON string per line.
 *
 * <p>All text files are read and written as UTF-8.
 *
 * @deprecated originally designed for Twitter API v1.0, which no longer works with API v1.1.
 */
@Deprecated
public class VerifyJsonStatusBlockCrawl {
  private static final Logger LOG = Logger.getLogger(VerifyJsonStatusBlockCrawl.class);

  /** Charset used for every file this class reads or writes. */
  private static final String CHARSET = "UTF-8";

  /** Pause between two attempts at fetching the same status, in milliseconds. */
  private static final long RETRY_DELAY_MS = 1000;

  private final File data;
  private final File statuses;
  private final AsyncHttpClient client = new AsyncHttpClient();

  private File outputSuccess = null;
  private File outputFailure = null;
  private File repairedOutput = null;

  /**
   * Creates a verifier.
   *
   * @param data tab-separated file listing the status ids that should have been crawled
   * @param statuses file holding the crawled JSON statuses
   * @throws NullPointerException if either argument is {@code null}
   * @throws RuntimeException if {@code statuses} does not exist
   */
  public VerifyJsonStatusBlockCrawl(File data, File statuses) {
    this.statuses = Preconditions.checkNotNull(statuses);
    this.data = Preconditions.checkNotNull(data);

    if (!statuses.exists()) {
      throw new RuntimeException(statuses + " does not exist!");
    }
  }

  /**
   * Requests that every data line whose status was found be copied to {@code file}.
   *
   * @param file destination file, must not be {@code null}
   * @return this verifier, for chaining
   */
  public VerifyJsonStatusBlockCrawl withOutputSuccess(File file) {
    this.outputSuccess = Preconditions.checkNotNull(file);
    return this;
  }

  /**
   * Requests that every data line whose status was missing be copied to {@code file}.
   *
   * @param file destination file, must not be {@code null}
   * @return this verifier, for chaining
   */
  public VerifyJsonStatusBlockCrawl withOutputFailure(File file) {
    this.outputFailure = Preconditions.checkNotNull(file);
    return this;
  }

  /**
   * Enables repair mode: missing statuses are re-fetched over HTTP and the complete set of
   * statuses is written, gzipped, to {@code file}.
   *
   * @param file destination file, must not be {@code null}
   * @return this verifier, for chaining
   */
  public VerifyJsonStatusBlockCrawl withRepairedOutput(File file) {
    this.repairedOutput = Preconditions.checkNotNull(file);
    return this;
  }

  /**
   * Runs the verification, and the repair if a repaired output was configured.
   *
   * <p>The HTTP client owned by this instance is closed before the method returns, whether it
   * succeeds or fails.
   *
   * @return always {@code true}; the outcome is reported through the log and the output files
   * @throws IOException if a file cannot be read or written, or if the thread is interrupted
   *     while a missing status is being fetched
   * @throws RuntimeException if the statuses path is a directory
   * @throws NumberFormatException if the first column of a data line is not a number
   */
  public boolean verify() throws IOException {
    LOG.info(String.format("Reading statuses read from %s.", statuses));

    if (statuses.isDirectory()) {
      throw new RuntimeException(statuses + " cannot be a directory!");
    }

    Map<Long, String> ids = new HashMap<Long, String>();
    int cnt = loadStatuses(ids);
    LOG.info(String.format("Total of %d statuses read.", cnt));

    int totalCnt = 0;
    int successCnt = 0;
    int failureCnt = 0;
    int fetchedCnt = 0;
    int notAvailableCnt = 0;

    BufferedReader in = null;
    OutputStreamWriter successOut = null;
    OutputStreamWriter failureOut = null;
    try {
      in = new BufferedReader(new InputStreamReader(new FileInputStream(data), CHARSET));
      if (outputSuccess != null) {
        successOut = new OutputStreamWriter(new FileOutputStream(outputSuccess), CHARSET);
      }
      if (outputFailure != null) {
        failureOut = new OutputStreamWriter(new FileOutputStream(outputFailure), CHARSET);
      }

      String line;
      while ((line = in.readLine()) != null) {
        // A blank line (e.g. a trailing newline) carries no id; skip it instead of crashing.
        if (line.trim().length() == 0) {
          continue;
        }

        String[] arr = line.split("\\t");
        long id = Long.parseLong(arr[0]);
        totalCnt++;

        if (ids.containsKey(id)) {
          if (successOut != null) {
            successOut.write(line + "\n");
          }
          successCnt++;
          continue;
        }

        // Only bother re-fetching when a repaired output was requested.
        if (repairedOutput != null) {
          if (arr.length < 2) {
            // The fetch URL needs the second column; without it the status stays missing.
            LOG.warn(String.format("Missing status %d: cannot refetch, malformed line \"%s\".",
                id, line));
          } else {
            String body = fetchWithRetry(id, arr[1]).getResponseBody();
            if (isTweetNoLongerAvailable(body)) {
              LOG.info(String.format("Missing status %d: no longer available.", id));
              notAvailableCnt++;
            } else {
              LOG.info(String.format("Missing status %d: successfully fetched.", id));
              ids.put(id, body);
              fetchedCnt++;
            }
          }
        }

        // The line is reported as a failure even when the repair succeeded, because the
        // original crawl did not contain it.
        if (failureOut != null) {
          failureOut.write(line + "\n");
        }
        failureCnt++;
      }

      LOG.info(String.format("Total of %d statuses in %s.", cnt, statuses));
      LOG.info(String.format("Total of %d entries in %s.", totalCnt, data));
      LOG.info(String.format("%d statuses no longer available.", notAvailableCnt));
      LOG.info(String.format("%d missing statuses fetched.", fetchedCnt));

      if (cnt + notAvailableCnt + fetchedCnt == totalCnt) {
        LOG.info("SUCCESS! All statuses accounted for.");
      }

      // Close explicitly on the success path so that flush errors are reported to the caller.
      if (successOut != null) {
        LOG.info(String.format("Total of %d status id written to %s.", successCnt, outputSuccess));
        successOut.close();
      }
      if (failureOut != null) {
        LOG.info(String.format("Total of %d status id written to %s", failureCnt, outputFailure));
        failureOut.close();
      }
      in.close();
    } finally {
      // Cleanup for the failure path; closing an already-closed stream is a no-op.
      closeQuietly(successOut);
      closeQuietly(failureOut);
      closeQuietly(in);
      client.close();
    }

    if (repairedOutput != null) {
      writeRepairedOutput(ids);
    }

    return true;
  }

  /**
   * Reads every status from the statuses file into {@code ids}, keyed by status id.
   *
   * @param ids map receiving one entry (id to JSON string) per distinct status
   * @return the number of statuses read, duplicates included
   * @throws IOException if the statuses file cannot be read
   */
  private int loadStatuses(Map<Long, String> ids) throws IOException {
    // NOTE(review): the stream is never closed here, as in the original code; close it if
    // StatusStream exposes a close method.
    StatusStream stream = new JsonStatusBlockReader(statuses);

    int cnt = 0;
    Status status;
    while ((status = stream.next()) != null) {
      ids.put(status.getId(), status.getJsonString());
      cnt++;
    }
    return cnt;
  }

  /**
   * Fetches one status, retrying without limit while the request fails or the server answers
   * with a 5xx status code.
   *
   * @param id id of the status to fetch
   * @param secondColumn second column of the data line, forwarded to the URL builder
   * @return the first response whose status code is below 500
   * @throws IOException if the request cannot be issued, or if the thread is interrupted while
   *     waiting for the response or for the next attempt (the interrupt flag is restored)
   */
  private Response fetchWithRetry(long id, String secondColumn) throws IOException {
    while (true) {
      try {
        Response response = client.prepareGet(
            AsyncJsonStatusBlockCrawler.getUrl(
                AsyncJsonStatusBlockCrawler.DEFAULT_URL_PREFIX, id, secondColumn))
            .execute().get();

        if (response.getStatusCode() < 500) {
          return response;
        }
        LOG.warn("Error: retrying.");
      } catch (InterruptedException e) {
        // Retrying after an interrupt would spin forever; hand the interrupt to the caller.
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted while fetching status " + id, e);
      } catch (ExecutionException e) {
        LOG.warn("Error: retrying.", e);
      }

      try {
        Thread.sleep(RETRY_DELAY_MS);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted while waiting to retry status " + id, e);
      }
    }
  }

  /**
   * Writes every known status to the repaired output file, gzipped, one JSON string per line.
   *
   * @param ids statuses to write, keyed by status id
   * @throws IOException if the output file cannot be written
   */
  private void writeRepairedOutput(Map<Long, String> ids) throws IOException {
    LOG.info("Writing tweets...");
    int written = 0;
    OutputStreamWriter out = new OutputStreamWriter(
        new GZIPOutputStream(new FileOutputStream(repairedOutput)), CHARSET);
    try {
      for (Map.Entry<Long, String> entry : ids.entrySet()) {
        written++;
        out.write(entry.getValue() + "\n");
      }
    } finally {
      out.close();
    }
    LOG.info(written + " statuses written.");
    LOG.info("Done!");
  }

  /** Closes {@code resource} if it is not {@code null}, ignoring any failure. */
  private static void closeQuietly(java.io.Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (IOException ignored) {
      // Best-effort cleanup: either another exception is already propagating or the resource
      // was closed on the success path.
    }
  }

  /**
   * Tells whether a response body means that the tweet can no longer be retrieved.
   *
   * @param s response body; {@code null} is treated like an empty body
   * @return {@code true} if the body is {@code null}, empty, or contains one of the two known
   *     error messages
   */
  public static boolean isTweetNoLongerAvailable(String s) {
    if (s == null || s.isEmpty()) {
      return true;
    }
    return s.contains("Sorry, you are not authorized to see this status.")
        || s.contains("No status found with that ID.");
  }

  private static final String STATUSES_OPTION = "statuses_input";
  private static final String STATUSES_REPAIRED_OPTION = "statuses_repaired";
  private static final String DATA_OPTION = "data";
  private static final String OUTPUT_SUCCESS_OPTION = "output_success";
  private static final String OUTPUT_FAILURE_OPTION = "output_failure";

  /**
   * Command-line entry point. The {@code statuses_input} and {@code data} options are required;
   * usage is printed and the JVM exits with status -1 when either is missing or when the
   * arguments cannot be parsed.
   *
   * @param args command-line arguments
   * @throws Exception if the verification fails
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("input JSON statuses")
        .create(STATUSES_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("repaired JSON statuses")
        .create(STATUSES_REPAIRED_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("data file with tweet ids").create(DATA_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output file for tweet fetch successes").create(OUTPUT_SUCCESS_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output file for tweet fetch failures").create(OUTPUT_FAILURE_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!cmdline.hasOption(STATUSES_OPTION) || !cmdline.hasOption(DATA_OPTION)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(VerifyJsonStatusBlockCrawl.class.getName(), options);
      System.exit(-1);
    }

    VerifyJsonStatusBlockCrawl v = new VerifyJsonStatusBlockCrawl(new File(cmdline.getOptionValue(DATA_OPTION)),
        new File(cmdline.getOptionValue(STATUSES_OPTION)));

    if (cmdline.hasOption(OUTPUT_SUCCESS_OPTION)) {
      v.withOutputSuccess(new File(cmdline.getOptionValue(OUTPUT_SUCCESS_OPTION)));
    }

    if (cmdline.hasOption(OUTPUT_FAILURE_OPTION)) {
      v.withOutputFailure(new File(cmdline.getOptionValue(OUTPUT_FAILURE_OPTION)));
    }

    if (cmdline.hasOption(STATUSES_REPAIRED_OPTION)) {
      v.withRepairedOutput(new File(cmdline.getOptionValue(STATUSES_REPAIRED_OPTION)));
    }

    v.verify();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy