
package gobblin.ingestion.google.webmaster;

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.batch.BatchRequest;
import com.google.api.client.googleapis.batch.json.JsonBatchCallback;
import com.google.api.client.repackaged.com.google.common.base.Preconditions;
import com.google.api.services.webmasters.model.ApiDimensionFilter;
import com.google.api.services.webmasters.model.SearchAnalyticsQueryResponse;
import com.google.common.base.Optional;
import gobblin.util.ExecutorsUtils;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.*;


public class GoogleWebmasterDataFetcherImpl extends GoogleWebmasterDataFetcher {

  private final static Logger LOG = LoggerFactory.getLogger(GoogleWebmasterDataFetcherImpl.class);
  private final String _siteProperty;
  private final GoogleWebmasterClient _client;
  private final List<ProducerJob> _jobs;

  public GoogleWebmasterDataFetcherImpl(String siteProperty, Credential credential, String appName,
      List<ProducerJob> jobs) throws IOException {
    this(siteProperty, new GoogleWebmasterClientImpl(credential, appName), jobs);
  }

  /**
   * For test only
   */
  GoogleWebmasterDataFetcherImpl(String siteProperty, GoogleWebmasterClient client, List<ProducerJob> jobs)
      throws IOException {
    Preconditions.checkArgument(siteProperty.endsWith("/"), "The site property must end in \"/\"");
    _siteProperty = siteProperty;
    _client = client;
    _jobs = jobs;
  }

  /**
   * Due to a limitation of the API, we can get at most 5000 rows per request. Another limitation is that results are
   * sorted by click count in descending order, and rows with the same click count are ordered arbitrarily
   * (read more at https://developers.google.com/webmaster-tools/v3/searchanalytics). So we fetch all pages by
   * partitioning the URL space: whenever a partition returns 5000 rows, we split it into more granular partitions
   * and query again.
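   *
   * As a purely hypothetical illustration (example.com is not the real property): if querying the prefix
   * "https://www.example.com/" with a CONTAINS filter returns exactly 5000 rows, the prefix is expanded into
   * sub-prefixes such as "https://www.example.com/a", "https://www.example.com/b", ..., "https://www.example.com/9",
   * and each sub-prefix is queried again until every partition returns fewer than 5000 rows.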
   *
   */
  @Override
  public Collection<ProducerJob> getAllPages(String startDate, String endDate, String country, int rowLimit)
      throws IOException {
    if (!_jobs.isEmpty()) {
      LOG.info("Service got hot started.");
      return _jobs;
    }

    ApiDimensionFilter countryFilter = GoogleWebmasterFilter.countryEqFilter(country);

    List<Dimension> requestedDimensions = new ArrayList<>();
    requestedDimensions.add(GoogleWebmasterFilter.Dimension.PAGE);

    Collection<String> allPages =
        _client.getPages(_siteProperty, startDate, endDate, country, rowLimit, requestedDimensions,
            Arrays.asList(countryFilter), 0);
    int actualSize = allPages.size();

    if (rowLimit < GoogleWebmasterClient.API_ROW_LIMIT || actualSize < GoogleWebmasterClient.API_ROW_LIMIT) {
      LOG.info(String.format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize,
          _siteProperty, country, startDate, endDate));
    } else {
      int expectedSize = getPagesSize(startDate, endDate, country, requestedDimensions, Arrays.asList(countryFilter));
      LOG.info(String.format("Total number of pages is %d for market-%s from %s to %s", expectedSize,
          GoogleWebmasterFilter.countryFilterToString(countryFilter), startDate, endDate));
      Queue<Pair<String, FilterOperator>> jobs = new ArrayDeque<>();
      expandJobs(jobs, _siteProperty);

      allPages = getPages(startDate, endDate, requestedDimensions, countryFilter, jobs);
      allPages.add(_siteProperty);
      actualSize = allPages.size();
      if (actualSize != expectedSize) {
        LOG.warn(
            String.format("Expected page count for country-%s is %d, but only %d were fetched.", country, expectedSize,
                actualSize));
      }
      LOG.info(String.format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize,
          _siteProperty, country, startDate, endDate));
    }

    ArrayDeque<ProducerJob> jobs = new ArrayDeque<>(actualSize);
    for (String page : allPages) {
      jobs.add(new SimpleProducerJob(page, startDate, endDate));
    }
    return jobs;
  }

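  /**
   * Determine the total number of pages by probing successive offsets of API_ROW_LIMIT rows in parallel,
   * REQUESTS_COUNT_EACH_ROUND requests per round. A request that returns fewer than API_ROW_LIMIT rows reveals the
   * total count as its start offset plus the number of rows returned.
   */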
  private int getPagesSize(final String startDate, final String endDate, final String country,
      final List<Dimension> requestedDimensions, final List<ApiDimensionFilter> apiDimensionFilters)
      throws IOException {
    int REQUESTS_COUNT_EACH_ROUND = 4;
    //Each round will give you 20K(4*5000) pages. Doing 100 rounds can give 2 million pages.
    //Normally 1 week has less than 200K pages. So 2 million pages are more than enough.
    int MAXIMUM_ROUNDS = 100;
    final ExecutorService es = Executors.newFixedThreadPool(REQUESTS_COUNT_EACH_ROUND,
        ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOG), Optional.of(this.getClass().getSimpleName())));

    int startRow = 0;
    int r = 0;
    while (r < MAXIMUM_ROUNDS) {
      r++;
      List<Future<Integer>> results = new ArrayList<>(REQUESTS_COUNT_EACH_ROUND);

      for (int i = 0; i < REQUESTS_COUNT_EACH_ROUND; ++i) {
        startRow += GoogleWebmasterClient.API_ROW_LIMIT;
        final int start = startRow;
        //Submit the job.
        Future<Integer> submit = es.submit(new Callable<Integer>() {
          @Override
          public Integer call() {
            LOG.info(String.format("Getting page size from %s...", start));
            while (true) {
              if (Thread.interrupted()) {
                LOG.error(String.format(
                    "Interrupted while trying to get the size of all pages for %s. Current start row is %d.", country,
                    start));
                return -1;
              }
              try {
                List<String> pages =
                    _client.getPages(_siteProperty, startDate, endDate, country, GoogleWebmasterClient.API_ROW_LIMIT,
                        requestedDimensions, apiDimensionFilters, start);
                if (pages.size() < GoogleWebmasterClient.API_ROW_LIMIT) {
                  return pages.size() + start;  //Figured out the size
                } else {
                  return -1;
                }
              } catch (IOException e) {
                LOG.info(String.format("Getting page size from %s failed. Retrying...", start));
              }
              try {
                Thread.sleep(200);
              } catch (InterruptedException e) {
                LOG.error(e.getMessage());
                LOG.error(String.format(
                    "Interrupted while trying to get the size of all pages for %s. Current start row is %d.", country,
                    start));
                return -1;
              }
            }
          }
        });

        results.add(submit);
        try {
          //Send 4 jobs per second.
          Thread.sleep(250);
        } catch (InterruptedException e) {
          LOG.error(e.getMessage());
        }
      }

      for (Future<Integer> result : results) {
        try {
          Integer integer = result.get(2, TimeUnit.MINUTES);
          if (integer > 0) {
            es.shutdownNow();
            return integer;
          }
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        } catch (ExecutionException e) {
          throw new RuntimeException(e);
        } catch (TimeoutException e) {
          LOG.error("Exceeding the timeout of 2 minutes to get the total size of pages.");
          throw new RuntimeException(e);
        }
      }
    }
    throw new RuntimeException(String.format("Exceeding the limit of getting pages count. Having more than %d pages?",
        GoogleWebmasterClient.API_ROW_LIMIT * REQUESTS_COUNT_EACH_ROUND * MAXIMUM_ROUNDS));
  }

  /**
   * Get all pages asynchronously.
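   *
   * Pending prefix queries are processed round by round: each round drains the queue into a thread pool; any query
   * that fails is re-queued for the next round, and any query that returns a full page of API_ROW_LIMIT rows is
   * expanded into more granular prefixes for the next round. The loop ends when a round produces no follow-up work.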
   */
  private Collection<String> getPages(String startDate, String endDate, List<Dimension> dimensions,
      ApiDimensionFilter countryFilter, Queue<Pair<String, FilterOperator>> toProcess) throws IOException {
    String country = GoogleWebmasterFilter.countryFilterToString(countryFilter);

    ConcurrentLinkedDeque<String> allPages = new ConcurrentLinkedDeque<>();
    Random random = new Random();
    //We may need many rounds because
    // 1. the shared prefix path may be very long, so expansion can go many levels deep
    // 2. failed jobs and jobs that return a full page are pushed to the next round and processed again
    final int retry = 120;
    int r = 0;
    while (r <= retry) {
      ++r;
      LOG.info(String.format("Get pages at round %d with size %d.", r, toProcess.size()));
      ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound = new ConcurrentLinkedDeque<>();
      ExecutorService es = Executors.newFixedThreadPool(10,
          ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOG), Optional.of(this.getClass().getSimpleName())));

      while (!toProcess.isEmpty()) {
        submitJob(toProcess.poll(), countryFilter, startDate, endDate, dimensions, es, allPages, nextRound);
        try {
          //Not an arbitrary value; this delay has proven to work well in practice.
          Thread.sleep(275); //Submit roughly 4 jobs per second.
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      //wait for jobs to finish and start next round if necessary.
      try {
        es.shutdown();
        LOG.info(String.format("Wait for get-all-pages jobs to finish at round %d... Next round now has size %d.", r,
            nextRound.size()));
        boolean terminated = es.awaitTermination(5, TimeUnit.MINUTES);
        if (!terminated) {
          es.shutdownNow();
          LOG.warn(
              String.format("Timed out while getting all pages for country-%s at round %d. Next round now has size %d.",
                  country, r, nextRound.size()));
        }
        //Cool down before next round. Starting from about 1/3 of a second.
        Thread.sleep(333 + 50 * random.nextInt(r));
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }

      if (nextRound.isEmpty()) {
        break;
      }
      toProcess = nextRound;
    }
    //If toProcess is non-empty here, the loop exited because the retry limit was exhausted, not because all jobs finished.
    if (!toProcess.isEmpty()) {
      throw new RuntimeException(
          String.format("Getting all pages reached the maximum number of retries. Date range: %s ~ %s. Country: %s.",
              startDate, endDate, country));
    }
    return allPages;
  }

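  /**
   * Submit one page query to the executor. On an IOException the job is re-queued for the next round; if the query
   * returns exactly API_ROW_LIMIT rows, the prefix is expanded and an EQUALS check for the prefix itself is queued
   * for the next round; otherwise the returned pages are added to the result set.
   */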
  private void submitJob(final Pair<String, FilterOperator> job, final ApiDimensionFilter countryFilter,
      final String startDate, final String endDate, final List<Dimension> dimensions, ExecutorService es,
      final ConcurrentLinkedDeque<String> allPages,
      final ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound) {
    es.submit(new Runnable() {
      @Override
      public void run() {
        String countryString = countryFilterToString(countryFilter);
        List<ApiDimensionFilter> filters = new LinkedList<>();
        filters.add(countryFilter);

        String prefix = job.getLeft();
        FilterOperator operator = job.getRight();
        String jobString = String.format("job(prefix: %s, operator: %s)", prefix, operator);
        filters.add(GoogleWebmasterFilter.pageFilter(operator, prefix));
        List<String> pages;
        try {
          pages =
              _client.getPages(_siteProperty, startDate, endDate, countryString, GoogleWebmasterClient.API_ROW_LIMIT,
                  dimensions, filters, 0);
          LOG.debug(
              String.format("%d pages fetched for %s market-%s from %s to %s.", pages.size(), jobString, countryString,
                  startDate, endDate));
        } catch (IOException e) {
          //OnFailure
          LOG.debug(jobString + " failed. " + e.getMessage());
          nextRound.add(job);
          return;
        }

        //If the number of pages is at the LIMIT, it must be a "CONTAINS" job.
        //We need to create sub-tasks, and check current page with "EQUALS"
        if (pages.size() == GoogleWebmasterClient.API_ROW_LIMIT) {
          LOG.info(String.format("Expanding the prefix '%s'", prefix));
          expandJobs(nextRound, prefix);
          nextRound.add(Pair.of(prefix, FilterOperator.EQUALS));
        } else {
          //Otherwise, we're done with the current job.
          allPages.addAll(pages);
        }
      }
    });
  }

  private void expandJobs(Queue<Pair<String, FilterOperator>> jobs, String prefix) {
    for (String expanded : getUrlPartitions(prefix)) {
      jobs.add(Pair.of(expanded, FilterOperator.CONTAINS));
    }
  }

  /**
   * This doesn't cover all cases, but it captures more than 99.9% of URLs.
   *
   * According to the standard (RFC-3986), here are possible characters:
   * unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
   * reserved      = gen-delims / sub-delims
   * gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
   * sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
   *
   *
   * Not included:
   * reserved      = gen-delims / sub-delims
   * gen-delims    = "[" / "]"
   * sub-delims    = "(" / ")" / "," / ";"
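   *
   * For example (hypothetical prefix), expanding "https://www.example.com/b" yields "https://www.example.com/ba"
   * through "https://www.example.com/bz", "https://www.example.com/b0" through "https://www.example.com/b9", plus one
   * entry per special character listed above, e.g. "https://www.example.com/b-" and "https://www.example.com/b/".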
   */
  private ArrayList<String> getUrlPartitions(String prefix) {
    ArrayList<String> expanded = new ArrayList<>();
    //The page prefix is case-insensitive, so A-Z is not necessary.
    for (char c = 'a'; c <= 'z'; ++c) {
      expanded.add(prefix + c);
    }
    for (int num = 0; num <= 9; ++num) {
      expanded.add(prefix + num);
    }
    expanded.add(prefix + "-");
    expanded.add(prefix + ".");
    expanded.add(prefix + "_"); //most important
    expanded.add(prefix + "~");

    expanded.add(prefix + "/"); //most important
    expanded.add(prefix + "%"); //most important
    expanded.add(prefix + ":");
    expanded.add(prefix + "?");
    expanded.add(prefix + "#");
    expanded.add(prefix + "@");
    expanded.add(prefix + "!");
    expanded.add(prefix + "$");
    expanded.add(prefix + "&");
    expanded.add(prefix + "+");
    expanded.add(prefix + "*");
    expanded.add(prefix + "'");
    expanded.add(prefix + "=");
    return expanded;
  }

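  /**
   * Execute a single Search Analytics query for the given date range, dimensions and filters, and convert the
   * response rows into records via convertResponse.
   */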
  @Override
  public List<String[]> performSearchAnalyticsQuery(String startDate, String endDate, int rowLimit,
      List<Dimension> requestedDimensions, List<Metric> requestedMetrics, Collection<ApiDimensionFilter> filters)
      throws IOException {
    SearchAnalyticsQueryResponse response =
        _client.createSearchAnalyticsQuery(_siteProperty, startDate, endDate, requestedDimensions,
            GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).execute();
    return convertResponse(requestedMetrics, response);
  }

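  /**
   * Queue one Search Analytics query per ProducerJob onto a single BatchRequest and execute them together; the result
   * of each query is delivered to the JsonBatchCallback at the same index in callbackList.
   */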
  @Override
  public void performSearchAnalyticsQueryInBatch(List<ProducerJob> jobs, List<ArrayList<ApiDimensionFilter>> filterList,
      List<JsonBatchCallback<SearchAnalyticsQueryResponse>> callbackList, List<Dimension> requestedDimensions,
      int rowLimit) throws IOException {
    BatchRequest batchRequest = _client.createBatch();

    for (int i = 0; i < jobs.size(); ++i) {
      ProducerJob job = jobs.get(i);
      ArrayList<ApiDimensionFilter> filters = filterList.get(i);
      JsonBatchCallback<SearchAnalyticsQueryResponse> callback = callbackList.get(i);
      _client.createSearchAnalyticsQuery(_siteProperty, job.getStartDate(), job.getEndDate(), requestedDimensions,
          GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).queue(batchRequest, callback);
    }

    batchRequest.execute();
  }

  @Override
  public String getSiteProperty() {
    return _siteProperty;
  }
}



