org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterDataFetcherImpl

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.ingestion.google.webmaster;

import com.google.api.client.googleapis.batch.BatchRequest;
import com.google.api.client.googleapis.batch.json.JsonBatchCallback;
import com.google.api.client.repackaged.com.google.common.base.Preconditions;
import com.google.api.services.webmasters.model.ApiDimensionFilter;
import com.google.api.services.webmasters.model.SearchAnalyticsQueryResponse;
import com.google.common.base.Optional;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.limiter.RateBasedLimiter;

import static org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.*;


@Slf4j
public class GoogleWebmasterDataFetcherImpl extends GoogleWebmasterDataFetcher {
  private final double API_REQUESTS_PER_SECOND;
  private final RateBasedLimiter LIMITER;
  private final int PAGES_COUNT_COOLDOWN_TIME; //In seconds
  private final int PAGES_GET_COOLDOWN_TIME; //In seconds
  private final int GET_PAGES_RETRIES;

  private final String _siteProperty;
  private final GoogleWebmasterClient _client;
  private final List<ProducerJob> _jobs;

  GoogleWebmasterDataFetcherImpl(String siteProperty, GoogleWebmasterClient client, State wuState) throws IOException {
    _siteProperty = siteProperty;
    Preconditions.checkArgument(_siteProperty.endsWith("/"), "The site property must end in \"/\"");
    _client = client;
    _jobs = getHotStartJobs(wuState);
    API_REQUESTS_PER_SECOND = wuState.getPropAsDouble(GoogleWebMasterSource.KEY_PAGES_TUNING_REQUESTS_PER_SECOND, 4.5);
    PAGES_COUNT_COOLDOWN_TIME = wuState.getPropAsInt(GoogleWebMasterSource.KEY_PAGES_COUNT_TUNING_COOLDOWN_TIME, 30);
    PAGES_GET_COOLDOWN_TIME = wuState.getPropAsInt(GoogleWebMasterSource.KEY_PAGES_GET_TUNING_COOLDOWN_TIME, 5);
    LIMITER = new RateBasedLimiter(API_REQUESTS_PER_SECOND, TimeUnit.SECONDS);
    GET_PAGES_RETRIES = wuState.getPropAsInt(GoogleWebMasterSource.KEY_PAGES_TUNING_MAX_RETRIES, 120);
  }

  private static List<ProducerJob> getHotStartJobs(State wuState) {
    String hotStartString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_HOT_START, "");
    if (!hotStartString.isEmpty()) {
      return SimpleProducerJob.deserialize(hotStartString);
    }
    return new ArrayList<>();
  }

  /**
   * Due to the limitation of the API, we can get a maximum of 5000 rows at a time. Another limitation is that
   * results are sorted by click count in descending order, and rows with the same click count are ordered
   * arbitrarily. (Read more at https://developers.google.com/webmaster-tools/v3/searchanalytics.) So we fetch
   * all pages by partitions: whenever a partition comes back with the full 5000 rows, we split it into more
   * granular sub-partitions until every partition fits within the limit.
   */
  @Override
  public Collection<ProducerJob> getAllPages(String startDate, String endDate, String country, int rowLimit)
      throws IOException {
    log.info("Requested row limit: " + rowLimit);
    if (!_jobs.isEmpty()) {
      log.info("Service got hot started.");
      return _jobs;
    }
    ApiDimensionFilter countryFilter = GoogleWebmasterFilter.countryEqFilter(country);

    List<Dimension> requestedDimensions = new ArrayList<>();
    requestedDimensions.add(GoogleWebmasterFilter.Dimension.PAGE);
    int expectedSize = -1;
    if (rowLimit >= GoogleWebmasterClient.API_ROW_LIMIT) {
      //expected size only makes sense when the data set size is larger than GoogleWebmasterClient.API_ROW_LIMIT
      expectedSize = getPagesSize(startDate, endDate, country, requestedDimensions, Arrays.asList(countryFilter));
      log.info(String.format("Expected number of pages is %d for market-%s from %s to %s", expectedSize,
          GoogleWebmasterFilter.countryFilterToString(countryFilter), startDate, endDate));
    }

    Queue<Pair<String, FilterOperator>> jobs = new ArrayDeque<>();
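    //Seed the expansion with the site property itself: a CONTAINS filter on the site prefix matches every page.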
    jobs.add(Pair.of(_siteProperty, FilterOperator.CONTAINS));

    Collection<String> allPages = getPages(startDate, endDate, requestedDimensions, countryFilter, jobs,
        Math.min(rowLimit, GoogleWebmasterClient.API_ROW_LIMIT));
    int actualSize = allPages.size();
    log.info(String.format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize,
        _siteProperty, country, startDate, endDate));

    if (expectedSize != -1 && actualSize != expectedSize) {
      log.warn(String.format("Expected page size is %d, but only able to get %d", expectedSize, actualSize));
    }

    ArrayDeque<ProducerJob> producerJobs = new ArrayDeque<>(actualSize);
    for (String page : allPages) {
      producerJobs.add(new SimpleProducerJob(page, startDate, endDate));
    }
    return producerJobs;
  }

  /**
   * @return the total number of pages in the data set
   */
  int getPagesSize(final String startDate, final String endDate, final String country,
      final List<Dimension> requestedDimensions, final List<ApiDimensionFilter> apiDimensionFilters) {
    final ExecutorService es = Executors.newCachedThreadPool(
        ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName())));

    int startRow = 0;
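    //Probe in groups of roughly one second's worth of requests so each batch respects the rate limit.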
    long groupSize = Math.max(1, Math.round(API_REQUESTS_PER_SECOND));
    List<Future<Integer>> results = new ArrayList<>((int) groupSize);

    int max = -1;
    while (true) {
      for (int i = 0; i < groupSize; ++i) {
        final int start = startRow;
        startRow += GoogleWebmasterClient.API_ROW_LIMIT;

        Future<Integer> submit = es.submit(() -> {
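          //Each task probes one window of API_ROW_LIMIT rows starting at `start`; a non-empty
          //response proves that at least (start + pages.size()) pages exist.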
          log.info(String.format("Getting page size from %s...", start));
          String interruptedMsg =
              String.format("Interrupted while trying to get the size of all pages for %s. Current start row is %d.",
                  country, start);
          int r = 0;
          while (r <= GET_PAGES_RETRIES) {
            ++r;
            try {
              LIMITER.acquirePermits(1);
            } catch (InterruptedException e) {
              log.error("RateBasedLimiter: " + interruptedMsg, e);
              return -1;
            }

            try {
              List<String> pages =
                  _client.getPages(_siteProperty, startDate, endDate, country, GoogleWebmasterClient.API_ROW_LIMIT,
                      requestedDimensions, apiDimensionFilters, start);
              if (pages.size() == 0) {
                return 0;
              }
              int totalPages = pages.size() + start;
              log.info(String.format("At least %s pages exist. Continuing...", totalPages));
              return totalPages;
            } catch (IOException e) {
              log.info(String.format("Getting page size from %s failed due to %s. Retrying...", start, e.getMessage()));
              coolDown(r, PAGES_COUNT_COOLDOWN_TIME);
            }
          }
          throw new RuntimeException(String.format(
              "Getting all pages reaches the maximum number of retires %d. Date range: %s ~ %s. Country: %s.",
              GET_PAGES_RETRIES, startDate, endDate, country));
        });
        results.add(submit);
      }

      List<Integer> pagesCount = new ArrayList<>();
      for (Future<Integer> result : results) {
        try {
          pagesCount.add(result.get());
        } catch (InterruptedException | ExecutionException e) {
          throw new RuntimeException(e);
        }
      }

      if (pagesCount.stream().allMatch(x -> x == 0)) {
        return max;
      }
      max = Math.max(max, Collections.max(pagesCount));
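      //If max is not an exact multiple of API_ROW_LIMIT, the largest probe came back partially
      //filled, so the end of the data set has been reached and the count is final.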
      if (max % GoogleWebmasterClient.API_ROW_LIMIT != 0) {
        return max;
      }

      results.clear();
    }
  }

  private void coolDown(int r, int secondsInterval) {
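    //Linear back-off: the sleep starts at one interval and grows by one interval for every 5 retries.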
    int milliSeconds = secondsInterval + (r / 5) * secondsInterval;
    milliSeconds *= 1000;
    log.info(String.format("Sleeping for %s seconds", milliSeconds / 1000));
    try {
      Thread.sleep(milliSeconds);
    } catch (InterruptedException e1) {
      throw new RuntimeException(e1);
    }
  }

  /**
   * Get all pages asynchronously.
   */
  private Collection<String> getPages(String startDate, String endDate, List<Dimension> dimensions,
      ApiDimensionFilter countryFilter, Queue<Pair<String, FilterOperator>> toProcess, int rowLimit) {
    String country = GoogleWebmasterFilter.countryFilterToString(countryFilter);

    ConcurrentLinkedDeque<String> allPages = new ConcurrentLinkedDeque<>();
    int r = 0;
    while (r <= GET_PAGES_RETRIES) {
      ++r;
      log.info(String.format("Get pages at round %d with size %d.", r, toProcess.size()));
      ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound = new ConcurrentLinkedDeque<>();
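      //Jobs that fail with an IOException, or that saturate the row limit and need splitting,
      //feed into nextRound and are retried after a cool-down.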
      ExecutorService es = Executors.newFixedThreadPool(10,
          ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName())));

      while (!toProcess.isEmpty()) {
        submitJob(toProcess.poll(), countryFilter, startDate, endDate, dimensions, es, allPages, nextRound, rowLimit);
      }
      //wait for jobs to finish and start next round if necessary.
      try {
        es.shutdown();
        boolean terminated = es.awaitTermination(5, TimeUnit.MINUTES);
        if (!terminated) {
          es.shutdownNow();
          log.warn("Timed out while getting all pages for country-{} at round {}. Next round now has size {}.", country,
              r, nextRound.size());
        }
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }

      if (nextRound.isEmpty()) {
        break;
      }
      toProcess = nextRound;
      coolDown(r, PAGES_GET_COOLDOWN_TIME);
    }
    if (r == GET_PAGES_RETRIES + 1) {
      throw new RuntimeException(
          String.format("Getting all pages reaches the maximum number of retires %d. Date range: %s ~ %s. Country: %s.",
              GET_PAGES_RETRIES, startDate, endDate, country));
    }
    return allPages;
  }

  private void submitJob(final Pair<String, FilterOperator> job, final ApiDimensionFilter countryFilter,
      final String startDate, final String endDate, final List<Dimension> dimensions, ExecutorService es,
      final ConcurrentLinkedDeque<String> allPages,
      final ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound, final int rowLimit) {
    es.submit(new Runnable() {
      @Override
      public void run() {
        try {
          LIMITER.acquirePermits(1);
        } catch (InterruptedException e) {
          throw new RuntimeException("RateBasedLimiter got interrupted.", e);
        }

        String countryString = countryFilterToString(countryFilter);
        List<ApiDimensionFilter> filters = new LinkedList<>();
        filters.add(countryFilter);

        String prefix = job.getLeft();
        FilterOperator operator = job.getRight();
        String jobString = String.format("job(prefix: %s, operator: %s)", prefix, operator);
        filters.add(GoogleWebmasterFilter.pageFilter(operator, prefix));
        List<String> pages;
        try {
          pages = _client.getPages(_siteProperty, startDate, endDate, countryString, rowLimit, dimensions, filters, 0);
          log.debug(
              String.format("%d pages fetched for %s market-%s from %s to %s.", pages.size(), jobString, countryString,
                  startDate, endDate));
        } catch (IOException e) {
          log.debug(String.format("%s failed due to %s. Retrying...", jobString, e.getMessage()));
          nextRound.add(job);
          return;
        }

        //If the number of pages is at the LIMIT, it must be a "CONTAINS" job.
        //We need to create sub-tasks, and check current page with "EQUALS"
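        //e.g. a saturated CONTAINS job for the hypothetical prefix "https://www.example.com/a" becomes an
        //EQUALS check on that exact string plus CONTAINS jobs for ".../aa", ".../ab", ..., ".../a=".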
        if (pages.size() == GoogleWebmasterClient.API_ROW_LIMIT) {
          log.info(String.format("Expanding the prefix '%s'", prefix));
          nextRound.add(Pair.of(prefix, FilterOperator.EQUALS));
          for (String expanded : getUrlPartitions(prefix)) {
            nextRound.add(Pair.of(expanded, FilterOperator.CONTAINS));
          }
        } else {
          //Otherwise, we're done with the current job.
          allPages.addAll(pages);
        }
      }
    });
  }

  /**
   * This doesn't cover all cases, but it captures more than 99.9% of URLs.
   *
   * According to the standard (RFC-3986), here are possible characters:
   * unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
   * reserved      = gen-delims / sub-delims
   * gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
   * sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
   *
   *
   * Not included:
   * gen-delims    = "[" / "]"
   * sub-delims    = "(" / ")" / "," / ";"
   */
  private ArrayList<String> getUrlPartitions(String prefix) {
    ArrayList<String> expanded = new ArrayList<>();
    //The page prefix is case-insensitive, so A-Z is not necessary.
    for (char c = 'a'; c <= 'z'; ++c) {
      expanded.add(prefix + c);
    }
    for (int num = 0; num <= 9; ++num) {
      expanded.add(prefix + num);
    }
    expanded.add(prefix + "-");
    expanded.add(prefix + ".");
    expanded.add(prefix + "_"); //most important
    expanded.add(prefix + "~");

    expanded.add(prefix + "/"); //most important
    expanded.add(prefix + "%"); //most important
    expanded.add(prefix + ":");
    expanded.add(prefix + "?");
    expanded.add(prefix + "#");
    expanded.add(prefix + "@");
    expanded.add(prefix + "!");
    expanded.add(prefix + "$");
    expanded.add(prefix + "&");
    expanded.add(prefix + "+");
    expanded.add(prefix + "*");
    expanded.add(prefix + "'");
    expanded.add(prefix + "=");
    return expanded;
  }

  @Override
  public List<String[]> performSearchAnalyticsQuery(String startDate, String endDate, int rowLimit,
      List<Dimension> requestedDimensions, List<Metric> requestedMetrics, Collection<ApiDimensionFilter> filters)
      throws IOException {
    SearchAnalyticsQueryResponse response =
        _client.createSearchAnalyticsQuery(_siteProperty, startDate, endDate, requestedDimensions,
            GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).execute();
    return convertResponse(requestedMetrics, response);
  }

  @Override
  public void performSearchAnalyticsQueryInBatch(List<ProducerJob> jobs, List<ArrayList<ApiDimensionFilter>> filterList,
      List<JsonBatchCallback<SearchAnalyticsQueryResponse>> callbackList, List<Dimension> requestedDimensions,
      int rowLimit) throws IOException {
    BatchRequest batchRequest = _client.createBatch();

    for (int i = 0; i < jobs.size(); ++i) {
      ProducerJob job = jobs.get(i);
      ArrayList<ApiDimensionFilter> filters = filterList.get(i);
      JsonBatchCallback<SearchAnalyticsQueryResponse> callback = callbackList.get(i);
      _client.createSearchAnalyticsQuery(_siteProperty, job.getStartDate(), job.getEndDate(), requestedDimensions,
          GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).queue(batchRequest, callback);
    }

    batchRequest.execute();
  }

  @Override
  public String getSiteProperty() {
    return _siteProperty;
  }
}
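
For reference, below is a minimal usage sketch that is not part of the original source. It assumes an already-authenticated GoogleWebmasterClient (construction omitted), and since the constructor above is package-private, it must live in the same package; the site URL, date range, and country code are placeholder values.

package org.apache.gobblin.ingestion.google.webmaster;

import java.io.IOException;
import java.util.Collection;
import org.apache.gobblin.configuration.State;

public class GoogleWebmasterDataFetcherExample {
  static Collection<ProducerJob> fetchPageJobs(GoogleWebmasterClient client) throws IOException {
    State wuState = new State(); //empty work-unit state; the tuning keys fall back to the defaults above
    GoogleWebmasterDataFetcher fetcher =
        new GoogleWebmasterDataFetcherImpl("https://www.example.com/", client, wuState);
    //Requesting API_ROW_LIMIT (or more) rows triggers the size probe and the partitioned page fetch.
    return fetcher.getAllPages("2017-01-01", "2017-01-07", "USA", GoogleWebmasterClient.API_ROW_LIMIT);
  }
}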