/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.ingestion.google.webmaster;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

import org.apache.gobblin.annotation.Alpha;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.converter.avro.JsonElementConversionFactory;
import org.apache.gobblin.ingestion.google.util.SchemaUtil;
import org.apache.gobblin.source.extractor.Extractor;
import org.apache.gobblin.source.extractor.extract.QueryBasedSource;
import org.apache.gobblin.source.workunit.WorkUnit;


/**
 * The Google Webmaster API lets you download search analytics data for your verified sites from
 * Google Search Console. See https://developers.google.com/webmaster-tools/ for more. Configure a
 * GoogleWebMasterSource to start a daily job that downloads the search analytics data. This Gobblin
 * job partitions the whole task into one sub-task per day. Each sub-task is handled by a
 * GoogleWebmasterExtractor for that date, and each GoogleWebmasterExtractor holds a queue of
 * GoogleWebmasterExtractorIterators, each of which performs the query task for one filter on that
 * date. (Currently, only the country filter is supported.)
 *
 * The minimum unit of the query range is one date. Change the range by configuring
 * "source.querybased.start.value" and "source.querybased.end.value". Note that the analytics data
 * in Google Search Console has a delay of 3 days, so cap your configuration of
 * "source.querybased.append.max.watermark.limit" at "CURRENTDATE-3". See the documentation of each
 * configuration key in the GoogleWebMasterSource fields.
 *
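 * A minimal example job configuration. The concrete source class and all values below are
 * illustrative assumptions, not shipped defaults:
 * <pre>
 * source.class=org.apache.gobblin.ingestion.google.webmaster.GoogleWebMasterSourceDaily
 * source.google_webmasters.property_urls=https://www.example.com/
 * source.google_webmasters.request.dimensions=DATE,COUNTRY
 * source.google_webmasters.request.metrics=CLICKS,IMPRESSIONS
 * source.querybased.start.value=20240101000000
 * source.querybased.end.value=20240107000000
 * </pre>
 *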
 */
@Alpha
abstract class GoogleWebMasterSource extends QueryBasedSource<String, String[]> {
  public static final String SOURCE_GOOGLE_WEBMASTER_PREFIX = "source.google_webmasters.";
  /**
   * Required.
   * The URL(s) of the site properties whose Google search analytics data you want to download.
   */
  public static final String KEY_PROPERTY = SOURCE_GOOGLE_WEBMASTER_PREFIX + "property_urls";
  /**
   * Optional. Defaults to false.
   * Determines whether to add the source property as the last column of your configured schema.
   */
  public static final String KEY_INCLUDE_SOURCE_PROPERTY = SOURCE_GOOGLE_WEBMASTER_PREFIX + "source_property.include";
  /**
   * Optional. Default to "Source".
   * Determine the column name for the additional source property origin column if included
   */
  public static final String KEY_SOURCE_PROPERTY_COLUMN_NAME =
      SOURCE_GOOGLE_WEBMASTER_PREFIX + "source_property.column_name";
  /**
   * The filters that will be passed to all your API requests.
   * The filter format is [GoogleWebmasterFilter.Dimension].[DimensionValue].
   * Currently, the filter operator is "EQUALS" and only the Country dimension is supported. This
   * feature will be extended as more use cases arise.
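   * Example, following the format above (the country value is an illustrative assumption):
   * <pre>source.google_webmasters.request.filters=COUNTRY.USA</pre>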
   */
  public static final String KEY_REQUEST_FILTERS = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.filters";
  /**
   * Required.
   *
   * Allowed dimensions can be found in the enum GoogleWebmasterFilter.Dimension.
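   * Example (the dimension names are illustrative assumptions; see the enum for the exact values):
   * <pre>source.google_webmasters.request.dimensions=DATE,COUNTRY</pre>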
   */
  public static final String KEY_REQUEST_DIMENSIONS = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.dimensions";
  /**
   * Required.
   *
   * Allowed metrics can be found in the enum GoogleWebmasterDataFetcher.Metric.
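   * Example (the metric names are illustrative assumptions; see the enum for the exact values):
   * <pre>source.google_webmasters.request.metrics=CLICKS,IMPRESSIONS</pre>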
   */
  public static final String KEY_REQUEST_METRICS = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.metrics";
  /**
   * Optional. Defaults to 5000, which is the maximum allowed.
   *
   * The response row limit when you ask for pages. Set it to 5000 when you want to get all pages,
   * even when the total number of pages is larger than 5000.
   */
  public static final String KEY_REQUEST_PAGE_LIMIT = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.page_limit";
  /**
   * Optional. Defaults to the empty string.
   * Hot-start this service with a pre-set list of pages. Once this is set, the service ignores
   * KEY_REQUEST_PAGE_LIMIT and, instead of fetching all pages, uses the pre-set pages.
   */
  public static final String KEY_REQUEST_HOT_START = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.hot_start";
  /**
   * Optional. Defaults to 5000, which is the maximum allowed.
   *
   * The response row limit when you ask for queries.
   */
  public static final String KEY_REQUEST_QUERY_LIMIT = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.query_limit";
  public static final String TUNING = SOURCE_GOOGLE_WEBMASTER_PREFIX + "request.tuning.";

  // ===============================================
  // =========   GET QUERIES TUNING BEGIN ==========
  // ===============================================
  public static final String QUERIES_TUNING = TUNING + "get_queries.";
  /**
   * Optional. Defaults to 120 minutes.
   * Set the timeout in minutes for each round while getting queries.
   */
  public static final String KEY_QUERIES_TUNING_TIME_OUT = QUERIES_TUNING + "time_out";
  /**
   * Optional. Defaults to 40.
   * Tune the maximum number of retry rounds while getting queries.
   */
  public static final String KEY_QUERIES_TUNING_RETRIES = QUERIES_TUNING + "max_reties";
  /**
   * Optional. Defaults to 250 milliseconds.
   * Tune the cool-down time between retry rounds.
   */
  public static final String KEY_QUERIES_TUNING_COOL_DOWN = QUERIES_TUNING + "cool_down_time";
  /**
   * Optional. Defaults to 2.25 batches per second.
   * Tune the speed of API requests.
   */
  public static final String KEY_QUERIES_TUNING_BATCHES_PER_SECOND = QUERIES_TUNING + "batches_per_second";
  /**
   * Optional. Defaults to 2.
   * Tune the size of a batch. Batching API calls together reduces the number of HTTP connections.
   * Note: a set of n requests batched together counts toward your usage limit as n requests, not as
   * one request. The batch request is taken apart into its component requests before processing.
   * Read more at https://developers.google.com/webmaster-tools/v3/how-tos/batch
   */
  public static final String KEY_QUERIES_TUNING_BATCH_SIZE = QUERIES_TUNING + "batch_size";
  /**
   * Optional. Defaults to 500.
   * Set the group size for the UrlTriePrefixGrouper.
   */
  public static final String KEY_QUERIES_TUNING_GROUP_SIZE = QUERIES_TUNING + "trie_group_size";

  /**
   * Optional. Defaults to false.
   * Choose whether to apply the trie-based algorithm while getting all queries.
   *
   * If set to true, you must also set page_limit to 5000, indicating that you want all pages,
   * because the trie-based algorithm won't give the expected results if you only need a subset of all pages.
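   *
   * Example of enabling it (the full key names are derived from the constants in this class; the
   * values shown are illustrative):
   * <pre>
   * source.google_webmasters.request.tuning.get_queries.apply_trie=true
   * source.google_webmasters.request.page_limit=5000
   * </pre>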
   */
  public static final String KEY_REQUEST_TUNING_ALGORITHM = QUERIES_TUNING + "apply_trie";
  // =============================================
  // =========   GET QUERIES TUNING END ==========
  // =============================================

  // =============================================
  // =========   GET PAGES TUNING BEGIN ==========
  // =============================================
  public static final String PAGES_TUNING = TUNING + "get_pages.";
  /**
   * Optional. Defaults to 5.0 requests per second.
   * Tune the speed of API requests while getting all pages.
   */
  public static final String KEY_PAGES_TUNING_REQUESTS_PER_SECOND = PAGES_TUNING + "requests_per_second";
  /**
   * Optional. Defaults to 120.
   * Tune the maximum number of retries while getting all pages. Consider the following factors when
   * setting this number:
   * 1. the length of the shared prefix path may be very long
   * 2. Quota Exceeded exceptions
   */
  public static final String KEY_PAGES_TUNING_MAX_RETRIES = PAGES_TUNING + "max_retries";
  /**
   * Optional. Defaults to 30 seconds.
   * Set the cool-down time in seconds while getting the page count.
   */
  public static final String KEY_PAGES_COUNT_TUNING_COOLDOWN_TIME = PAGES_TUNING + "size.cooldown";
  /**
   * Optional. Defaults to 5 seconds.
   * Set the cool-down time in seconds while getting all pages.
   */
  public static final String KEY_PAGES_GET_TUNING_COOLDOWN_TIME = PAGES_TUNING + "get.cooldown";
  // =============================================
  // =========   GET PAGES TUNING END ============
  // =============================================

  private static final Splitter splitter = Splitter.on(",").omitEmptyStrings().trimResults();
  public static final boolean DEFAULT_INCLUDE_SOURCE_PROPERTY = false;
  public static final String DEFAULT_SOURCE_PROPERTY_COLUMN_NAME = "Source";

  @Override
  public Extractor<String, String[]> getExtractor(WorkUnitState state) throws IOException {
    List<GoogleWebmasterFilter.Dimension> requestedDimensions = getRequestedDimensions(state);
    List<GoogleWebmasterDataFetcher.Metric> requestedMetrics = getRequestedMetrics(state);

    WorkUnit workunit = state.getWorkunit();
    String schema = workunit.getProp(ConfigurationKeys.SOURCE_SCHEMA);

    JsonArray schemaJson = new JsonParser().parse(schema).getAsJsonArray();
    Map<String, Integer> columnPositionMap = new HashMap<>();
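    // Record each schema column's position, keyed by the upper-cased column name, so the requested
    // dimension/metric names (upper-cased enum values) can be validated against the schema below.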
    for (int i = 0; i < schemaJson.size(); ++i) {
      JsonElement jsonElement = schemaJson.get(i);
      String columnName = jsonElement.getAsJsonObject().get("columnName").getAsString().toUpperCase();
      columnPositionMap.put(columnName, i);
    }

    // Optionally append an extra string column that records which site property each row came from.
    if (workunit.getPropAsBoolean(GoogleWebMasterSource.KEY_INCLUDE_SOURCE_PROPERTY, DEFAULT_INCLUDE_SOURCE_PROPERTY)) {
      String columnName = workunit.getProp(KEY_SOURCE_PROPERTY_COLUMN_NAME, DEFAULT_SOURCE_PROPERTY_COLUMN_NAME);
      schemaJson.add(SchemaUtil.createColumnJson(columnName, false, JsonElementConversionFactory.Type.STRING));
    }

    validateFilters(state.getProp(GoogleWebMasterSource.KEY_REQUEST_FILTERS));
    validateRequests(columnPositionMap, requestedDimensions, requestedMetrics);
    return createExtractor(state, columnPositionMap, requestedDimensions, requestedMetrics, schemaJson);
  }

  abstract GoogleWebmasterExtractor createExtractor(WorkUnitState state, Map<String, Integer> columnPositionMap,
      List<GoogleWebmasterFilter.Dimension> requestedDimensions,
      List<GoogleWebmasterDataFetcher.Metric> requestedMetrics, JsonArray schemaJson) throws IOException;

  private void validateFilters(String filters) {
    String countryPrefix = "COUNTRY.";

    for (String filter : splitter.split(filters)) {
      if (filter.toUpperCase().startsWith(countryPrefix)) {
        GoogleWebmasterFilter.validateCountryCode(filter.substring(countryPrefix.length()));
      }
    }
  }

  private void validateRequests(Map<String, Integer> columnPositionMap,
      List<GoogleWebmasterFilter.Dimension> requestedDimensions,
      List<GoogleWebmasterDataFetcher.Metric> requestedMetrics) {
    for (GoogleWebmasterFilter.Dimension dimension : requestedDimensions) {
      Preconditions.checkState(columnPositionMap.containsKey(dimension.toString()),
          "Your requested dimension must exist in the source.schema.");
    }
    for (GoogleWebmasterDataFetcher.Metric metric : requestedMetrics) {
      Preconditions.checkState(columnPositionMap.containsKey(metric.toString()),
          "Your requested metric must exist in the source.schema.");
    }
  }

  private List<GoogleWebmasterFilter.Dimension> getRequestedDimensions(WorkUnitState wuState) {
    List<GoogleWebmasterFilter.Dimension> dimensions = new ArrayList<>();
    String dimensionsString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_DIMENSIONS);
    for (String dim : splitter.split(dimensionsString)) {
      dimensions.add(GoogleWebmasterFilter.Dimension.valueOf(dim.toUpperCase()));
    }
    return dimensions;
  }

  private List<GoogleWebmasterDataFetcher.Metric> getRequestedMetrics(WorkUnitState wuState) {
    List<GoogleWebmasterDataFetcher.Metric> metrics = new ArrayList<>();
    String metricsString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_METRICS);
    for (String metric : splitter.split(metricsString)) {
      metrics.add(GoogleWebmasterDataFetcher.Metric.valueOf(metric.toUpperCase()));
    }
    return metrics;
  }
}



