
gobblin.ingestion.google.webmaster.GoogleWebMasterSource Maven / Gradle / Ivy
package gobblin.ingestion.google.webmaster;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.extract.QueryBasedSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* Google Webmaster API enables you to download data from Google Search Console for search analytics of the verified sites. See more here https://developers.google.com/webmaster-tools/. Configure the Google Webmaster Source for starting a daily job to download search analytics data. This gobblin job partitions the whole task into sub-tasks for each day. Each sub-task is handled by a GoogleWebmasterExtractor for that date, and each GoogleWebmasterExtractor holds a queue of GoogleWebmasterExtractorIterators, each of which does the query task for one filter on that date. (Currently, only the country filter is supported.)
*
* The minimum unit of querying range is date. Change the range by configuring "source.querybased.start.value" and "source.querybased.end.value". Note that the analytics data for Google Search Console has a delay of 3 days. So cap your configuration of "source.querybased.append.max.watermark.limit" by "CURRENTDATE-3". See the documentation details of each configuration in the GoogleWebMasterSource fields.
*
*/
abstract class GoogleWebMasterSource extends QueryBasedSource {
  /**
   * Must Provide.
   * Provide the property site URL whose Google search analytics data you want to download.
   */
  public static final String KEY_PROPERTY = "source.google_webmasters.property_url";
  /**
   * The filters that will be passed to all your API requests.
   * Filter format is [GoogleWebmasterFilter.Dimension].[DimensionValue], separated by commas.
   * Currently, the filter operator is "EQUALS" and only the Country dimension is supported.
   * This feature will be extended according to more use cases in the future.
   */
  public static final String KEY_REQUEST_FILTERS = "source.google_webmasters.request.filters";
  /**
   * Must Provide.
   *
   * Comma separated list. Allowed dimensions can be found in the enum GoogleWebmasterFilter.Dimension
   */
  public static final String KEY_REQUEST_DIMENSIONS = "source.google_webmasters.request.dimensions";
  /**
   * Must Provide.
   *
   * Comma separated list. Allowed metrics can be found in the enum GoogleWebmasterDataFetcher.Metric
   */
  public static final String KEY_REQUEST_METRICS = "source.google_webmasters.request.metrics";
  /**
   * Optional: Default to 5000, which is the maximum allowed.
   *
   * The response row limit when you ask for pages. Set it to 5000 when you want to get all pages,
   * which might be larger than 5000.
   */
  public static final String KEY_REQUEST_PAGE_LIMIT = "source.google_webmasters.request.page_limit";
  /**
   * Optional: Default to the empty string.
   * Hot start this service with pre-set pages. Once this is set, the service will ignore
   * KEY_REQUEST_PAGE_LIMIT, and won't get all pages, but use the pre-set pages instead.
   */
  public static final String KEY_REQUEST_HOT_START = "source.google_webmasters.request.hot_start";
  /**
   * Optional: Default to 5000, which is the maximum allowed.
   *
   * The response row limit when you ask for queries.
   */
  public static final String KEY_REQUEST_QUERY_LIMIT = "source.google_webmasters.request.query_limit";
  /**
   * Set the time out in minutes for each round.
   */
  public static final String KEY_REQUEST_TIME_OUT = "source.google_webmasters.request.time_out";
  /**
   * Tune the maximum rounds of retries allowed when API calls failed because of exceeding quota.
   */
  public static final String KEY_REQUEST_TUNING_RETRIES =
      "source.google_webmasters.request.performance_tuning.max_retry_rounds";
  /**
   * Tune the initial cool down time before starting another round of retry.
   */
  public static final String KEY_REQUEST_TUNING_INITIAL_COOL_DOWN =
      "source.google_webmasters.request.performance_tuning.initial_cool_down";
  /**
   * Tune the extra cool down sleep time for each round before starting another round of retry.
   * The total cool down time will be calculated as "initial_cool_down + cool_down_step * round"
   */
  public static final String KEY_REQUEST_TUNING_COOL_DOWN_STEP =
      "source.google_webmasters.request.performance_tuning.cool_down_step";
  /**
   * Tune the speed of API requests.
   */
  public static final String KEY_REQUEST_TUNING_REQUESTS_PER_SECOND =
      "source.google_webmasters.request.performance_tuning.requests_per_second";
  /**
   * Tune the size of a batch. Batch API calls together to reduce the number of HTTP connections.
   * Note: A set of n requests batched together counts toward your usage limit as n requests, not as
   * one request. The batch request is taken apart into a set of requests before processing.
   * Read more at https://developers.google.com/webmaster-tools/v3/how-tos/batch
   */
  public static final String KEY_REQUEST_TUNING_BATCH_SIZE =
      "source.google_webmasters.request.performance_tuning.batch_size";
  /**
   * Set the group size for UrlTriePrefixGrouper
   */
  public static final String KEY_REQUEST_TUNING_GROUP_SIZE =
      "source.google_webmasters.request.performance_tuning.group_size";
  /**
   * True: Trie based
   * False: Queue based
   */
  public static final String KEY_REQUEST_TUNING_ALGORITHM =
      "source.google_webmasters.request.performance_tuning.advanced";

  /** Splits comma separated configuration values, trimming whitespace and dropping empty entries. */
  private static final Splitter splitter = Splitter.on(",").omitEmptyStrings().trimResults();

  /**
   * Builds the extractor for the given work unit.
   *
   * <p>Reads the requested dimensions and metrics, parses the schema configured under
   * {@link ConfigurationKeys#SOURCE_SCHEMA} (a JSON array of objects carrying a "columnName" field)
   * into a map from upper-cased column name to its index in that array, validates the configured
   * filters and requests, and finally delegates to
   * {@link #createExtractor(WorkUnitState, Map, List, List)}.
   *
   * @param state the work unit state carrying the job configuration
   * @return the extractor produced by {@code createExtractor}
   * @throws IOException if {@code createExtractor} fails with an I/O error
   * @throws NullPointerException if the schema, dimensions or metrics properties are missing
   * @throws IllegalStateException if a requested dimension or metric is not a schema column
   */
  @Override
  public Extractor getExtractor(WorkUnitState state) throws IOException {
    List<GoogleWebmasterFilter.Dimension> requestedDimensions = getRequestedDimensions(state);
    List<GoogleWebmasterDataFetcher.Metric> requestedMetrics = getRequestedMetrics(state);

    String schema = state.getWorkunit().getProp(ConfigurationKeys.SOURCE_SCHEMA);
    // Fail with a message naming the key instead of an anonymous NPE from inside the JSON parser.
    Preconditions.checkNotNull(schema, "%s must be provided.", ConfigurationKeys.SOURCE_SCHEMA);
    JsonArray schemaJson = new JsonParser().parse(schema).getAsJsonArray();

    Map<String, Integer> columnPositionMap = new HashMap<>();
    for (int i = 0; i < schemaJson.size(); ++i) {
      JsonElement jsonElement = schemaJson.get(i);
      // Locale.ROOT keeps the upper-casing stable regardless of the JVM default locale
      // (e.g. under a Turkish locale "i" would otherwise become a dotted capital I).
      String columnName =
          jsonElement.getAsJsonObject().get("columnName").getAsString().toUpperCase(Locale.ROOT);
      columnPositionMap.put(columnName, i);
    }

    validateFilters(state.getProp(GoogleWebMasterSource.KEY_REQUEST_FILTERS));
    validateRequests(columnPositionMap, requestedDimensions, requestedMetrics);
    return createExtractor(state, columnPositionMap, requestedDimensions, requestedMetrics);
  }

  /**
   * Creates the concrete extractor once the configuration has been validated.
   *
   * @param state the work unit state carrying the job configuration
   * @param columnPositionMap upper-cased schema column name mapped to its index in the schema array
   * @param requestedDimensions the dimensions parsed from {@link #KEY_REQUEST_DIMENSIONS}
   * @param requestedMetrics the metrics parsed from {@link #KEY_REQUEST_METRICS}
   * @return the extractor for this work unit
   * @throws IOException if the extractor cannot be created
   */
  abstract GoogleWebmasterExtractor createExtractor(WorkUnitState state,
      Map<String, Integer> columnPositionMap,
      List<GoogleWebmasterFilter.Dimension> requestedDimensions,
      List<GoogleWebmasterDataFetcher.Metric> requestedMetrics) throws IOException;

  /**
   * Validates the comma separated filter configuration.
   *
   * <p>Every entry that starts with "COUNTRY." (case-insensitively) has the remainder handed to
   * {@code GoogleWebmasterFilter.validateCountryCode}. Other entries are not inspected here.
   *
   * @param filters the raw value of {@link #KEY_REQUEST_FILTERS}; may be {@code null} when the
   *     property is not configured, in which case there is nothing to validate
   */
  private void validateFilters(String filters) {
    if (filters == null) {
      // The filters property is not marked as required; Splitter.split(null) would throw an NPE.
      return;
    }
    String countryPrefix = "COUNTRY.";
    for (String filter : splitter.split(filters)) {
      if (filter.toUpperCase(Locale.ROOT).startsWith(countryPrefix)) {
        GoogleWebmasterFilter.validateCountryCode(filter.substring(countryPrefix.length()));
      }
    }
  }

  /**
   * Checks that every requested dimension and metric has a matching column in the schema.
   *
   * @param columnPositionMap upper-cased schema column name mapped to its position
   * @param requestedDimensions dimensions whose {@code toString()} must be a key of the map
   * @param requestedMetrics metrics whose {@code toString()} must be a key of the map
   * @throws IllegalStateException if any dimension or metric is missing from the schema
   */
  private void validateRequests(Map<String, Integer> columnPositionMap,
      List<GoogleWebmasterFilter.Dimension> requestedDimensions,
      List<GoogleWebmasterDataFetcher.Metric> requestedMetrics) {
    for (GoogleWebmasterFilter.Dimension dimension : requestedDimensions) {
      Preconditions.checkState(columnPositionMap.containsKey(dimension.toString()),
          "Your requested dimension must exist in the source.schema.");
    }
    for (GoogleWebmasterDataFetcher.Metric metric : requestedMetrics) {
      Preconditions.checkState(columnPositionMap.containsKey(metric.toString()),
          "Your requested metric must exist in the source.schema.");
    }
  }

  /**
   * Parses {@link #KEY_REQUEST_DIMENSIONS} into dimension constants, in configuration order.
   *
   * @param wuState the work unit state carrying the job configuration
   * @return the requested dimensions
   * @throws NullPointerException if the property is not configured
   * @throws IllegalArgumentException if an entry does not name a GoogleWebmasterFilter.Dimension
   */
  private List<GoogleWebmasterFilter.Dimension> getRequestedDimensions(WorkUnitState wuState) {
    String dimensionsString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_DIMENSIONS);
    Preconditions.checkNotNull(dimensionsString, "%s must be provided.", KEY_REQUEST_DIMENSIONS);

    List<GoogleWebmasterFilter.Dimension> dimensions = new ArrayList<>();
    for (String dim : splitter.split(dimensionsString)) {
      dimensions.add(GoogleWebmasterFilter.Dimension.valueOf(dim.toUpperCase(Locale.ROOT)));
    }
    return dimensions;
  }

  /**
   * Parses {@link #KEY_REQUEST_METRICS} into metric constants, in configuration order.
   *
   * @param wuState the work unit state carrying the job configuration
   * @return the requested metrics
   * @throws NullPointerException if the property is not configured
   * @throws IllegalArgumentException if an entry does not name a GoogleWebmasterDataFetcher.Metric
   */
  private List<GoogleWebmasterDataFetcher.Metric> getRequestedMetrics(WorkUnitState wuState) {
    String metricsString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_METRICS);
    Preconditions.checkNotNull(metricsString, "%s must be provided.", KEY_REQUEST_METRICS);

    List<GoogleWebmasterDataFetcher.Metric> metrics = new ArrayList<>();
    for (String metric : splitter.split(metricsString)) {
      metrics.add(GoogleWebmasterDataFetcher.Metric.valueOf(metric.toUpperCase(Locale.ROOT)));
    }
    return metrics;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy