
// gobblin.ingestion.google.webmaster.GoogleWebmasterExtractor Maven / Gradle / Ivy
package gobblin.ingestion.google.webmaster;
import avro.shaded.com.google.common.collect.Iterables;
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.repackaged.com.google.common.base.Preconditions;
import com.google.api.services.webmasters.WebmastersScopes;
import com.google.api.services.webmasters.model.ApiDimensionFilter;
import com.google.common.base.Splitter;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.extract.LongWatermark;
import gobblin.source.extractor.extract.google.GoogleCommon;
import gobblin.source.extractor.extract.google.GoogleCommonKeys;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static gobblin.configuration.ConfigurationKeys.*;
import static gobblin.source.extractor.extract.google.GoogleCommonKeys.*;
public class GoogleWebmasterExtractor implements Extractor {
// Class-wide SLF4J logger.
private final static Logger LOG = LoggerFactory.getLogger(GoogleWebmasterExtractor.class);
// Splits comma-separated strings, trimming each piece and dropping empty ones.
private final static Splitter splitter = Splitter.on(",").omitEmptyStrings().trimResults();
// Value of the work unit's SOURCE_SCHEMA property, read once in the constructor.
private final String _schema;
// The work unit state this extractor was created with.
private final WorkUnitState _wuState;
// Number of entries in the column position map passed to the constructor.
private final int _size;
// Day-granularity date format ("yyyy-MM-dd"); public, so other classes may rely on it.
public final static DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
// Format of the numeric low/high watermarks ("yyyyMMddHHmmss"); used to parse them into DateTime values.
private final static DateTimeFormatter watermarkFormatter = DateTimeFormat.forPattern("yyyyMMddHHmmss");
// NOTE(review): presumably the pending record iterators, consumed in FIFO order; the element type is not
// visible in this view - confirm against the code that fills this queue.
private Queue _iterators = new ArrayDeque<>();
/**
 * Each element keeps a mapping from API response order to output schema order.
 * The array index matches the order of the API response.
 * The array values match the order of the output schema.
 */
private Queue _positionMaps = new ArrayDeque<>();
// Lower bound of the extraction range, parsed from the low watermark.
private final DateTime _startDate;
// Upper bound of the extraction range, parsed from the high watermark.
private final DateTime _endDate;
// Starts out false. NOTE(review): the code that flips it is outside this view - presumably set once
// extraction completes; confirm.
private boolean _successful = false;
/**
 * Creates an extractor that talks to the real Google Webmaster API.
 *
 * <p>Builds a {@link GoogleWebmasterDataFetcherImpl} from the work unit state (the
 * {@code GoogleWebMasterSource.KEY_PROPERTY} value, a credential, the {@code SOURCE_ENTITY}
 * value and any hot-start jobs) and hands everything to the package-private constructor.
 *
 * @param wuState            work unit state supplying the configuration properties
 * @param lowWatermark       low watermark, forwarded unchanged
 * @param highWatermark      high watermark, forwarded unchanged
 * @param columnPositionMap  column position map, forwarded unchanged
 * @param requestedDimensions requested dimensions, forwarded unchanged
 * @param requestedMetrics   requested metrics, forwarded unchanged
 * @throws IOException declared by this constructor; see the data fetcher implementation for the cause
 */
public GoogleWebmasterExtractor(
    WorkUnitState wuState,
    long lowWatermark,
    long highWatermark,
    Map columnPositionMap,
    List requestedDimensions,
    List requestedMetrics) throws IOException {
    this(
        wuState,
        lowWatermark,
        highWatermark,
        columnPositionMap,
        requestedDimensions,
        requestedMetrics,
        new GoogleWebmasterDataFetcherImpl(
            wuState.getProp(GoogleWebMasterSource.KEY_PROPERTY),
            getCredential(wuState),
            wuState.getProp(ConfigurationKeys.SOURCE_ENTITY),
            getHotStartJobs(wuState)));
}
/**
 * Reads the hot-start jobs configured under {@code GoogleWebMasterSource.KEY_REQUEST_HOT_START}.
 *
 * @param wuState work unit state to read the property from
 * @return the jobs deserialized from the property value, or a new empty list when the
 *         property is missing or empty
 */
private static List getHotStartJobs(WorkUnitState wuState) {
    String serializedJobs = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_HOT_START, "");
    if (serializedJobs.isEmpty()) {
        // Nothing configured: start from an empty, mutable job list.
        return new ArrayList<>();
    }
    return SimpleProducerJob.deserialize(serializedJobs);
}
/**
 * Builds the credential used to call the Webmaster API from the work unit configuration.
 *
 * <p>The API scope comes from {@code GoogleCommonKeys.API_SCOPES} and defaults to
 * {@code WebmastersScopes.WEBMASTERS_READONLY}; only that scope and
 * {@code WebmastersScopes.WEBMASTERS} are accepted.
 *
 * @param wuState work unit state supplying the scope, private key file, key file system URI,
 *                proxy URL/port and service account id
 * @return the credential produced by {@code GoogleCommon.CredentialBuilder}
 * @throws IllegalArgumentException if the configured scope is not one of the two accepted scopes
 */
private static Credential getCredential(WorkUnitState wuState) {
    String scope = wuState.getProp(GoogleCommonKeys.API_SCOPES, WebmastersScopes.WEBMASTERS_READONLY);

    boolean scopeIsSupported =
        Objects.equals(WebmastersScopes.WEBMASTERS_READONLY, scope)
            || Objects.equals(WebmastersScopes.WEBMASTERS, scope);
    Preconditions.checkArgument(scopeIsSupported,
        "The scope for WebMaster must either be WEBMASTERS_READONLY or WEBMASTERS");

    String credentialFile = wuState.getProp(SOURCE_CONN_PRIVATE_KEY);
    List scopes = Collections.singletonList(scope);

    return new GoogleCommon.CredentialBuilder(credentialFile, scopes)
        .fileSystemUri(wuState.getProp(GoogleCommonKeys.PRIVATE_KEY_FILESYSTEM_URI))
        .proxyUrl(wuState.getProp(SOURCE_CONN_USE_PROXY_URL))
        .port(wuState.getProp(SOURCE_CONN_USE_PROXY_PORT))
        .serviceAccountId(wuState.getProp(SOURCE_CONN_USERNAME))
        .build();
}
/**
* For test only
*/
GoogleWebmasterExtractor(WorkUnitState wuState, long lowWatermark, long highWatermark,
Map columnPositionMap, List requestedDimensions,
List requestedMetrics, GoogleWebmasterDataFetcher dataFetcher) {
_startDate = watermarkFormatter.parseDateTime(Long.toString(lowWatermark));
_endDate = watermarkFormatter.parseDateTime(Long.toString(highWatermark));
_schema = wuState.getWorkunit().getProp(ConfigurationKeys.SOURCE_SCHEMA);
_size = columnPositionMap.size();
_wuState = wuState;
Iterable
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy