/*
* Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package gobblin.example.wikipedia;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.io.Closer;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.source.workunit.WorkUnit;

/**
 * An implementation of {@link Extractor} for the Wikipedia example.
 *
 * <p>
 * This extractor uses the MediaWiki web API to retrieve a certain number of the latest
 * revisions for each specified title from Wikipedia. Each revision is returned as a JSON
 * document.
 * </p>
 *
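 * <p>
 * An illustrative job configuration for this extractor (the property names are taken from the
 * constants in this class; the values are hypothetical examples):
 * </p>
 * <pre>
 * wikipedia.api.rooturl=https://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions
 * source.page.titles=LinkedIn,Wikipedia,Apache_Hadoop
 * source.revisions.cnt=5
 * </pre>
 *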
* @author ziliu
*/
public class WikipediaExtractor implements Extractor<String, JsonElement> {
private static final Logger LOG = LoggerFactory.getLogger(WikipediaExtractor.class);
private static final String SOURCE_PAGE_TITLES = "source.page.titles";
private static final String SOURCE_REVISIONS_CNT = "source.revisions.cnt";
private static final String WIKIPEDIA_API_ROOTURL = "wikipedia.api.rooturl";
private static final String WIKIPEDIA_AVRO_SCHEMA = "wikipedia.avro.schema";
private static final String JSON_MEMBER_QUERY = "query";
private static final String JSON_MEMBER_PAGES = "pages";
private static final String JSON_MEMBER_REVISIONS = "revisions";
private static final String JSON_MEMBER_PAGEID = "pageid";
private static final String JSON_MEMBER_TITLE = "title";
private static final Splitter SPLITTER = Splitter.on(",").omitEmptyStrings().trimResults();
private static final Gson GSON = new Gson();
private final WorkUnit workUnit;
private final WikiResponseReader reader;
private final int revisionsCnt;
private final String rootUrl;
private final String schema;
private final Queue<String> requestedTitles;
private final int numRequestedTitles;
private Queue<JsonElement> recordsOfCurrentTitle;
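/**
 * An {@link Iterator} over the retrieved revisions. It first drains the revisions of the
 * current title, then fetches revisions for the remaining requested titles one title at a
 * time, skipping titles that yield no revisions.
 */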
private class WikiResponseReader implements Iterator<JsonElement> {
@Override
public boolean hasNext() {
if (!WikipediaExtractor.this.recordsOfCurrentTitle.isEmpty()) {
return true;
} else if (WikipediaExtractor.this.requestedTitles.isEmpty()) {
return false;
} else {
/*
* Retrieve revisions for the next title. Repeat until we find a title that has at least one revision,
* otherwise return false
*/
while (!WikipediaExtractor.this.requestedTitles.isEmpty()) {
String currentTitle = WikipediaExtractor.this.requestedTitles.poll();
try {
WikipediaExtractor.this.recordsOfCurrentTitle = retrievePageRevisions(currentTitle);
} catch (IOException e) {
LOG.error("IOException while retrieving revisions for title '" + currentTitle + "'");
}
if (!WikipediaExtractor.this.recordsOfCurrentTitle.isEmpty()) {
return true;
}
}
return false;
}
}
@Override
public JsonElement next() {
if (!hasNext()) {
return null;
}
return WikipediaExtractor.this.recordsOfCurrentTitle.poll();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
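/**
 * Reads the page titles, revision count, API root URL, and Avro schema from the
 * {@link WorkUnit}, then eagerly fetches the revisions of the first title so the
 * reader has records to serve from the start.
 */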
public WikipediaExtractor(WorkUnitState workUnitState) throws IOException {
this.workUnit = workUnitState.getWorkunit();
this.rootUrl = this.workUnit.getProp(WIKIPEDIA_API_ROOTURL);
this.schema = this.workUnit.getProp(WIKIPEDIA_AVRO_SCHEMA);
this.requestedTitles = new LinkedList<>(SPLITTER.splitToList(this.workUnit.getProp(SOURCE_PAGE_TITLES)));
this.revisionsCnt = Integer.parseInt(this.workUnit.getProp(SOURCE_REVISIONS_CNT));
this.numRequestedTitles = this.requestedTitles.size();
if (this.requestedTitles.isEmpty()) {
this.recordsOfCurrentTitle = new LinkedList<>();
} else {
String firstTitle = this.requestedTitles.poll();
this.recordsOfCurrentTitle = retrievePageRevisions(firstTitle);
}
this.reader = new WikiResponseReader();
}
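/**
 * Queries the MediaWiki API for up to {@code revisionsCnt} of the latest revisions of the
 * given page title and returns them as a queue of JSON objects, each annotated with the
 * page's 'pageid' and 'title'. Returns an empty queue on an empty or malformed response.
 */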
private Queue<JsonElement> retrievePageRevisions(String pageTitle) throws IOException {
Queue<JsonElement> retrievedRevisions = new LinkedList<>();
Closer closer = Closer.create();
HttpURLConnection conn = null;
StringBuilder sb = new StringBuilder();
String urlStr = this.rootUrl + "&titles=" + URLEncoder.encode(pageTitle, "UTF-8") + "&rvlimit=" + this.revisionsCnt;
try {
conn = getHttpConnection(urlStr);
conn.connect();
BufferedReader br = closer.register(
new BufferedReader(new InputStreamReader(conn.getInputStream(), ConfigurationKeys.DEFAULT_CHARSET_ENCODING)));
String line;
while ((line = br.readLine()) != null) {
sb.append(line).append("\n");
}
} catch (Throwable t) {
throw closer.rethrow(t);
} finally {
try {
closer.close();
} catch (IOException e) {
LOG.error("IOException in Closer.close() while retrieving revisions for title '" + pageTitle + "' from URL '"
+ urlStr + "'");
}
if (conn != null) {
conn.disconnect();
}
}
if (Strings.isNullOrEmpty(sb.toString())) {
LOG.warn("Received empty response for query: " + urlStr);
return retrievedRevisions;
}
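/*
 * A successful MediaWiki response is expected to have the shape (abbreviated):
 *
 *   {"query": {"pages": {"<pageId>": {"pageid": ..., "title": ..., "revisions": [...]}}}}
 *
 * Each level is checked defensively below before descending into it.
 */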
JsonElement jsonElement = GSON.fromJson(sb.toString(), JsonElement.class);
if (jsonElement == null || !jsonElement.isJsonObject()) {
return retrievedRevisions;
}
JsonObject jsonObj = jsonElement.getAsJsonObject();
if (jsonObj == null || !jsonObj.has(JSON_MEMBER_QUERY)) {
return retrievedRevisions;
}
JsonObject queryObj = jsonObj.getAsJsonObject(JSON_MEMBER_QUERY);
if (!queryObj.has(JSON_MEMBER_PAGES)) {
return retrievedRevisions;
}
JsonObject pagesObj = queryObj.getAsJsonObject(JSON_MEMBER_PAGES);
if (pagesObj.entrySet().isEmpty()) {
return retrievedRevisions;
}
JsonObject pageIdObj = pagesObj.getAsJsonObject(pagesObj.entrySet().iterator().next().getKey());
if (!pageIdObj.has(JSON_MEMBER_REVISIONS)) {
return retrievedRevisions;
}
//retrieve revisions of the current pageTitle
JsonArray jsonArr = pageIdObj.getAsJsonArray(JSON_MEMBER_REVISIONS);
for (JsonElement revElement : jsonArr) {
JsonObject revObj = revElement.getAsJsonObject();
/*'pageid' and 'title' are associated with the parent object
* of all revisions. Add them to each individual revision.
*/
if (pageIdObj.has(JSON_MEMBER_PAGEID)) {
revObj.add(JSON_MEMBER_PAGEID, pageIdObj.get(JSON_MEMBER_PAGEID));
}
if (pageIdObj.has(JSON_MEMBER_TITLE)) {
revObj.add(JSON_MEMBER_TITLE, pageIdObj.get(JSON_MEMBER_TITLE));
}
retrievedRevisions.add(revObj);
}
LOG.info(retrievedRevisions.size() + " record(s) retrieved for title " + pageTitle);
return retrievedRevisions;
}
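/**
 * Opens an {@link HttpURLConnection} to the given URL, routing through an HTTP proxy
 * if both {@link ConfigurationKeys#SOURCE_CONN_USE_PROXY_URL} and
 * {@link ConfigurationKeys#SOURCE_CONN_USE_PROXY_PORT} are set on the work unit.
 */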
private HttpURLConnection getHttpConnection(String urlStr) throws IOException {
URL url = new URL(urlStr);
Proxy proxy = Proxy.NO_PROXY;
if (this.workUnit.contains(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL)
&& this.workUnit.contains(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT)) {
LOG.info("Use proxy host: " + this.workUnit.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL));
LOG.info("Use proxy port: " + this.workUnit.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT));
InetSocketAddress proxyAddress =
new InetSocketAddress(this.workUnit.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL),
Integer.parseInt(this.workUnit.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT)));
proxy = new Proxy(Proxy.Type.HTTP, proxyAddress);
}
return (HttpURLConnection) url.openConnection(proxy);
}
@Override
public void close() throws IOException {
// There's nothing to close
}
@Override
public String getSchema() {
return this.schema;
}
@Override
public JsonElement readRecord(@Deprecated JsonElement reuse) throws DataRecordException, IOException {
if (this.reader == null) {
return null;
}
if (this.reader.hasNext()) {
return this.reader.next();
}
return null;
}
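/**
 * An upper bound rather than an exact count: each requested title contributes at most
 * {@code revisionsCnt} revisions.
 */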
@Override
public long getExpectedRecordCount() {
return (long) this.numRequestedTitles * this.revisionsCnt;
}
@Override
public long getHighWatermark() {
return 0;
}
}