com.digitalpebble.stormcrawler.sql.SQLSpout
/**
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.sql;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.AbstractQueryingSpout;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.StringTabScheme;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import org.apache.storm.spout.Scheme;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
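// Illustrative DDL for the status table read by this spout (a sketch inferred from
// the columns referenced in populateBuffer(); the actual schema ships with the
// storm-crawler-sql module):
//   CREATE TABLE urls (
//     url           VARCHAR(512) PRIMARY KEY,
//     nextfetchdate TIMESTAMP,
//     metadata      TEXT,
//     bucket        SMALLINT DEFAULT 0,
//     host          VARCHAR(128)
//   );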
public class SQLSpout extends AbstractQueryingSpout {
public static final Logger LOG = LoggerFactory.getLogger(SQLSpout.class);
private static final Scheme SCHEME = new StringTabScheme();
private String tableName;
private Connection connection;
/**
 * If more than one instance of the spout exists, each one is in charge of a separate bucket
 * value. This is used to ensure a good diversity of URLs.
 */
private int bucketNum = -1;
/** Used to distinguish between instances in the logs. */
protected String logIdprefix = "";
private int maxDocsPerBucket;
private int maxNumResults;
private Instant lastNextFetchDate = null;
@Override
public void open(
Map<String, Object> conf, TopologyContext context, SpoutOutputCollector collector) {
super.open(conf, context, collector);
maxDocsPerBucket = ConfUtils.getInt(conf, Constants.SQL_MAX_DOCS_BUCKET_PARAM_NAME, 5);
tableName = ConfUtils.getString(conf, Constants.SQL_STATUS_TABLE_PARAM_NAME, "urls");
maxNumResults = ConfUtils.getInt(conf, Constants.SQL_MAXRESULTS_PARAM_NAME, 100);
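// Example (sketch): the same keys can be overridden through the Storm config, e.g.
//   Config conf = new Config();                                // org.apache.storm.Config
//   conf.put(Constants.SQL_MAX_DOCS_BUCKET_PARAM_NAME, 10);
//   conf.put(Constants.SQL_STATUS_TABLE_PARAM_NAME, "urls");
//   conf.put(Constants.SQL_MAXRESULTS_PARAM_NAME, 500);
// (values are illustrative, not the defaults)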
try {
connection = SQLUtil.getConnection(conf);
} catch (SQLException ex) {
LOG.error(ex.getMessage(), ex);
throw new RuntimeException(ex);
}
// determine bucket this spout instance will be in charge of
int totalTasks = context.getComponentTasks(context.getThisComponentId()).size();
if (totalTasks > 1) {
logIdprefix =
"[" + context.getThisComponentId() + " #" + context.getThisTaskIndex() + "] ";
bucketNum = context.getThisTaskIndex();
}
}
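// Example wiring (sketch, hypothetical component name): running the spout with a
// parallelism of 3 gives each task its own bucket value, assuming the status table
// fills the 'bucket' column with values 0..2:
//   TopologyBuilder builder = new TopologyBuilder();
//   builder.setSpout("sqlspout", new SQLSpout(), 3);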
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
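// the output fields come from StringTabScheme, i.e. the URL plus its Metadata object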
declarer.declare(SCHEME.getOutputFields());
}
@Override
protected void populateBuffer() {
if (lastNextFetchDate == null) {
lastNextFetchDate = Instant.now();
lastTimeResetToNOW = Instant.now();
} else if (resetFetchDateAfterNSecs != -1) {
Instant changeNeededOn =
Instant.ofEpochMilli(
lastTimeResetToNOW.toEpochMilli() + (resetFetchDateAfterNSecs * 1000L));
if (Instant.now().isAfter(changeNeededOn)) {
LOG.info(
"lastDate reset based on resetFetchDateAfterNSecs {}",
resetFetchDateAfterNSecs);
lastNextFetchDate = Instant.now();
}
}
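// e.g. with resetFetchDateAfterNSecs set to 3600 (illustrative value), the upper
// bound on nextfetchdate used in the query below is moved forward to NOW() at most
// once an hour, so URLs which have become due since the last reset are picked up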
// select entries from mysql
// https://mariadb.com/kb/en/library/window-functions-overview/
// http://www.mysqltutorial.org/mysql-window-functions/mysql-rank-function/
String query =
"SELECT * from (select rank() over (partition by host order by nextfetchdate desc, url) as ranking, url, metadata, nextfetchdate from "
+ tableName;
query +=
" WHERE nextfetchdate <= '" + new Timestamp(lastNextFetchDate.toEpochMilli()) + "'";
// constraint on bucket num
if (bucketNum >= 0) {
query += " AND bucket = '" + bucketNum + "'";
}
query +=
") as urls_ranks where (urls_ranks.ranking <= "
+ maxDocsPerBucket
+ ") order by ranking";
if (maxNumResults != -1) {
query += " LIMIT " + this.maxNumResults;
}
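// Example of a query built above (illustrative values: tableName "urls", bucketNum 2,
// maxDocsPerBucket 5, maxNumResults 100):
//   SELECT * from (select rank() over (partition by host order by nextfetchdate desc, url)
//   as ranking, url, metadata, nextfetchdate from urls
//   WHERE nextfetchdate <= '2023-01-01 00:00:00.0' AND bucket = '2') as urls_ranks
//   where (urls_ranks.ranking <= 5) order by ranking LIMIT 100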
int alreadyprocessed = 0;
int numhits = 0;
long timeStartQuery = System.currentTimeMillis();
// create the java statement
Statement st = null;
ResultSet rs = null;
try {
st = this.connection.createStatement();
// dump query to log
LOG.debug("{} SQL query {}", logIdprefix, query);
// execute the query, and get a java resultset
rs = st.executeQuery(query);
long timeTaken = System.currentTimeMillis() - timeStartQuery;
queryTimes.addMeasurement(timeTaken);
// iterate through the java resultset
while (rs.next()) {
String url = rs.getString("url");
numhits++;
// already processed? skip
if (beingProcessed.containsKey(url)) {
alreadyprocessed++;
continue;
}
String metadata = rs.getString("metadata");
if (metadata == null) {
metadata = "";
} else if (!metadata.startsWith("\t")) {
metadata = "\t" + metadata;
}
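// e.g. a stored value of "depth=1\thostname=example.com" (hypothetical keys) becomes
// "\tdepth=1\thostname=example.com" so that url + metadata forms one tab-separated
// line in the format expected by StringTabScheme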
String URLMD = url + metadata;
// the rest of the method is reconstructed from context (the listing is truncated here,
// and the original class may define further overrides not shown): deserialize the
// tab-separated line back into a URL and its Metadata, then queue it for emission
List<Object> v = SCHEME.deserialize(ByteBuffer.wrap(URLMD.getBytes(StandardCharsets.UTF_8)));
buffer.add(url, (Metadata) v.get(1));
}
LOG.debug(
"{} SQL query returned {} hits, {} already being processed",
logIdprefix,
numhits,
alreadyprocessed);
} catch (SQLException e) {
LOG.error("{} Exception while querying table", logIdprefix, e);
} finally {
// release the JDBC resources whether or not the query succeeded
try {
if (rs != null) rs.close();
} catch (SQLException e) {
LOG.error("{} Exception closing resultset", logIdprefix, e);
}
try {
if (st != null) st.close();
} catch (SQLException e) {
LOG.error("{} Exception closing statement", logIdprefix, e);
}
}
}
}