com.digitalpebble.stormcrawler.sql.SQLSpout
SQL-based resources for StormCrawler
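SQLSpout polls a SQL status table for URLs whose nextfetchdate is due and feeds them into a StormCrawler topology. As a rough usage sketch (the component name, parallelism, topology name and local cluster below are illustrative assumptions, not part of this artifact), it is wired in like any other Storm spout:

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;

import com.digitalpebble.stormcrawler.sql.Constants;
import com.digitalpebble.stormcrawler.sql.SQLSpout;

public class SQLSpoutExample {
    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // with a parallelism of 2, each spout task polls its own bucket
        builder.setSpout("spout", new SQLSpout(), 2);
        // fetch/parse/status-update bolts would be wired in here

        Config conf = new Config();
        conf.put(Constants.SQL_STATUS_TABLE_PARAM_NAME, "urls");
        conf.put(Constants.SQL_MAX_DOCS_BUCKET_PARAM_NAME, 5);
        conf.put(Constants.SQL_MAXRESULTS_PARAM_NAME, 100);
        // the JDBC settings read by SQLUtil.getConnection(conf) must be
        // added to the conf as well (keys omitted here)

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("sql-crawl", conf, builder.createTopology());
    }
}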
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.sql;
import java.nio.ByteBuffer;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import org.apache.storm.spout.Scheme;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.persistence.AbstractQueryingSpout;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.StringTabScheme;
@SuppressWarnings("serial")
public class SQLSpout extends AbstractQueryingSpout {
public static final Logger LOG = LoggerFactory.getLogger(SQLSpout.class);
private static final Scheme SCHEME = new StringTabScheme();
private String tableName;
private Connection connection;
/**
 * If more than one instance of the spout exists, each one is in charge of a
 * separate bucket value. This is used to ensure a good diversity of URLs.
 **/
private int bucketNum = -1;
/** Used to distinguish between instances in the logs **/
protected String logIdprefix = "";
private int maxDocsPerBucket;
private int maxNumResults;
private Instant lastNextFetchDate = null;
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void open(Map conf, TopologyContext context,
SpoutOutputCollector collector) {
super.open(conf, context, collector);
maxDocsPerBucket = ConfUtils.getInt(conf,
Constants.SQL_MAX_DOCS_BUCKET_PARAM_NAME, 5);
tableName = ConfUtils.getString(conf,
Constants.SQL_STATUS_TABLE_PARAM_NAME, "urls");
maxNumResults = ConfUtils.getInt(conf,
Constants.SQL_MAXRESULTS_PARAM_NAME, 100);
try {
connection = SQLUtil.getConnection(conf);
} catch (SQLException ex) {
LOG.error(ex.getMessage(), ex);
throw new RuntimeException(ex);
}
// determine bucket this spout instance will be in charge of
int totalTasks = context
.getComponentTasks(context.getThisComponentId()).size();
if (totalTasks > 1) {
logIdprefix = "[" + context.getThisComponentId() + " #"
+ context.getThisTaskIndex() + "] ";
bucketNum = context.getThisTaskIndex();
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(SCHEME.getOutputFields());
}
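// SCHEME is a StringTabScheme: it splits each buffered "URL <tab> metadata"
// line back into (url, metadata) fields, which are the output fields
// declared here for the downstream bolts.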
@Override
protected void populateBuffer() {
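// the date used in the query is fixed on the first call and only moved
// forward once resetFetchDateAfterNSecs (inherited from
// AbstractQueryingSpout) has elapsed, so that repeated queries do not
// chase a constantly moving nextfetchdate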
if (lastNextFetchDate == null) {
lastNextFetchDate = Instant.now();
} else if (resetFetchDateAfterNSecs != -1) {
Instant changeNeededOn = Instant.ofEpochMilli(lastNextFetchDate
.toEpochMilli() + (resetFetchDateAfterNSecs * 1000));
if (Instant.now().isAfter(changeNeededOn)) {
LOG.info("lastDate reset based on resetFetchDateAfterNSecs {}",
resetFetchDateAfterNSecs);
lastNextFetchDate = Instant.now();
}
}
// select entries from mysql
// https://mariadb.com/kb/en/library/window-functions-overview/
// http://www.mysqltutorial.org/mysql-window-functions/mysql-rank-function/
String query = "SELECT * from (select rank() over (partition by host order by nextfetchdate desc, url) as ranking, url, metadata, nextfetchdate from "
+ tableName;
query += " WHERE nextfetchdate <= '"
+ new Timestamp(lastNextFetchDate.toEpochMilli()) + "'";
// constraint on bucket num
if (bucketNum >= 0) {
query += " AND bucket = '" + bucketNum + "'";
}
query += ") as urls_ranks where (urls_ranks.ranking <= "
+ maxDocsPerBucket + ") order by ranking";
if (maxNumResults != -1) {
query += " LIMIT " + this.maxNumResults;
}
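// For illustration, with the defaults (tableName "urls", maxDocsPerBucket 5,
// maxNumResults 100) and bucketNum 2, the concatenation above produces:
//
//   SELECT * from (select rank() over (partition by host order by
//     nextfetchdate desc, url) as ranking, url, metadata, nextfetchdate
//     from urls WHERE nextfetchdate <= '2019-01-01 00:00:00.0'
//     AND bucket = '2') as urls_ranks
//     where (urls_ranks.ranking <= 5) order by ranking LIMIT 100
//
// i.e. at most maxDocsPerBucket URLs per host, interleaved across hosts
// by rank.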
int alreadyprocessed = 0;
int numhits = 0;
long timeStartQuery = System.currentTimeMillis();
// create the java statement
Statement st = null;
ResultSet rs = null;
try {
st = this.connection.createStatement();
// dump query to log
LOG.debug("{} SQL query {}", logIdprefix, query);
// execute the query, and get a java resultset
rs = st.executeQuery(query);
long timeTaken = System.currentTimeMillis() - timeStartQuery;
queryTimes.addMeasurement(timeTaken);
// iterate through the java resultset
while (rs.next()) {
String url = rs.getString("url");
numhits++;
// already processed? skip
if (beingProcessed.containsKey(url)) {
alreadyprocessed++;
continue;
}
String metadata = rs.getString("metadata");
if (metadata == null) {
metadata = "";
} else if (!metadata.startsWith("\t")) {
metadata = "\t" + metadata;
}
String URLMD = url + metadata;
List<Object> v = SCHEME.deserialize(ByteBuffer.wrap(URLMD
        .getBytes()));
// queue the fields for emission by nextTuple()
buffer.add(v);
            }
            LOG.info(
                    "{} SQL query returned {} hits in {} msec with {} already being processed",
                    logIdprefix, numhits, timeTaken, alreadyprocessed);
        } catch (SQLException e) {
            LOG.error("Exception while querying table", e);
        } finally {
            // always release the resultset and statement
            try {
                if (rs != null)
                    rs.close();
            } catch (SQLException e) {
                LOG.error("Exception closing resultset", e);
            }
            try {
                if (st != null)
                    st.close();
            } catch (SQLException e) {
                LOG.error("Exception closing statement", e);
            }
        }
    }

    @Override
    public void close() {
        super.close();
        try {
            if (connection != null)
                connection.close();
        } catch (SQLException e) {
            LOG.error("Exception caught while closing SQL connection", e);
        }
    }
}
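The query built in populateBuffer() presupposes a status table with url, metadata, nextfetchdate, bucket and host columns. A minimal sketch of a compatible table created over plain JDBC follows; the column types, the extra status column and the connection settings are assumptions, and the DDL shipped with the module may differ:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateStatusTable {
    public static void main(String[] args) throws Exception {
        // JDBC URL and credentials are placeholders
        try (Connection con = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/crawl", "crawler", "secret");
                Statement st = con.createStatement()) {
            // one row per URL; bucket and host drive the ranking query above,
            // status is used elsewhere in the module (assumed column)
            st.execute("CREATE TABLE urls ("
                    + " url VARCHAR(512) PRIMARY KEY,"
                    + " status VARCHAR(16) DEFAULT 'DISCOVERED',"
                    + " nextfetchdate TIMESTAMP DEFAULT CURRENT_TIMESTAMP,"
                    + " metadata TEXT,"
                    + " bucket SMALLINT DEFAULT 0,"
                    + " host VARCHAR(128))");
        }
    }
}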