com.digitalpebble.stormcrawler.sql.StatusUpdaterBolt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of storm-crawler-sql Show documentation
Show all versions of storm-crawler-sql Show documentation
SQL-based resources for StormCrawler
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.sql;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Date;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.metric.api.MultiReducedMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.AbstractStatusUpdaterBolt;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.URLPartitioner;
@SuppressWarnings("serial")
public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt {

    public static final Logger LOG = LoggerFactory
            .getLogger(StatusUpdaterBolt.class);

    /** Mean execution time of the SQL statements, reported every 10 secs. */
    private MultiReducedMetric averagedMetrics;

    /** Count of SQL statements executed, reported every 10 secs. */
    private MultiCountMetric eventCounter;

    private Connection connection;

    /** Name of the status table, read from the topology configuration. */
    private String tableName;

    private URLPartitioner partitioner;

    /** Total number of buckets to shard into; <= 1 disables sharding. */
    private int maxNumBuckets = -1;

    /**
     * Shards the status entries into {@code maxNumBuckets} buckets based on
     * the host / domain / IP of the URL as decided by the
     * {@link URLPartitioner}.
     *
     * @param maxNumBuckets total number of buckets used for sharding
     */
    public StatusUpdaterBolt(int maxNumBuckets) {
        this.maxNumBuckets = maxNumBuckets;
    }

    /** Does not shard based on the total number of queues **/
    public StatusUpdaterBolt() {
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepare(Map stormConf, TopologyContext context,
            OutputCollector collector) {
        super.prepare(stormConf, context, collector);

        partitioner = new URLPartitioner();
        partitioner.configure(stormConf);

        this.averagedMetrics = context.registerMetric("SQLStatusUpdater",
                new MultiReducedMetric(new MeanReducer()), 10);
        this.eventCounter = context.registerMetric("counter",
                new MultiCountMetric(), 10);

        tableName = ConfUtils.getString(stormConf,
                Constants.MYSQL_TABLE_PARAM_NAME);

        try {
            connection = SQLUtil.getConnection(stormConf);
        } catch (SQLException ex) {
            // a bolt without a working connection is useless: fail fast so
            // the worker is restarted rather than silently dropping updates
            LOG.error(ex.getMessage(), ex);
            throw new RuntimeException(ex);
        }
    }

    /**
     * Writes one status entry to the table. DISCOVERED URLs use
     * {@code INSERT IGNORE} so they never overwrite an existing row, whereas
     * any other status uses {@code REPLACE} to update the row in place
     * (MySQL-specific syntax).
     *
     * @param url the URL whose status is persisted
     * @param status crawl status of the URL
     * @param metadata metadata serialised as tab-separated key=value pairs
     * @param nextFetch date at which the URL becomes eligible for refetching
     * @throws Exception if the SQL statement fails
     */
    @Override
    public void store(String url, Status status, Metadata metadata,
            Date nextFetch) throws Exception {

        // the mysql insert statement
        String query = tableName
                + " (url, status, nextfetchdate, metadata, bucket)"
                + " values (?, ?, ?, ?, ?)";

        // flatten the metadata into "\tkey=value" pairs; a key with several
        // values contributes one pair per value
        StringBuilder mdAsString = new StringBuilder();
        for (String mdKey : metadata.keySet()) {
            for (String v : metadata.getValues(mdKey)) {
                mdAsString.append("\t").append(mdKey).append("=").append(v);
            }
        }

        int partition = 0;
        if (maxNumBuckets > 1) {
            // determine which queue to send to based on the host / domain / IP
            String partitionKey = partitioner.getPartition(url, metadata);
            partition = Math.abs(partitionKey.hashCode() % maxNumBuckets);
        }

        // create in table if does not already exist
        if (status.equals(Status.DISCOVERED)) {
            query = "INSERT IGNORE INTO " + query;
        } else {
            query = "REPLACE INTO " + query;
        }

        // try-with-resources guarantees the statement is closed even when
        // execute() throws (the original leaked it on the error path)
        try (PreparedStatement preparedStmt = connection
                .prepareStatement(query)) {
            preparedStmt.setString(1, url);
            preparedStmt.setString(2, status.toString());
            preparedStmt.setObject(3, nextFetch);
            preparedStmt.setString(4, mdAsString.toString());
            preparedStmt.setInt(5, partition);

            long start = System.currentTimeMillis();
            preparedStmt.execute();
            eventCounter.scope("sql_query_number").incrBy(1);
            averagedMetrics.scope("sql_execute_time").update(
                    System.currentTimeMillis() - start);
        }
    }

    /** Releases the JDBC connection when the bolt is shut down. **/
    @Override
    public void cleanup() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                // best effort on shutdown: log and carry on
                LOG.error("Error closing SQL connection", e);
            }
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy