// storm-crawler-solr: Solr resources for StormCrawler
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.solr.persistence;
import java.time.Instant;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.AbstractQueryingSpout;
import com.digitalpebble.stormcrawler.solr.SolrConnection;
import com.digitalpebble.stormcrawler.util.ConfUtils;
public class SolrSpout extends AbstractQueryingSpout {
private static final Logger LOG = LoggerFactory.getLogger(SolrSpout.class);
private static final String BOLT_TYPE = "status";
private static final String SolrDiversityFieldParam = "solr.status.bucket.field";
private static final String SolrDiversityBucketParam = "solr.status.bucket.maxsize";
private static final String SolrMetadataPrefix = "solr.status.metadata.prefix";
private static final String SolrMaxResultsParam = "solr.status.max.results";
private SolrConnection connection;
private int maxNumResults = 10;
private int lastStartOffset = 0;
private Instant lastNextFetchDate = null;
private String diversityField = null;
private int diversityBucketSize = 0;
private String mdPrefix;
@Override
public void open(Map stormConf, TopologyContext context,
SpoutOutputCollector collector) {
super.open(stormConf, context, collector);
// This implementation works only where there is a single instance
// of the spout. Having more than one instance means that they would run
// the same queries and send the same tuples down the topology.
int totalTasks = context
.getComponentTasks(context.getThisComponentId()).size();
if (totalTasks > 1) {
throw new RuntimeException(
"Can't have more than one instance of SOLRSpout");
}
diversityField = ConfUtils
.getString(stormConf, SolrDiversityFieldParam);
diversityBucketSize = ConfUtils.getInt(stormConf,
SolrDiversityBucketParam, 5);
// the results have the first hit separate from the expansions
diversityBucketSize--;
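        // e.g. a configured maxsize of 5 becomes 1 collapsed hit + 4 expand.rows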
mdPrefix = ConfUtils.getString(stormConf, SolrMetadataPrefix,
"metadata");
maxNumResults = ConfUtils.getInt(stormConf, SolrMaxResultsParam, 10);
try {
connection = SolrConnection.getConnection(stormConf, BOLT_TYPE);
} catch (Exception e) {
LOG.error("Can't connect to Solr: {}", e);
throw new RuntimeException(e);
}
}
@Override
public void close() {
if (connection != null) {
try {
connection.close();
} catch (Exception e) {
LOG.error("Can't close connection to Solr: {}", e);
}
}
}
    @Override
    protected void populateBuffer() {
SolrQuery query = new SolrQuery();
if (lastNextFetchDate == null) {
lastNextFetchDate = Instant.now();
lastStartOffset = 0;
lastTimeResetToNOW = Instant.now();
}
        // reset the value for the next fetch date if the previous one is too old
        else if (resetFetchDateAfterNSecs != -1) {
            Instant changeNeededOn = lastTimeResetToNOW
                    .plusSeconds(resetFetchDateAfterNSecs);
if (Instant.now().isAfter(changeNeededOn)) {
LOG.info("lastDate reset based on resetFetchDateAfterNSecs {}",
resetFetchDateAfterNSecs);
lastNextFetchDate = Instant.now();
lastStartOffset = 0;
}
}
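        // keeping lastNextFetchDate fixed between calls makes the paging via
        // lastStartOffset consistent; only the reset above moves the window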
query.setQuery("*:*")
.addFilterQuery(
"nextFetchDate:[* TO " + lastNextFetchDate + "]")
.setStart(lastStartOffset).setRows(this.maxNumResults);
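        // lastNextFetchDate is rendered by Instant.toString() as ISO-8601 UTC,
        // e.g. nextFetchDate:[* TO 2024-05-01T12:00:00Z], a format Solr's
        // range query syntax accepts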
if (StringUtils.isNotBlank(diversityField) && diversityBucketSize > 0) {
query.addFilterQuery(String.format(
"{!collapse field=%s sort='nextFetchDate asc'}",
diversityField));
query.set("expand", "true").set("expand.rows", diversityBucketSize);
query.set("expand.sort", "nextFetchDate asc");
}
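        // e.g. with solr.status.bucket.field=host and maxsize=5 the query carries
        // fq={!collapse field=host sort='nextFetchDate asc'}&expand=true&expand.rows=4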
LOG.debug("QUERY => {}", query.toString());
try {
long startQuery = System.currentTimeMillis();
QueryResponse response = connection.getClient().query(query);
long endQuery = System.currentTimeMillis();
queryTimes.addMeasurement(endQuery - startQuery);
SolrDocumentList docs = new SolrDocumentList();
LOG.debug("Response : {}", response.toString());
// add the main results
docs.addAll(response.getResults());
// Add the documents collapsed by the CollapsingQParser
            Map<String, SolrDocumentList> expandedResults = response
                    .getExpandedResults();
            if (StringUtils.isNotBlank(diversityField)
                    && expandedResults != null) {
                for (SolrDocumentList expanded : expandedResults.values()) {
                    docs.addAll(expanded);
                }
            }
int numhits = response.getResults().size();
// no more results?
if (numhits == 0) {
lastStartOffset = 0;
lastNextFetchDate = null;
} else {
lastStartOffset += numhits;
}
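            // an empty page resets the cursor, so the next populateBuffer()
            // call starts over at offset 0 with a fresh nextFetchDate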
String prefix = mdPrefix.concat(".");
int alreadyProcessed = 0;
int docReturned = 0;
for (SolrDocument doc : docs) {
String url = (String) doc.get("url");
docReturned++;
// is already being processed - skip it!
if (beingProcessed.containsKey(url)) {
alreadyProcessed++;
continue;
}
Metadata metadata = new Metadata();
                Iterator<String> keyIterators = doc.getFieldNames().iterator();
                while (keyIterators.hasNext()) {
                    String key = keyIterators.next();
                    if (key.startsWith(prefix)) {
                        // the listing is truncated here; the remainder is a
                        // reconstruction: strip the configured prefix and copy
                        // each stored value into the tuple's metadata
                        Collection<Object> values = doc.getFieldValues(key);
                        String mdKey = StringUtils.replace(key, prefix, "", 1);
                        for (Object value : values) {
                            metadata.addValue(mdKey, (String) value);
                        }
                    }
                }
                // queue the tuple; buffer is inherited from AbstractQueryingSpout
                buffer.add(new Values(url, metadata));
            }
            LOG.info("SolrSpout found {} documents, {} already being processed",
                    docReturned, alreadyProcessed);
        } catch (Exception e) {
            LOG.error("Exception while querying Solr", e);
        }
    }
}
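Since open() rejects more than one task per component, the spout has to be registered with a parallelism hint of 1. Below is a minimal wiring sketch, assuming a Storm 1.x LocalCluster; the component names, the solr.status.url endpoint value and the tuning values are illustrative, not taken from the listing above.

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;

import com.digitalpebble.stormcrawler.solr.persistence.SolrSpout;

public class SolrSpoutTopology {
    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();

        // exactly one task: open() throws for any higher parallelism
        builder.setSpout("solr_spout", new SolrSpout(), 1);

        Config conf = new Config();
        // Solr endpoint for the "status" collection (key assumed from the
        // module's SolrConnection conventions)
        conf.put("solr.status.url", "http://localhost:8983/solr/status");
        // the keys read in open(); values below are illustrative
        conf.put("solr.status.bucket.field", "host");
        conf.put("solr.status.bucket.maxsize", 5);
        conf.put("solr.status.metadata.prefix", "metadata");
        conf.put("solr.status.max.results", 10);

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("solrCrawl", conf, builder.createTopology());
    }
}

In a full crawl topology the spout's (url, metadata) tuples would feed a fetcher and a status updater bolt downstream; those components are out of scope for this listing.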