All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.digitalpebble.stormcrawler.elasticsearch.persistence.AbstractSpout Maven / Gradle / Ivy

There is a newer version: 2.11
Show newest version
/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 * 

http://www.apache.org/licenses/LICENSE-2.0 * *

Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.elasticsearch.persistence; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.elasticsearch.ElasticSearchConnection; import com.digitalpebble.stormcrawler.persistence.AbstractQueryingSpout; import com.digitalpebble.stormcrawler.util.ConfUtils; import java.io.IOException; import java.util.Date; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.storm.spout.SpoutOutputCollector; import org.apache.storm.task.TopologyContext; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.search.SearchHit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public abstract class AbstractSpout extends AbstractQueryingSpout { private static final Logger LOG = LoggerFactory.getLogger(AbstractSpout.class); protected static final String ESBoltType = "status"; protected static final String ESStatusIndexNameParamName = "es.status.index.name"; /** Field name to use for aggregating * */ protected static final String ESStatusBucketFieldParamName = "es.status.bucket.field"; protected static final String ESStatusMaxBucketParamName = "es.status.max.buckets"; protected static final String ESStatusMaxURLsParamName = "es.status.max.urls.per.bucket"; /** Field name to use for sorting the URLs within a bucket, not used if empty or null. */ protected static final String ESStatusBucketSortFieldParamName = "es.status.bucket.sort.field"; /** Field name to use for sorting the buckets, not used if empty or null. */ protected static final String ESStatusGlobalSortFieldParamName = "es.status.global.sort.field"; protected static final String ESStatusFilterParamName = "es.status.filterQuery"; protected static final String ESStatusQueryTimeoutParamName = "es.status.query.timeout"; /** Query to use as a positive filter, set by es.status.filterQuery */ protected List filterQueries = null; protected String indexName; protected static RestHighLevelClient client; /** * when using multiple instances - each one is in charge of a specific shard useful when * sharding based on host or domain to guarantee a good mix of URLs */ protected int shardID = -1; /** Used to distinguish between instances in the logs * */ protected String logIdprefix = ""; /** Field name used for field collapsing e.g. key * */ protected String partitionField; protected int maxURLsPerBucket = 10; protected int maxBucketNum = 10; protected List bucketSortField = new LinkedList<>(); protected String totalSortField = ""; protected Date queryDate; protected int queryTimeout = -1; @Override public void open( Map stormConf, TopologyContext context, SpoutOutputCollector collector) { super.open(stormConf, context, collector); indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName, "status"); // one ES client per JVM synchronized (AbstractSpout.class) { try { if (client == null) { client = ElasticSearchConnection.getClient(stormConf, ESBoltType); } } catch (Exception e1) { LOG.error("Can't connect to ElasticSearch", e1); throw new RuntimeException(e1); } } // if more than one instance is used we expect their number to be the // same as the number of shards int totalTasks = context.getComponentTasks(context.getThisComponentId()).size(); if (totalTasks > 1) { logIdprefix = "[" + context.getThisComponentId() + " #" + context.getThisTaskIndex() + "] "; // determine the number of shards so that we can restrict the // search // TODO use the admin API when it gets available // TODO or the low level one with // https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-shards-stores.html // TODO identify local shards and use those if possible // ClusterSearchShardsRequest request = new // ClusterSearchShardsRequest( // indexName); // ClusterSearchShardsResponse shardresponse = client.admin() // .cluster().searchShards(request).actionGet(); // ClusterSearchShardsGroup[] shardgroups = // shardresponse.getGroups(); // if (totalTasks != shardgroups.length) { // throw new RuntimeException( // "Number of ES spout instances should be the same as number of // shards (" // + shardgroups.length + ") but is " + totalTasks); // } // shardID = shardgroups[context.getThisTaskIndex()].getShardId() // .getId(); // TEMPORARY simply use the task index as shard index shardID = context.getThisTaskIndex(); LOG.info("{} assigned shard ID {}", logIdprefix, shardID); } partitionField = ConfUtils.getString(stormConf, ESStatusBucketFieldParamName, "key"); bucketSortField = ConfUtils.loadListFromConf(ESStatusBucketSortFieldParamName, stormConf); totalSortField = ConfUtils.getString(stormConf, ESStatusGlobalSortFieldParamName); maxURLsPerBucket = ConfUtils.getInt(stormConf, ESStatusMaxURLsParamName, 1); maxBucketNum = ConfUtils.getInt(stormConf, ESStatusMaxBucketParamName, 10); queryTimeout = ConfUtils.getInt(stormConf, ESStatusQueryTimeoutParamName, -1); filterQueries = ConfUtils.loadListFromConf(ESStatusFilterParamName, stormConf); } /** Builds a query and use it retrieve the results from ES * */ protected abstract void populateBuffer(); protected final boolean addHitToBuffer(SearchHit hit) { Map keyValues = hit.getSourceAsMap(); String url = (String) keyValues.get("url"); // is already being processed - skip it! if (beingProcessed.containsKey(url)) { return false; } Metadata metadata = fromKeyValues(keyValues); addHitInfoToMetadata(metadata, hit); return buffer.add(url, metadata); } protected void addHitInfoToMetadata(Metadata metadata, SearchHit hit) {} protected final Metadata fromKeyValues(Map keyValues) { Map> mdAsMap = (Map>) keyValues.get("metadata"); Metadata metadata = new Metadata(); if (mdAsMap != null) { Iterator>> mdIter = mdAsMap.entrySet().iterator(); while (mdIter.hasNext()) { Entry> mdEntry = mdIter.next(); String key = mdEntry.getKey(); // periods are not allowed in ES2 - replace with %2E key = key.replaceAll("%2E", "\\."); Object mdValObj = mdEntry.getValue(); // single value if (mdValObj instanceof String) { metadata.addValue(key, (String) mdValObj); } // multi valued else { metadata.addValues(key, (List) mdValObj); } } } return metadata; } @Override public void ack(Object msgId) { LOG.debug("{} Ack for {}", logIdprefix, msgId); super.ack(msgId); } @Override public void fail(Object msgId) { LOG.info("{} Fail for {}", logIdprefix, msgId); super.fail(msgId); } @Override public void close() { if (client != null) try { client.close(); } catch (IOException e) { } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy