
/**
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.elasticsearch.persistence;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.stormcrawler.persistence.AbstractQueryingSpout;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.search.SearchHit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class AbstractSpout extends AbstractQueryingSpout {
private static final Logger LOG = LoggerFactory.getLogger(AbstractSpout.class);
protected static final String ESBoltType = "status";
protected static final String ESStatusIndexNameParamName = "es.status.index.name";
/** Field name to use for aggregating */
protected static final String ESStatusBucketFieldParamName = "es.status.bucket.field";
protected static final String ESStatusMaxBucketParamName = "es.status.max.buckets";
protected static final String ESStatusMaxURLsParamName = "es.status.max.urls.per.bucket";
/** Field name to use for sorting the URLs within a bucket, not used if empty or null. */
protected static final String ESStatusBucketSortFieldParamName = "es.status.bucket.sort.field";
/** Field name to use for sorting the buckets, not used if empty or null. */
protected static final String ESStatusGlobalSortFieldParamName = "es.status.global.sort.field";
protected static final String ESStatusFilterParamName = "es.status.filterQuery";
protected static final String ESStatusQueryTimeoutParamName = "es.status.query.timeout";
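// Illustrative only, not part of the original class: a sketch of how the
// es.status.* keys declared above (and read in open() below) could be set in a
// Storm topology configuration. The values and field names are assumptions
// chosen for demonstration, not defaults mandated by the library.
//
// Config conf = new Config();
// conf.put(ESStatusIndexNameParamName, "status");          // index holding the URL status
// conf.put(ESStatusBucketFieldParamName, "key");           // field used to group URLs into buckets
// conf.put(ESStatusMaxBucketParamName, 10);                // number of buckets per query
// conf.put(ESStatusMaxURLsParamName, 2);                   // URLs emitted per bucket
// conf.put(ESStatusBucketSortFieldParamName, "nextFetchDate");
// conf.put(ESStatusGlobalSortFieldParamName, "nextFetchDate");
// conf.put(ESStatusQueryTimeoutParamName, 30);             // -1 disables the timeout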
/** Query to use as a positive filter, set by es.status.filterQuery */
protected List<String> filterQueries = null;
protected String indexName;
protected static RestHighLevelClient client;
/**
* When using multiple instances, each one is in charge of a specific shard. This is useful when
* sharding based on host or domain to guarantee a good mix of URLs.
*/
protected int shardID = -1;
/** Used to distinguish between instances in the logs */
protected String logIdprefix = "";
/** Field name used for field collapsing, e.g. "key" */
protected String partitionField;
protected int maxURLsPerBucket = 10;
protected int maxBucketNum = 10;
protected List<String> bucketSortField = new LinkedList<>();
protected String totalSortField = "";
protected Date queryDate;
protected int queryTimeout = -1;
@Override
public void open(
Map<String, Object> stormConf,
TopologyContext context,
SpoutOutputCollector collector) {
super.open(stormConf, context, collector);
indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName, "status");
// one ES client per JVM
synchronized (AbstractSpout.class) {
try {
if (client == null) {
client = ElasticSearchConnection.getClient(stormConf, ESBoltType);
}
} catch (Exception e1) {
LOG.error("Can't connect to ElasticSearch", e1);
throw new RuntimeException(e1);
}
}
// if more than one instance is used we expect their number to be the
// same as the number of shards
int totalTasks = context.getComponentTasks(context.getThisComponentId()).size();
if (totalTasks > 1) {
logIdprefix =
"[" + context.getThisComponentId() + " #" + context.getThisTaskIndex() + "] ";
// determine the number of shards so that we can restrict the
// search
// TODO use the admin API when it gets available
// TODO or the low level one with
// https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-shards-stores.html
// TODO identify local shards and use those if possible
// ClusterSearchShardsRequest request = new
// ClusterSearchShardsRequest(
// indexName);
// ClusterSearchShardsResponse shardresponse = client.admin()
// .cluster().searchShards(request).actionGet();
// ClusterSearchShardsGroup[] shardgroups =
// shardresponse.getGroups();
// if (totalTasks != shardgroups.length) {
// throw new RuntimeException(
// "Number of ES spout instances should be the same as number of
// shards ("
// + shardgroups.length + ") but is " + totalTasks);
// }
// shardID = shardgroups[context.getThisTaskIndex()].getShardId()
// .getId();
// TEMPORARY simply use the task index as shard index
shardID = context.getThisTaskIndex();
LOG.info("{} assigned shard ID {}", logIdprefix, shardID);
}
partitionField = ConfUtils.getString(stormConf, ESStatusBucketFieldParamName, "key");
bucketSortField = ConfUtils.loadListFromConf(ESStatusBucketSortFieldParamName, stormConf);
totalSortField = ConfUtils.getString(stormConf, ESStatusGlobalSortFieldParamName);
maxURLsPerBucket = ConfUtils.getInt(stormConf, ESStatusMaxURLsParamName, 1);
maxBucketNum = ConfUtils.getInt(stormConf, ESStatusMaxBucketParamName, 10);
queryTimeout = ConfUtils.getInt(stormConf, ESStatusQueryTimeoutParamName, -1);
filterQueries = ConfUtils.loadListFromConf(ESStatusFilterParamName, stormConf);
}
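// Illustrative only, not part of the original class: when running several spout
// instances, open() above expects the spout parallelism to match the number of
// shards of the status index. A hypothetical wiring with the Storm API, using the
// AggregationSpout implementation from this package and an assumed
// numberOfStatusShards value:
//
// TopologyBuilder builder = new TopologyBuilder();
// builder.setSpout("spout", new AggregationSpout(), numberOfStatusShards);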
/** Builds a query and uses it to retrieve the results from ES. */
protected abstract void populateBuffer();
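// A minimal sketch, not part of the original class, of what a populateBuffer()
// implementation typically looks like, assuming the Elasticsearch 7 high-level
// REST client and a "nextFetchDate" field in the status index; error handling and
// the ActionListener that feeds addHitToBuffer() with each SearchHit are omitted:
//
// SearchRequest request = new SearchRequest(indexName);
// SearchSourceBuilder source = new SearchSourceBuilder();
// source.query(QueryBuilders.rangeQuery("nextFetchDate").lte(queryDate));
// source.size(maxBucketNum);
// if (queryTimeout != -1) {
//     source.timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS));
// }
// request.source(source);
// if (shardID != -1) {
//     request.preference("_shards:" + shardID);
// }
// client.searchAsync(request, RequestOptions.DEFAULT, listener);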
protected final boolean addHitToBuffer(SearchHit hit) {
Map<String, Object> keyValues = hit.getSourceAsMap();
String url = (String) keyValues.get("url");
// is already being processed - skip it!
if (beingProcessed.containsKey(url)) {
return false;
}
Metadata metadata = fromKeyValues(keyValues);
addHitInfoToMetadata(metadata, hit);
return buffer.add(url, metadata);
}
/** Allows implementations to add more information from the SearchHit to the metadata; does nothing by default. */
protected void addHitInfoToMetadata(Metadata metadata, SearchHit hit) {}
protected final Metadata fromKeyValues(Map<String, Object> keyValues) {
Map<String, List<String>> mdAsMap = (Map<String, List<String>>) keyValues.get("metadata");
Metadata metadata = new Metadata();
if (mdAsMap != null) {
Iterator<Entry<String, List<String>>> mdIter = mdAsMap.entrySet().iterator();
while (mdIter.hasNext()) {
Entry<String, List<String>> mdEntry = mdIter.next();
String key = mdEntry.getKey();
// periods are not allowed in field names in ES2 and were stored as %2E - restore them
key = key.replaceAll("%2E", "\\.");
Object mdValObj = mdEntry.getValue();
// single value
if (mdValObj instanceof String) {
metadata.addValue(key, (String) mdValObj);
}
// multi valued
else {
metadata.addValues(key, (List<String>) mdValObj);
}
}
}
return metadata;
}
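// For illustration, not part of the original class: fromKeyValues() above expects
// a document _source of roughly the following shape, where metadata keys were
// stored with "." encoded as %2E and values are either a single String or a list
// of Strings (the field names and values are examples only):
//
// {
//   "url": "http://example.com/",
//   "metadata": {
//     "depth": "1",
//     "source%2Ehost": ["example.com"]
//   }
// }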
@Override
public void ack(Object msgId) {
LOG.debug("{} Ack for {}", logIdprefix, msgId);
super.ack(msgId);
}
@Override
public void fail(Object msgId) {
LOG.info("{} Fail for {}", logIdprefix, msgId);
super.fail(msgId);
}
@Override
public void close() {
if (client != null) {
try {
client.close();
} catch (IOException e) {
LOG.error("Exception caught while closing the ES client", e);
}
}
}
}