org.elassandra.discovery.PendingClusterStatesQueue Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
There is a newer version: 6.2.3.31
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elassandra.discovery;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.discovery.zen.PendingClusterStateStats;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;

/**
 * A queue that holds all "in-flight" incoming cluster states from the master. Once a master commits a cluster
 * state, it is made available via {@link #getNextClusterStateToProcess()}. The class also takes care of batching
 * cluster states for processing and failures.
 * 
 * The queue is bound by {@link #maxQueueSize}. When the queue is at capacity and a new cluster state is inserted
 * the oldest cluster state will be dropped. This is safe because:
 * 1) Under normal operations, master will publish & commit a cluster state before processing
 *    another change (i.e., the queue length is 1)
 * 2) If the master fails to commit a change, it will step down, causing a master election, which will flush the queue.
 * 3) In general it's safe to process the incoming cluster state as a replacement to the cluster state that's dropped.
 * a) If the dropped cluster is from the same master as the incoming one is, it is likely to be superseded by the
 *    incoming state (or another state in the queue).
 * This is only not true in very extreme cases of out of order delivery.
 * b) If the dropping cluster state is not from the same master, it means that:
 * i) we are no longer following the master of the dropped cluster state but follow the incoming one
 * ii) we are no longer following any master, in which case it doesn't matter which cluster state will be processed first.
 * 

 * The class is fully thread safe and can be used concurrently.
 */

/** Strapdata NOTE:
 *  When PAXOS commit succeed, elassandra announce the CQL schema including the elasticsearch metadata.
 *
 *  The SchemaListener put the received state in the PendingClusterStatesQueue and try to apply it.
 *  When applied, the attached stateProcessedListener send an AppliedClusterState message to the coordinator node
 *  to acknowledge.
 *
 *  On the coordinator node, a BlockingClusterStatePublishResponseHandler wait for the local and remote notification.
 */
public class PendingClusterStatesQueue {

    public interface StateProcessedListener {

        void onNewClusterStateProcessed();

        void onNewClusterStateFailed(Exception e);
    }


    final ConcurrentSkipListMap pendingStates = new ConcurrentSkipListMap<>(Comparator.comparingLong(ClusterState::version));
    final Logger logger;
    final int maxQueueSize;

    public PendingClusterStatesQueue(Logger logger, int maxQueueSize) {
        this.logger = logger;
        this.maxQueueSize = maxQueueSize;
    }

    /** Add an incoming, not yet committed cluster state */
    public void addPending(ClusterState state, StateProcessedListener listener) {
        logger.trace("adding cluster state version={} metadata={}", state.version(), state.metaData().x2());
        StateProcessedListener previousListener = pendingStates.put(state, listener);
        assert previousListener == null : "ClusterState "+state.version()+" already pending";

        if (pendingStates.size() > maxQueueSize) {
            Map.Entry entry = pendingStates.firstEntry();
            logger.warn("dropping pending state.version=[{}] more than [{}] pending states.", entry.getKey().version(), maxQueueSize);
            pendingStates.remove(entry.getKey()).onNewClusterStateFailed(new ElasticsearchException("too many pending states ([{}] pending)", maxQueueSize));
        }
    }

    public int size() {
        return pendingStates.size();
    }

    /**
     * mark that the processing of the given state has failed. All committed states that are
     * {@link ClusterState#supersedes(ClusterState)}-ed by this failed state, will be failed as well
     */
    public synchronized void markAsFailed(ClusterState state, Exception reason) {
        ConcurrentNavigableMap processedStates = this.pendingStates.headMap(state, true);

        if (processedStates.isEmpty()) {
            throw new IllegalStateException("can't resolve processed cluster state with uuid [" + state.stateUUID()
                + "], version [" + state.version() + "]");
        }

        logger.debug("failing state.version={} metatdata.version={} batch size={}",
                state.version(), state.metaData().x2(), processedStates.size());

        processedStates.entrySet().forEach( entry -> {
            logger.trace("failing state.version={} metatdata.version={} with state.version={}",
                    entry.getKey().version(), entry.getKey().metaData().x2(), state.version());
            entry.getValue().onNewClusterStateFailed(reason);
            this.pendingStates.remove(entry.getKey());
        });
    }

    /**
     * indicates that a cluster state was successfully processed. Any committed state that is
     * {@link ClusterState#supersedes(ClusterState)}-ed by the processed state will be marked as processed as well.
     * 

     * NOTE: successfully processing a state indicates we are following the master it came from. Any committed state
     * from another master will be failed by this method
     */
    public synchronized void markAsProcessed(ClusterState state) {
        if (pendingStates.isEmpty()) {
            throw new IllegalStateException("can't resolve processed cluster state with uuid [" + state.stateUUID()
                + "], version [" + state.version() + "]");
        }

        ConcurrentNavigableMap processedStates = pendingStates.headMap(state, true);
        logger.debug("processed state.version={} metatdata.version={} batch size={}", state.version(), state.metaData().x2(), processedStates.size());
        processedStates.entrySet().forEach( entry -> {
            logger.trace("processed state.version={} metatdata.version={} with state.version={}",
                    entry.getKey().version(), entry.getKey().metaData().x2(), state.version());
            entry.getValue().onNewClusterStateProcessed();
            this.pendingStates.remove(entry.getKey());
        });
    }

    /** clear the incoming queue. any committed state will be failed
     */
    public synchronized void failAllStatesAndClear(Exception reason) {
        this.pendingStates.values().forEach(listener -> listener.onNewClusterStateFailed(reason));
        pendingStates.clear();
    }

    /**
     * Gets the next committed state to process.
     * 
     * The method tries to batch operation by getting the cluster state the highest possible committed states
     * which succeeds the first committed state in queue (i.e., it comes from the same master).
     */
    public synchronized ClusterState getNextClusterStateToProcess() {
        if (pendingStates.isEmpty()) {
            return null;
        }

        Map.Entry lastEntry = pendingStates.lastEntry();
        logger.trace("peek most recent clusterState.version={} metadata=[{}]", lastEntry.getKey().version(), lastEntry.getKey().metaData().x2());
        return lastEntry.getKey();
    }

    /** returns all pending states, committed or not */
    public synchronized ClusterState[] pendingClusterStates() {
        return this.pendingStates.keySet().stream().toArray(ClusterState[]::new);
    }

    public synchronized PendingClusterStateStats stats() {
        return new PendingClusterStateStats(pendingStates.size(), 0, pendingStates.size());
    }
}